blob: cd58de6ea76d1ba663769f8a190a3f1910fea98d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of entries of the Unicode object free list. */
#define PyUnicode_MAXFREELIST       1024

/* Size limit for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory, which reduces malloc() overhead for small
   Unicode objects.  In the worst case this leaves

       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc() overhead)

   bytes of unused garbage around.  A limit of 0 effectively turns the
   feature off.

   Note: this is an experimental feature.  If you get core dumps when using
   Unicode objects, turn it off. */
#define KEEPALIVE_SIZE_LIMIT        9

/* Endianness switch: little endian unless WORDS_BIGENDIAN is defined. */
#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted.  to is a pointer of type
   "to_type *" and points to the buffer the converted characters are
   written to; it must have room for (end - begin) elements.

   Each of begin, end and to is evaluated exactly once, so arguments that
   are expensive or have side effects are safe to pass. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        const from_type *end_ = (end);                               \
        to_type *to_ = (to_type *)(to);                              \
        for (; iter_ < end_; ++iter_, ++to_) {                       \
            *to_ = (to_type)*iter_;                                  \
        }                                                            \
    } while (0)
106
/* --- Struct field accessors --------------------------------------------

   The underscore-prefixed macros are plain casts that expand to the struct
   member itself (usable as an lvalue).  The variants that call assert()
   additionally check the object type and/or readiness first. */

/* utf8 member of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings the memory right
   after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII strings the length
   member of the PyASCIIObject header, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr buffer and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, with a type check but without a readiness check. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do { _PyUnicode_HASH(op) = -1; } while (0)
137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
144 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000145 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
150static PyUnicodeObject *unicode_empty;
151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
154static PyUnicodeObject *unicode_latin1[256];
155
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Fast detection of ASCII line break characters.

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the bit index is taken
   from the low bits of each Unicode character (ch & (BLOOM_WIDTH - 1),
   i.e. the low 5, 6 or 7 bits depending on BLOOM_WIDTH). */
247
248/* the linebreak mask is set up by Unicode_Init below */
249
Antoine Pitrouf068f942010-01-13 14:19:12 +0000250#if LONG_BIT >= 128
251#define BLOOM_WIDTH 128
252#elif LONG_BIT >= 64
253#define BLOOM_WIDTH 64
254#elif LONG_BIT >= 32
255#define BLOOM_WIDTH 32
256#else
257#error "LONG_BIT is smaller than 32"
258#endif
259
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in mask.

   The bit index is ch & (BLOOM_WIDTH - 1).  Both arguments are fully
   parenthesized, so compound expressions such as BLOOM(a | b, ch) test
   the combined mask rather than binding '&' to only part of it. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: ASCII characters use the ascii_linebreak table; other
   characters are pre-filtered through bloom_linebreak and then confirmed
   with Py_UNICODE_ISLINEBREAK().  ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* Non-zero if character chr occurs in the unicode object str.  The bloom
   mask is consulted first so that PyUnicode_FindChar() only runs when the
   mask does not already rule the character out.  Arguments are
   parenthesized; chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM((mask), (chr))                                               \
     && (PyUnicode_FindChar((str), (chr), 0,                            \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
401 return unicode_empty;
402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
444 unicode->data.any = NULL;
445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
#ifdef Py_DEBUG
/* Number of objects created through PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact/ascii flags and the candidate data
   pointers to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    void *after_ascii = ((void*)((PyASCIIObject*)(unicode) + 1));
    void *after_compact = ((void*)((PyCompactUnicodeObject*)(unicode) + 1));

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
479
480PyObject *
481PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
482{
483 PyObject *obj;
484 PyCompactUnicodeObject *unicode;
485 void *data;
486 int kind_state;
487 int is_sharing = 0, is_ascii = 0;
488 Py_ssize_t char_size;
489 Py_ssize_t struct_size;
490
491 /* Optimization for empty strings */
492 if (size == 0 && unicode_empty != NULL) {
493 Py_INCREF(unicode_empty);
494 return (PyObject *)unicode_empty;
495 }
496
497#ifdef Py_DEBUG
498 ++unicode_new_new_calls;
499#endif
500
501 struct_size = sizeof(PyCompactUnicodeObject);
502 if (maxchar < 128) {
503 kind_state = PyUnicode_1BYTE_KIND;
504 char_size = 1;
505 is_ascii = 1;
506 struct_size = sizeof(PyASCIIObject);
507 }
508 else if (maxchar < 256) {
509 kind_state = PyUnicode_1BYTE_KIND;
510 char_size = 1;
511 }
512 else if (maxchar < 65536) {
513 kind_state = PyUnicode_2BYTE_KIND;
514 char_size = 2;
515 if (sizeof(wchar_t) == 2)
516 is_sharing = 1;
517 }
518 else {
519 kind_state = PyUnicode_4BYTE_KIND;
520 char_size = 4;
521 if (sizeof(wchar_t) == 4)
522 is_sharing = 1;
523 }
524
525 /* Ensure we won't overflow the size. */
526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
528 "Negative size passed to PyUnicode_New");
529 return NULL;
530 }
531 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
532 return PyErr_NoMemory();
533
534 /* Duplicated allocation code from _PyObject_New() instead of a call to
535 * PyObject_New() so we are able to allocate space for the object and
536 * it's data buffer.
537 */
538 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
539 if (obj == NULL)
540 return PyErr_NoMemory();
541 obj = PyObject_INIT(obj, &PyUnicode_Type);
542 if (obj == NULL)
543 return NULL;
544
545 unicode = (PyCompactUnicodeObject *)obj;
546 if (is_ascii)
547 data = ((PyASCIIObject*)obj) + 1;
548 else
549 data = unicode + 1;
550 _PyUnicode_LENGTH(unicode) = size;
551 _PyUnicode_HASH(unicode) = -1;
552 _PyUnicode_STATE(unicode).interned = 0;
553 _PyUnicode_STATE(unicode).kind = kind_state;
554 _PyUnicode_STATE(unicode).compact = 1;
555 _PyUnicode_STATE(unicode).ready = 1;
556 _PyUnicode_STATE(unicode).ascii = is_ascii;
557 if (is_ascii) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 }
561 else if (kind_state == PyUnicode_1BYTE_KIND) {
562 ((char*)data)[size] = 0;
563 _PyUnicode_WSTR(unicode) = NULL;
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 unicode->utf8_length = 0;
566 unicode->utf8 = NULL;
567 }
568 else {
569 unicode->utf8 = NULL;
570 if (kind_state == PyUnicode_2BYTE_KIND)
571 ((Py_UCS2*)data)[size] = 0;
572 else /* kind_state == PyUnicode_4BYTE_KIND */
573 ((Py_UCS4*)data)[size] = 0;
574 if (is_sharing) {
575 _PyUnicode_WSTR_LENGTH(unicode) = size;
576 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
577 }
578 else {
579 _PyUnicode_WSTR_LENGTH(unicode) = 0;
580 _PyUnicode_WSTR(unicode) = NULL;
581 }
582 }
583 return obj;
584}
585
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; every other
   code unit is copied as is.  (The other conversions are implemented as
   macros for efficiency.)

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (*src >= 0xD800 && *src <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: two code units become one code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
624
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200625Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
627 PyObject *from, Py_ssize_t from_start,
628 Py_ssize_t how_many)
629{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200630 unsigned int from_kind, to_kind;
631 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632
Victor Stinnerb1536152011-09-30 02:26:10 +0200633 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
634 PyErr_BadInternalCall();
635 return -1;
636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637
638 if (PyUnicode_READY(from))
639 return -1;
640 if (PyUnicode_READY(to))
641 return -1;
642
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200643 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200644 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
645 PyErr_Format(PyExc_ValueError,
646 "Cannot write %zi characters at %zi "
647 "in a string of %zi characters",
648 how_many, to_start, PyUnicode_GET_LENGTH(to));
649 return -1;
650 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200651 if (how_many == 0)
652 return 0;
653
654 if (Py_REFCNT(to) != 1) {
655 PyErr_SetString(PyExc_ValueError,
656 "Cannot modify a string having more than 1 reference");
657 return -1;
658 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200659 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200662 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665
666 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200668 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 + PyUnicode_KIND_SIZE(from_kind, from_start),
672 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200673 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200674 else if (from_kind == PyUnicode_1BYTE_KIND
675 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 {
677 _PyUnicode_CONVERT_BYTES(
678 Py_UCS1, Py_UCS2,
679 PyUnicode_1BYTE_DATA(from) + from_start,
680 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
681 PyUnicode_2BYTE_DATA(to) + to_start
682 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200683 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200684 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 && to_kind == PyUnicode_4BYTE_KIND)
686 {
687 _PyUnicode_CONVERT_BYTES(
688 Py_UCS1, Py_UCS4,
689 PyUnicode_1BYTE_DATA(from) + from_start,
690 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
691 PyUnicode_4BYTE_DATA(to) + to_start
692 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200693 }
694 else if (from_kind == PyUnicode_2BYTE_KIND
695 && to_kind == PyUnicode_4BYTE_KIND)
696 {
697 _PyUnicode_CONVERT_BYTES(
698 Py_UCS2, Py_UCS4,
699 PyUnicode_2BYTE_DATA(from) + from_start,
700 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
701 PyUnicode_4BYTE_DATA(to) + to_start
702 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200703 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200704 else {
705 int invalid_kinds;
706 if (from_kind > to_kind) {
707 /* slow path to check for character overflow */
708 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
709 Py_UCS4 ch, maxchar;
710 Py_ssize_t i;
711
712 maxchar = 0;
713 invalid_kinds = 0;
714 for (i=0; i < how_many; i++) {
715 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
716 if (ch > maxchar) {
717 maxchar = ch;
718 if (maxchar > to_maxchar) {
719 invalid_kinds = 1;
720 break;
721 }
722 }
723 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
724 }
725 }
726 else
727 invalid_kinds = 1;
728 if (invalid_kinds) {
729 PyErr_Format(PyExc_ValueError,
730 "Cannot copy UCS%u characters "
731 "into a string of UCS%u characters",
732 1 << (from_kind - 1),
733 1 << (to_kind -1));
734 return -1;
735 }
736 }
737 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738}
739
Victor Stinner17222162011-09-28 22:15:37 +0200740/* Find the maximum code point and count the number of surrogate pairs so a
741 correct string length can be computed before converting a string to UCS4.
742 This function counts single surrogates as a character and not as a pair.
743
744 Return 0 on success, or -1 on error. */
745static int
746find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
747 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748{
749 const wchar_t *iter;
750
751 if (num_surrogates == NULL || maxchar == NULL) {
752 PyErr_SetString(PyExc_SystemError,
753 "unexpected NULL arguments to "
754 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
755 return -1;
756 }
757
758 *num_surrogates = 0;
759 *maxchar = 0;
760
761 for (iter = begin; iter < end; ) {
762 if (*iter > *maxchar)
763 *maxchar = *iter;
764#if SIZEOF_WCHAR_T == 2
765 if (*iter >= 0xD800 && *iter <= 0xDBFF
766 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
767 {
768 Py_UCS4 surrogate_val;
769 surrogate_val = (((iter[0] & 0x3FF)<<10)
770 | (iter[1] & 0x3FF)) + 0x10000;
771 ++(*num_surrogates);
772 if (surrogate_val > *maxchar)
773 *maxchar = surrogate_val;
774 iter += 2;
775 }
776 else
777 iter++;
778#else
779 iter++;
780#endif
781 }
782 return 0;
783}
784
785#ifdef Py_DEBUG
786int unicode_ready_calls = 0;
787#endif
788
789int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200790_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200791{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200792 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793 wchar_t *end;
794 Py_UCS4 maxchar = 0;
795 Py_ssize_t num_surrogates;
796#if SIZEOF_WCHAR_T == 2
797 Py_ssize_t length_wo_surrogates;
798#endif
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 strings were created using _PyObject_New() and where no canonical
802 representation (the str field) has been set yet aka strings
803 which are not yet ready. */
804 assert(PyUnicode_Check(obj));
805 assert(!PyUnicode_IS_READY(obj));
806 assert(!PyUnicode_IS_COMPACT(obj));
807 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200809 assert(unicode->data.any == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200810 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200811 /* Actually, it should neither be interned nor be anything else: */
812 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813
814#ifdef Py_DEBUG
815 ++unicode_ready_calls;
816#endif
817
818 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200819 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200820 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
823 if (maxchar < 256) {
824 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
825 if (!unicode->data.any) {
826 PyErr_NoMemory();
827 return -1;
828 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200829 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 _PyUnicode_WSTR(unicode), end,
831 PyUnicode_1BYTE_DATA(unicode));
832 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
834 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
835 if (maxchar < 128) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200836 _PyUnicode_UTF8(unicode) = unicode->data.any;
837 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838 }
839 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200840 _PyUnicode_UTF8(unicode) = NULL;
841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 }
843 PyObject_FREE(_PyUnicode_WSTR(unicode));
844 _PyUnicode_WSTR(unicode) = NULL;
845 _PyUnicode_WSTR_LENGTH(unicode) = 0;
846 }
847 /* In this case we might have to convert down from 4-byte native
848 wchar_t to 2-byte unicode. */
849 else if (maxchar < 65536) {
850 assert(num_surrogates == 0 &&
851 "FindMaxCharAndNumSurrogatePairs() messed up");
852
Victor Stinner506f5922011-09-28 22:34:18 +0200853#if SIZEOF_WCHAR_T == 2
854 /* We can share representations and are done. */
855 unicode->data.any = _PyUnicode_WSTR(unicode);
856 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
857 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
858 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200861#else
862 /* sizeof(wchar_t) == 4 */
863 unicode->data.any = PyObject_MALLOC(
864 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
865 if (!unicode->data.any) {
866 PyErr_NoMemory();
867 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 }
Victor Stinner506f5922011-09-28 22:34:18 +0200869 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
870 _PyUnicode_WSTR(unicode), end,
871 PyUnicode_2BYTE_DATA(unicode));
872 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
873 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
874 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200875 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200877 PyObject_FREE(_PyUnicode_WSTR(unicode));
878 _PyUnicode_WSTR(unicode) = NULL;
879 _PyUnicode_WSTR_LENGTH(unicode) = 0;
880#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 }
882 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
883 else {
884#if SIZEOF_WCHAR_T == 2
885 /* in case the native representation is 2-bytes, we need to allocate a
886 new normalized 4-byte version. */
887 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
888 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
889 if (!unicode->data.any) {
890 PyErr_NoMemory();
891 return -1;
892 }
893 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
894 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200895 _PyUnicode_UTF8(unicode) = NULL;
896 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
898 unicode) < 0) {
899 assert(0 && "ConvertWideCharToUCS4 failed");
900 return -1;
901 }
902 PyObject_FREE(_PyUnicode_WSTR(unicode));
903 _PyUnicode_WSTR(unicode) = NULL;
904 _PyUnicode_WSTR_LENGTH(unicode) = 0;
905#else
906 assert(num_surrogates == 0);
907
908 unicode->data.any = _PyUnicode_WSTR(unicode);
909 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
913#endif
914 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
915 }
916 _PyUnicode_STATE(unicode).ready = 1;
917 return 0;
918}
919
Alexander Belopolsky40018472011-02-26 01:02:56 +0000920static void
921unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922{
Walter Dörwald16807132007-05-25 13:52:07 +0000923 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000924 case SSTATE_NOT_INTERNED:
925 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000926
Benjamin Peterson29060642009-01-31 22:14:21 +0000927 case SSTATE_INTERNED_MORTAL:
928 /* revive dead object temporarily for DelItem */
929 Py_REFCNT(unicode) = 3;
930 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
931 Py_FatalError(
932 "deletion of interned string failed");
933 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000934
Benjamin Peterson29060642009-01-31 22:14:21 +0000935 case SSTATE_INTERNED_IMMORTAL:
936 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000937
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 default:
939 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000940 }
941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 if (_PyUnicode_WSTR(unicode) &&
943 (!PyUnicode_IS_READY(unicode) ||
944 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
945 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200946 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
947 && _PyUnicode_UTF8(unicode)
948 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
949 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950
951 if (PyUnicode_IS_COMPACT(unicode)) {
952 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000953 }
954 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 if (unicode->data.any)
956 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000957 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000958 }
959}
960
Alexander Belopolsky40018472011-02-26 01:02:56 +0000961static int
962_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000963{
964 register PyUnicodeObject *v;
965
966 /* Argument checks */
967 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000971 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
973 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000974 PyErr_BadInternalCall();
975 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000976 }
977
978 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 possible since these are being shared.
980 The same goes for new-representation unicode objects or objects which
981 have already been readied.
982 For these, we simply return a fresh copy with the same Unicode content.
983 */
984 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
985 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
986 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000987 PyUnicodeObject *w = _PyUnicode_New(length);
988 if (w == NULL)
989 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
991 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000992 Py_DECREF(*unicode);
993 *unicode = w;
994 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000995 }
996
997 /* Note that we don't have to modify *unicode for unshared Unicode
998 objects, since we can modify them in-place. */
999 return unicode_resize(v, length);
1000}
1001
Alexander Belopolsky40018472011-02-26 01:02:56 +00001002int
1003PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001004{
1005 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1006}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008static PyObject*
1009get_latin1_char(unsigned char ch)
1010{
1011 PyUnicodeObject *unicode = unicode_latin1[ch];
1012 if (!unicode) {
1013 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1014 if (!unicode)
1015 return NULL;
1016 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1017 unicode_latin1[ch] = unicode;
1018 }
1019 Py_INCREF(unicode);
1020 return (PyObject *)unicode;
1021}
1022
Alexander Belopolsky40018472011-02-26 01:02:56 +00001023PyObject *
1024PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025{
1026 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 Py_UCS4 maxchar = 0;
1028 Py_ssize_t num_surrogates;
1029
1030 if (u == NULL)
1031 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001033 /* If the Unicode data is known at construction time, we can apply
1034 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
1039 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 /* Single character Unicode objects in the Latin-1 range are
1043 shared when using this constructor */
1044 if (size == 1 && *u < 256)
1045 return get_latin1_char((unsigned char)*u);
1046
1047 /* If not empty and not single character, copy the Unicode data
1048 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001049 if (find_maxchar_surrogates(u, u + size,
1050 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 return NULL;
1052
1053 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1054 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (!unicode)
1056 return NULL;
1057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 switch (PyUnicode_KIND(unicode)) {
1059 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001060 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1062 break;
1063 case PyUnicode_2BYTE_KIND:
1064#if Py_UNICODE_SIZE == 2
1065 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1066#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001067 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1069#endif
1070 break;
1071 case PyUnicode_4BYTE_KIND:
1072#if SIZEOF_WCHAR_T == 2
1073 /* This is the only case which has to process surrogates, thus
1074 a simple copy loop is not enough and we need a function. */
1075 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1076 Py_DECREF(unicode);
1077 return NULL;
1078 }
1079#else
1080 assert(num_surrogates == 0);
1081 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1082#endif
1083 break;
1084 default:
1085 assert(0 && "Impossible state");
1086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
1088 return (PyObject *)unicode;
1089}
1090
Alexander Belopolsky40018472011-02-26 01:02:56 +00001091PyObject *
1092PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093{
1094 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 if (size < 0) {
1097 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 return NULL;
1100 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001101
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001103 some optimizations which share commonly used objects.
1104 Also, this means the input must be UTF-8, so fall back to the
1105 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001106 if (u != NULL) {
1107
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 /* Optimization for empty strings */
1109 if (size == 0 && unicode_empty != NULL) {
1110 Py_INCREF(unicode_empty);
1111 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001113
1114 /* Single characters are shared when using this constructor.
1115 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 if (size == 1 && Py_CHARMASK(*u) < 128)
1117 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001118
1119 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 }
1121
Walter Dörwald55507312007-05-18 13:12:10 +00001122 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001123 if (!unicode)
1124 return NULL;
1125
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001126 return (PyObject *)unicode;
1127}
1128
Alexander Belopolsky40018472011-02-26 01:02:56 +00001129PyObject *
1130PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001131{
1132 size_t size = strlen(u);
1133 if (size > PY_SSIZE_T_MAX) {
1134 PyErr_SetString(PyExc_OverflowError, "input too long");
1135 return NULL;
1136 }
1137
1138 return PyUnicode_FromStringAndSize(u, size);
1139}
1140
Victor Stinnere57b1c02011-09-28 22:20:48 +02001141static PyObject*
1142_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 PyObject *res;
1145 unsigned char max = 127;
1146 Py_ssize_t i;
1147 for (i = 0; i < size; i++) {
1148 if (u[i] & 0x80) {
1149 max = 255;
1150 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001151 }
1152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 res = PyUnicode_New(size, max);
1154 if (!res)
1155 return NULL;
1156 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1157 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001158}
1159
Victor Stinnere57b1c02011-09-28 22:20:48 +02001160static PyObject*
1161_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162{
1163 PyObject *res;
1164 Py_UCS2 max = 0;
1165 Py_ssize_t i;
1166 for (i = 0; i < size; i++)
1167 if (u[i] > max)
1168 max = u[i];
1169 res = PyUnicode_New(size, max);
1170 if (!res)
1171 return NULL;
1172 if (max >= 256)
1173 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1174 else
1175 for (i = 0; i < size; i++)
1176 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1177 return res;
1178}
1179
Victor Stinnere57b1c02011-09-28 22:20:48 +02001180static PyObject*
1181_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182{
1183 PyObject *res;
1184 Py_UCS4 max = 0;
1185 Py_ssize_t i;
1186 for (i = 0; i < size; i++)
1187 if (u[i] > max)
1188 max = u[i];
1189 res = PyUnicode_New(size, max);
1190 if (!res)
1191 return NULL;
1192 if (max >= 0x10000)
1193 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1194 else {
1195 int kind = PyUnicode_KIND(res);
1196 void *data = PyUnicode_DATA(res);
1197 for (i = 0; i < size; i++)
1198 PyUnicode_WRITE(kind, data, i, u[i]);
1199 }
1200 return res;
1201}
1202
1203PyObject*
1204PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1205{
1206 switch(kind) {
1207 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001208 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001210 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001212 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 }
1214 assert(0);
1215 return NULL;
1216}
1217
Victor Stinner034f6cf2011-09-30 02:26:44 +02001218PyObject*
1219PyUnicode_Copy(PyObject *unicode)
1220{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001221 Py_ssize_t size;
1222 PyObject *copy;
1223 void *data;
1224
Victor Stinner034f6cf2011-09-30 02:26:44 +02001225 if (!PyUnicode_Check(unicode)) {
1226 PyErr_BadInternalCall();
1227 return NULL;
1228 }
1229 if (PyUnicode_READY(unicode))
1230 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001231
1232 size = PyUnicode_GET_LENGTH(unicode);
1233 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1234 if (!copy)
1235 return NULL;
1236 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1237
1238 data = PyUnicode_DATA(unicode);
1239 switch (PyUnicode_KIND(unicode))
1240 {
1241 case PyUnicode_1BYTE_KIND:
1242 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1243 break;
1244 case PyUnicode_2BYTE_KIND:
1245 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1246 break;
1247 case PyUnicode_4BYTE_KIND:
1248 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1249 break;
1250 default:
1251 assert(0);
1252 break;
1253 }
1254 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001255}
1256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257
1258/* Widen Unicode objects to larger buffers.
1259 Return NULL if the string is too wide already. */
1260
1261void*
1262_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1263{
1264 Py_ssize_t i;
1265 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1266 void *d = PyUnicode_DATA(s);
1267 unsigned int skind = PyUnicode_KIND(s);
1268 if (PyUnicode_KIND(s) >= kind) {
1269 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1270 return NULL;
1271 }
1272 switch(kind) {
1273 case PyUnicode_2BYTE_KIND: {
1274 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1275 if (!result) {
1276 PyErr_NoMemory();
1277 return 0;
1278 }
1279 for (i = 0; i < len; i++)
1280 result[i] = ((Py_UCS1*)d)[i];
1281 return result;
1282 }
1283 case PyUnicode_4BYTE_KIND: {
1284 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1285 if (!result) {
1286 PyErr_NoMemory();
1287 return 0;
1288 }
1289 for (i = 0; i < len; i++)
1290 result[i] = PyUnicode_READ(skind, d, i);
1291 return result;
1292 }
1293 }
1294 Py_FatalError("invalid kind");
1295 return NULL;
1296}
1297
1298static Py_UCS4*
1299as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1300 int copy_null)
1301{
1302 int kind;
1303 void *data;
1304 Py_ssize_t len, targetlen;
1305 if (PyUnicode_READY(string) == -1)
1306 return NULL;
1307 kind = PyUnicode_KIND(string);
1308 data = PyUnicode_DATA(string);
1309 len = PyUnicode_GET_LENGTH(string);
1310 targetlen = len;
1311 if (copy_null)
1312 targetlen++;
1313 if (!target) {
1314 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1315 PyErr_NoMemory();
1316 return NULL;
1317 }
1318 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1319 if (!target) {
1320 PyErr_NoMemory();
1321 return NULL;
1322 }
1323 }
1324 else {
1325 if (targetsize < targetlen) {
1326 PyErr_Format(PyExc_SystemError,
1327 "string is longer than the buffer");
1328 if (copy_null && 0 < targetsize)
1329 target[0] = 0;
1330 return NULL;
1331 }
1332 }
1333 if (kind != PyUnicode_4BYTE_KIND) {
1334 Py_ssize_t i;
1335 for (i = 0; i < len; i++)
1336 target[i] = PyUnicode_READ(kind, data, i);
1337 }
1338 else
1339 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1340 if (copy_null)
1341 target[len] = 0;
1342 return target;
1343}
1344
1345Py_UCS4*
1346PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1347 int copy_null)
1348{
1349 if (target == NULL || targetsize < 1) {
1350 PyErr_BadInternalCall();
1351 return NULL;
1352 }
1353 return as_ucs4(string, target, targetsize, copy_null);
1354}
1355
1356Py_UCS4*
1357PyUnicode_AsUCS4Copy(PyObject *string)
1358{
1359 return as_ucs4(string, NULL, 0, 1);
1360}
1361
1362#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001363
Alexander Belopolsky40018472011-02-26 01:02:56 +00001364PyObject *
1365PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001368 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001370 PyErr_BadInternalCall();
1371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 }
1373
Martin v. Löwis790465f2008-04-05 20:41:37 +00001374 if (size == -1) {
1375 size = wcslen(w);
1376 }
1377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379}
1380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001382
Walter Dörwald346737f2007-05-31 10:44:43 +00001383static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001384makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1385 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001386{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001387 *fmt++ = '%';
1388 if (width) {
1389 if (zeropad)
1390 *fmt++ = '0';
1391 fmt += sprintf(fmt, "%d", width);
1392 }
1393 if (precision)
1394 fmt += sprintf(fmt, ".%d", precision);
1395 if (longflag)
1396 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001397 else if (longlongflag) {
1398 /* longlongflag should only ever be nonzero on machines with
1399 HAVE_LONG_LONG defined */
1400#ifdef HAVE_LONG_LONG
1401 char *f = PY_FORMAT_LONG_LONG;
1402 while (*f)
1403 *fmt++ = *f++;
1404#else
1405 /* we shouldn't ever get here */
1406 assert(0);
1407 *fmt++ = 'l';
1408#endif
1409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001410 else if (size_tflag) {
1411 char *f = PY_FORMAT_SIZE_T;
1412 while (*f)
1413 *fmt++ = *f++;
1414 }
1415 *fmt++ = c;
1416 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001417}
1418
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' of a conversion specification.  Parse the optional
   width, the optional ".precision" and the optional length modifier
   ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; each is only accepted
   when followed by 'd', 'u' or 'i').  Each result is stored through its
   output pointer when that pointer is not NULL.

   Return the position after the parsed part, normally the conversion
   character.  For the malformed inputs handled below the pointer is moved
   back by one character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1483
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum width reserved for a
   formatted number that is not long long. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   PyUnicode_FromFormatV() uses this as the minimum width reserved for a
   formatted long long number. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1491
Walter Dörwaldd2034312007-05-18 16:29:38 +00001492PyObject *
1493PyUnicode_FromFormatV(const char *format, va_list vargs)
1494{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001495 va_list count;
1496 Py_ssize_t callcount = 0;
1497 PyObject **callresults = NULL;
1498 PyObject **callresult = NULL;
1499 Py_ssize_t n = 0;
1500 int width = 0;
1501 int precision = 0;
1502 int zeropad;
1503 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001505 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001506 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1508 Py_UCS4 argmaxchar;
1509 Py_ssize_t numbersize = 0;
1510 char *numberresults = NULL;
1511 char *numberresult = NULL;
1512 Py_ssize_t i;
1513 int kind;
1514 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001515
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001516 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001517 /* step 1: count the number of %S/%R/%A/%s format specifications
1518 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1519 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001524 for (f = format; *f; f++) {
1525 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001526 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1528 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1529 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1530 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001533#ifdef HAVE_LONG_LONG
1534 if (longlongflag) {
1535 if (width < MAX_LONG_LONG_CHARS)
1536 width = MAX_LONG_LONG_CHARS;
1537 }
1538 else
1539#endif
1540 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1541 including sign. Decimal takes the most space. This
1542 isn't enough for octal. If a width is specified we
1543 need more (which we allocate later). */
1544 if (width < MAX_LONG_CHARS)
1545 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546
1547 /* account for the size + '\0' to separate numbers
1548 inside of the numberresults buffer */
1549 numbersize += (width + 1);
1550 }
1551 }
1552 else if ((unsigned char)*f > 127) {
1553 PyErr_Format(PyExc_ValueError,
1554 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1555 "string, got a non-ASCII byte: 0x%02x",
1556 (unsigned char)*f);
1557 return NULL;
1558 }
1559 }
1560 /* step 2: allocate memory for the results of
1561 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1562 if (callcount) {
1563 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1564 if (!callresults) {
1565 PyErr_NoMemory();
1566 return NULL;
1567 }
1568 callresult = callresults;
1569 }
1570 /* step 2.5: allocate memory for the results of formatting numbers */
1571 if (numbersize) {
1572 numberresults = PyObject_Malloc(numbersize);
1573 if (!numberresults) {
1574 PyErr_NoMemory();
1575 goto fail;
1576 }
1577 numberresult = numberresults;
1578 }
1579
1580 /* step 3: format numbers and figure out how large a buffer we need */
1581 for (f = format; *f; f++) {
1582 if (*f == '%') {
1583 const char* p;
1584 int longflag;
1585 int longlongflag;
1586 int size_tflag;
1587 int numprinted;
1588
1589 p = f;
1590 zeropad = (f[1] == '0');
1591 f = parse_format_flags(f, &width, &precision,
1592 &longflag, &longlongflag, &size_tflag);
1593 switch (*f) {
1594 case 'c':
1595 {
1596 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001597 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598 n++;
1599 break;
1600 }
1601 case '%':
1602 n++;
1603 break;
1604 case 'i':
1605 case 'd':
1606 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1607 width, precision, *f);
1608 if (longflag)
1609 numprinted = sprintf(numberresult, fmt,
1610 va_arg(count, long));
1611#ifdef HAVE_LONG_LONG
1612 else if (longlongflag)
1613 numprinted = sprintf(numberresult, fmt,
1614 va_arg(count, PY_LONG_LONG));
1615#endif
1616 else if (size_tflag)
1617 numprinted = sprintf(numberresult, fmt,
1618 va_arg(count, Py_ssize_t));
1619 else
1620 numprinted = sprintf(numberresult, fmt,
1621 va_arg(count, int));
1622 n += numprinted;
1623 /* advance by +1 to skip over the '\0' */
1624 numberresult += (numprinted + 1);
1625 assert(*(numberresult - 1) == '\0');
1626 assert(*(numberresult - 2) != '\0');
1627 assert(numprinted >= 0);
1628 assert(numberresult <= numberresults + numbersize);
1629 break;
1630 case 'u':
1631 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1632 width, precision, 'u');
1633 if (longflag)
1634 numprinted = sprintf(numberresult, fmt,
1635 va_arg(count, unsigned long));
1636#ifdef HAVE_LONG_LONG
1637 else if (longlongflag)
1638 numprinted = sprintf(numberresult, fmt,
1639 va_arg(count, unsigned PY_LONG_LONG));
1640#endif
1641 else if (size_tflag)
1642 numprinted = sprintf(numberresult, fmt,
1643 va_arg(count, size_t));
1644 else
1645 numprinted = sprintf(numberresult, fmt,
1646 va_arg(count, unsigned int));
1647 n += numprinted;
1648 numberresult += (numprinted + 1);
1649 assert(*(numberresult - 1) == '\0');
1650 assert(*(numberresult - 2) != '\0');
1651 assert(numprinted >= 0);
1652 assert(numberresult <= numberresults + numbersize);
1653 break;
1654 case 'x':
1655 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1656 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1657 n += numprinted;
1658 numberresult += (numprinted + 1);
1659 assert(*(numberresult - 1) == '\0');
1660 assert(*(numberresult - 2) != '\0');
1661 assert(numprinted >= 0);
1662 assert(numberresult <= numberresults + numbersize);
1663 break;
1664 case 'p':
1665 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1666 /* %p is ill-defined: ensure leading 0x. */
1667 if (numberresult[1] == 'X')
1668 numberresult[1] = 'x';
1669 else if (numberresult[1] != 'x') {
1670 memmove(numberresult + 2, numberresult,
1671 strlen(numberresult) + 1);
1672 numberresult[0] = '0';
1673 numberresult[1] = 'x';
1674 numprinted += 2;
1675 }
1676 n += numprinted;
1677 numberresult += (numprinted + 1);
1678 assert(*(numberresult - 1) == '\0');
1679 assert(*(numberresult - 2) != '\0');
1680 assert(numprinted >= 0);
1681 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 break;
1683 case 's':
1684 {
1685 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001686 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001687 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1688 if (!str)
1689 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 /* since PyUnicode_DecodeUTF8 returns already flexible
1691 unicode objects, there is no need to call ready on them */
1692 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001693 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001695 /* Remember the str and switch to the next slot */
1696 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001697 break;
1698 }
1699 case 'U':
1700 {
1701 PyObject *obj = va_arg(count, PyObject *);
1702 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001703 if (PyUnicode_READY(obj) == -1)
1704 goto fail;
1705 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001706 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001708 break;
1709 }
1710 case 'V':
1711 {
1712 PyObject *obj = va_arg(count, PyObject *);
1713 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001714 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 assert(obj || str);
1716 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001717 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 if (PyUnicode_READY(obj) == -1)
1719 goto fail;
1720 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001721 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001723 *callresult++ = NULL;
1724 }
1725 else {
1726 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1727 if (!str_obj)
1728 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001730 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001732 *callresult++ = str_obj;
1733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 break;
1735 }
1736 case 'S':
1737 {
1738 PyObject *obj = va_arg(count, PyObject *);
1739 PyObject *str;
1740 assert(obj);
1741 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001743 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001745 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001747 /* Remember the str and switch to the next slot */
1748 *callresult++ = str;
1749 break;
1750 }
1751 case 'R':
1752 {
1753 PyObject *obj = va_arg(count, PyObject *);
1754 PyObject *repr;
1755 assert(obj);
1756 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001758 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001760 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001762 /* Remember the repr and switch to the next slot */
1763 *callresult++ = repr;
1764 break;
1765 }
1766 case 'A':
1767 {
1768 PyObject *obj = va_arg(count, PyObject *);
1769 PyObject *ascii;
1770 assert(obj);
1771 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001775 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 /* Remember the repr and switch to the next slot */
1778 *callresult++ = ascii;
1779 break;
1780 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 default:
1782 /* if we stumble upon an unknown
1783 formatting code, copy the rest of
1784 the format string to the output
1785 string. (we cannot just skip the
1786 code, since there's no way to know
1787 what's in the argument list) */
1788 n += strlen(p);
1789 goto expand;
1790 }
1791 } else
1792 n++;
1793 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 we don't have to resize the string.
1798 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 if (!string)
1801 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 kind = PyUnicode_KIND(string);
1803 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001809 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001810
1811 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1813 /* i may equal the length here (hence <= rather than <): the last
1814 argument could be an empty string, which leaves i pointing to the end;
1815 the exact check is the assert at the end of the loop */
1816 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001817
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 switch (*f) {
1819 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001820 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 const int ordinal = va_arg(vargs, int);
1822 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001824 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001825 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 case 'p':
1830 /* unused, since we already have the result */
1831 if (*f == 'p')
1832 (void) va_arg(vargs, void *);
1833 else
1834 (void) va_arg(vargs, int);
1835 /* extract the result from numberresults and append. */
1836 for (; *numberresult; ++i, ++numberresult)
1837 PyUnicode_WRITE(kind, data, i, *numberresult);
1838 /* skip over the separating '\0' */
1839 assert(*numberresult == '\0');
1840 numberresult++;
1841 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 break;
1843 case 's':
1844 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001845 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001847 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 size = PyUnicode_GET_LENGTH(*callresult);
1849 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001850 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1851 *callresult, 0,
1852 size) < 0)
1853 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001855 /* We're done with the unicode()/repr() => forget it */
1856 Py_DECREF(*callresult);
1857 /* switch to next unicode()/repr() result */
1858 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001859 break;
1860 }
1861 case 'U':
1862 {
1863 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 Py_ssize_t size;
1865 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1866 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001867 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1868 obj, 0,
1869 size) < 0)
1870 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 break;
1873 }
1874 case 'V':
1875 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001878 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001879 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 size = PyUnicode_GET_LENGTH(obj);
1881 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001882 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1883 obj, 0,
1884 size) < 0)
1885 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001887 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 size = PyUnicode_GET_LENGTH(*callresult);
1889 assert(PyUnicode_KIND(*callresult) <=
1890 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001891 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1892 *callresult,
1893 0, size) < 0)
1894 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001896 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001898 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 break;
1900 }
1901 case 'S':
1902 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001903 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001905 /* unused, since we already have the result */
1906 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001908 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1909 *callresult, 0,
1910 PyUnicode_GET_LENGTH(*callresult)) < 0)
1911 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 /* We're done with the unicode()/repr() => forget it */
1914 Py_DECREF(*callresult);
1915 /* switch to next unicode()/repr() result */
1916 ++callresult;
1917 break;
1918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001919 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001921 break;
1922 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 for (; *p; ++p, ++i)
1924 PyUnicode_WRITE(kind, data, i, *p);
1925 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 goto end;
1927 }
Victor Stinner1205f272010-09-11 00:54:47 +00001928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 else {
1930 assert(i < PyUnicode_GET_LENGTH(string));
1931 PyUnicode_WRITE(kind, data, i++, *f);
1932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001935
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001937 if (callresults)
1938 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (numberresults)
1940 PyObject_Free(numberresults);
1941 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001943 if (callresults) {
1944 PyObject **callresult2 = callresults;
1945 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001946 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001947 ++callresult2;
1948 }
1949 PyObject_Free(callresults);
1950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 if (numberresults)
1952 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001954}
1955
Walter Dörwaldd2034312007-05-18 16:29:38 +00001956PyObject *
1957PyUnicode_FromFormat(const char *format, ...)
1958{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001959 PyObject* ret;
1960 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001961
1962#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001963 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001964#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001965 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001966#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 ret = PyUnicode_FromFormatV(format, vargs);
1968 va_end(vargs);
1969 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001970}
1971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972#ifdef HAVE_WCHAR_H
1973
Victor Stinner5593d8a2010-10-02 11:11:27 +00001974/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1975 convert a Unicode object to a wide character string.
1976
Victor Stinnerd88d9832011-09-06 02:00:05 +02001977 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001978 character) required to convert the unicode object. Ignore size argument.
1979
Victor Stinnerd88d9832011-09-06 02:00:05 +02001980 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001981 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001982 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001983static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001984unicode_aswidechar(PyUnicodeObject *unicode,
1985 wchar_t *w,
1986 Py_ssize_t size)
1987{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001988 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 const wchar_t *wstr;
1990
1991 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1992 if (wstr == NULL)
1993 return -1;
1994
Victor Stinner5593d8a2010-10-02 11:11:27 +00001995 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001996 if (size > res)
1997 size = res + 1;
1998 else
1999 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002001 return res;
2002 }
2003 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002005}
2006
2007Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002008PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002009 wchar_t *w,
2010 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011{
2012 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002013 PyErr_BadInternalCall();
2014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002016 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017}
2018
Victor Stinner137c34c2010-09-29 10:25:54 +00002019wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002020PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002021 Py_ssize_t *size)
2022{
2023 wchar_t* buffer;
2024 Py_ssize_t buflen;
2025
2026 if (unicode == NULL) {
2027 PyErr_BadInternalCall();
2028 return NULL;
2029 }
2030
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002031 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (buflen == -1)
2033 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002034 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002035 PyErr_NoMemory();
2036 return NULL;
2037 }
2038
Victor Stinner137c34c2010-09-29 10:25:54 +00002039 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2040 if (buffer == NULL) {
2041 PyErr_NoMemory();
2042 return NULL;
2043 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002044 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 if (buflen == -1)
2046 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002047 if (size != NULL)
2048 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002049 return buffer;
2050}
2051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002058 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 PyErr_SetString(PyExc_ValueError,
2060 "chr() arg not in range(0x110000)");
2061 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002062 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064 if (ordinal < 256)
2065 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 v = PyUnicode_New(1, ordinal);
2068 if (v == NULL)
2069 return NULL;
2070 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2071 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002072}
2073
Alexander Belopolsky40018472011-02-26 01:02:56 +00002074PyObject *
2075PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002077 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002079 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002080 if (PyUnicode_READY(obj))
2081 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 Py_INCREF(obj);
2083 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002084 }
2085 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002086 /* For a Unicode subtype that's not a Unicode object,
2087 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002088 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002089 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002090 PyErr_Format(PyExc_TypeError,
2091 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002092 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002093 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002094}
2095
Alexander Belopolsky40018472011-02-26 01:02:56 +00002096PyObject *
2097PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002098 const char *encoding,
2099 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002100{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002101 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002102 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002103
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 PyErr_BadInternalCall();
2106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002108
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002109 /* Decoding bytes objects is the most common case and should be fast */
2110 if (PyBytes_Check(obj)) {
2111 if (PyBytes_GET_SIZE(obj) == 0) {
2112 Py_INCREF(unicode_empty);
2113 v = (PyObject *) unicode_empty;
2114 }
2115 else {
2116 v = PyUnicode_Decode(
2117 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2118 encoding, errors);
2119 }
2120 return v;
2121 }
2122
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002123 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 PyErr_SetString(PyExc_TypeError,
2125 "decoding str is not supported");
2126 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002127 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002128
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002129 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2130 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2131 PyErr_Format(PyExc_TypeError,
2132 "coercing to str: need bytes, bytearray "
2133 "or buffer-like object, %.80s found",
2134 Py_TYPE(obj)->tp_name);
2135 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002136 }
Tim Petersced69f82003-09-16 20:30:58 +00002137
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002138 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002139 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002140 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Tim Petersced69f82003-09-16 20:30:58 +00002142 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002143 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002144
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002145 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002146 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-' (so that e.g. UTF_8 is recognized).

   lower_len is the capacity of `lower`, terminator included.

   Returns 1 on success, with `lower` null-terminated.  Returns 0 if the
   name does not fit (it is longer than lower_len-1 characters); in that
   case `lower` is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the output buffer, reserved for the terminator */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == last)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2181
Alexander Belopolsky40018472011-02-26 01:02:56 +00002182PyObject *
2183PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002184 Py_ssize_t size,
2185 const char *encoding,
2186 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002187{
2188 PyObject *buffer = NULL, *unicode;
2189 Py_buffer info;
2190 char lower[11]; /* Enough for any encoding shortcut */
2191
2192 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002193 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002194
2195 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002196 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002197 if ((strcmp(lower, "utf-8") == 0) ||
2198 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002199 return PyUnicode_DecodeUTF8(s, size, errors);
2200 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002201 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002202 (strcmp(lower, "iso-8859-1") == 0))
2203 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002204#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002205 else if (strcmp(lower, "mbcs") == 0)
2206 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002207#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002208 else if (strcmp(lower, "ascii") == 0)
2209 return PyUnicode_DecodeASCII(s, size, errors);
2210 else if (strcmp(lower, "utf-16") == 0)
2211 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2212 else if (strcmp(lower, "utf-32") == 0)
2213 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002217 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002218 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002219 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002220 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 if (buffer == NULL)
2222 goto onError;
2223 unicode = PyCodec_Decode(buffer, encoding, errors);
2224 if (unicode == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(unicode)) {
2227 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002228 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002229 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 Py_DECREF(unicode);
2231 goto onError;
2232 }
2233 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 if (PyUnicode_READY(unicode)) {
2235 Py_DECREF(unicode);
2236 return NULL;
2237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002239
Benjamin Peterson29060642009-01-31 22:14:21 +00002240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 Py_XDECREF(buffer);
2242 return NULL;
2243}
2244
Alexander Belopolsky40018472011-02-26 01:02:56 +00002245PyObject *
2246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002247 const char *encoding,
2248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002249{
2250 PyObject *v;
2251
2252 if (!PyUnicode_Check(unicode)) {
2253 PyErr_BadArgument();
2254 goto onError;
2255 }
2256
2257 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002258 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002259
2260 /* Decode via the codec registry */
2261 v = PyCodec_Decode(unicode, encoding, errors);
2262 if (v == NULL)
2263 goto onError;
2264 return v;
2265
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002267 return NULL;
2268}
2269
Alexander Belopolsky40018472011-02-26 01:02:56 +00002270PyObject *
2271PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002272 const char *encoding,
2273 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002274{
2275 PyObject *v;
2276
2277 if (!PyUnicode_Check(unicode)) {
2278 PyErr_BadArgument();
2279 goto onError;
2280 }
2281
2282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002284
2285 /* Decode via the codec registry */
2286 v = PyCodec_Decode(unicode, encoding, errors);
2287 if (v == NULL)
2288 goto onError;
2289 if (!PyUnicode_Check(v)) {
2290 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002291 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002292 Py_TYPE(v)->tp_name);
2293 Py_DECREF(v);
2294 goto onError;
2295 }
2296 return v;
2297
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002299 return NULL;
2300}
2301
Alexander Belopolsky40018472011-02-26 01:02:56 +00002302PyObject *
2303PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002304 Py_ssize_t size,
2305 const char *encoding,
2306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307{
2308 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002309
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 unicode = PyUnicode_FromUnicode(s, size);
2311 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2314 Py_DECREF(unicode);
2315 return v;
2316}
2317
Alexander Belopolsky40018472011-02-26 01:02:56 +00002318PyObject *
2319PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002320 const char *encoding,
2321 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002322{
2323 PyObject *v;
2324
2325 if (!PyUnicode_Check(unicode)) {
2326 PyErr_BadArgument();
2327 goto onError;
2328 }
2329
2330 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002332
2333 /* Encode via the codec registry */
2334 v = PyCodec_Encode(unicode, encoding, errors);
2335 if (v == NULL)
2336 goto onError;
2337 return v;
2338
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002340 return NULL;
2341}
2342
Victor Stinnerad158722010-10-27 00:25:46 +00002343PyObject *
2344PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002345{
Victor Stinner99b95382011-07-04 14:23:54 +02002346#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002347 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2348 PyUnicode_GET_SIZE(unicode),
2349 NULL);
2350#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002352#else
Victor Stinner793b5312011-04-27 00:24:21 +02002353 PyInterpreterState *interp = PyThreadState_GET()->interp;
2354 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2355 cannot use it to encode and decode filenames before it is loaded. Load
2356 the Python codec requires to encode at least its own filename. Use the C
2357 version of the locale codec until the codec registry is initialized and
2358 the Python codec is loaded.
2359
2360 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2361 cannot only rely on it: check also interp->fscodec_initialized for
2362 subinterpreters. */
2363 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002364 return PyUnicode_AsEncodedString(unicode,
2365 Py_FileSystemDefaultEncoding,
2366 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002367 }
2368 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002369 /* locale encoding with surrogateescape */
2370 wchar_t *wchar;
2371 char *bytes;
2372 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002373 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002374
2375 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2376 if (wchar == NULL)
2377 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002378 bytes = _Py_wchar2char(wchar, &error_pos);
2379 if (bytes == NULL) {
2380 if (error_pos != (size_t)-1) {
2381 char *errmsg = strerror(errno);
2382 PyObject *exc = NULL;
2383 if (errmsg == NULL)
2384 errmsg = "Py_wchar2char() failed";
2385 raise_encode_exception(&exc,
2386 "filesystemencoding",
2387 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2388 error_pos, error_pos+1,
2389 errmsg);
2390 Py_XDECREF(exc);
2391 }
2392 else
2393 PyErr_NoMemory();
2394 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002395 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002396 }
2397 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002398
2399 bytes_obj = PyBytes_FromString(bytes);
2400 PyMem_Free(bytes);
2401 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002402 }
Victor Stinnerad158722010-10-27 00:25:46 +00002403#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002404}
2405
Alexander Belopolsky40018472011-02-26 01:02:56 +00002406PyObject *
2407PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002408 const char *encoding,
2409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410{
2411 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002412 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002413
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 if (!PyUnicode_Check(unicode)) {
2415 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417 }
Fred Drakee4315f52000-05-09 19:53:39 +00002418
Victor Stinner2f283c22011-03-02 01:21:46 +00002419 if (encoding == NULL) {
2420 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002424 }
Fred Drakee4315f52000-05-09 19:53:39 +00002425
2426 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002427 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002428 if ((strcmp(lower, "utf-8") == 0) ||
2429 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002430 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002431 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002433 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002435 }
Victor Stinner37296e82010-06-10 13:36:23 +00002436 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002437 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002438 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002440#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002441 else if (strcmp(lower, "mbcs") == 0)
2442 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2443 PyUnicode_GET_SIZE(unicode),
2444 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002445#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002446 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449
2450 /* Encode via the codec registry */
2451 v = PyCodec_Encode(unicode, encoding, errors);
2452 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002453 return NULL;
2454
2455 /* The normal path */
2456 if (PyBytes_Check(v))
2457 return v;
2458
2459 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002460 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002461 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002462 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002463
2464 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2465 "encoder %s returned bytearray instead of bytes",
2466 encoding);
2467 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002468 Py_DECREF(v);
2469 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002470 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002471
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002472 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2473 Py_DECREF(v);
2474 return b;
2475 }
2476
2477 PyErr_Format(PyExc_TypeError,
2478 "encoder did not return a bytes object (type=%.400s)",
2479 Py_TYPE(v)->tp_name);
2480 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002481 return NULL;
2482}
2483
Alexander Belopolsky40018472011-02-26 01:02:56 +00002484PyObject *
2485PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002486 const char *encoding,
2487 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002488{
2489 PyObject *v;
2490
2491 if (!PyUnicode_Check(unicode)) {
2492 PyErr_BadArgument();
2493 goto onError;
2494 }
2495
2496 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002498
2499 /* Encode via the codec registry */
2500 v = PyCodec_Encode(unicode, encoding, errors);
2501 if (v == NULL)
2502 goto onError;
2503 if (!PyUnicode_Check(v)) {
2504 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002505 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002506 Py_TYPE(v)->tp_name);
2507 Py_DECREF(v);
2508 goto onError;
2509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002511
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 return NULL;
2514}
2515
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002516PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002517PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002518 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002519 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2520}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002521
Christian Heimes5894ba72007-11-04 11:43:14 +00002522PyObject*
2523PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2524{
Victor Stinner99b95382011-07-04 14:23:54 +02002525#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002526 return PyUnicode_DecodeMBCS(s, size, NULL);
2527#elif defined(__APPLE__)
2528 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2529#else
Victor Stinner793b5312011-04-27 00:24:21 +02002530 PyInterpreterState *interp = PyThreadState_GET()->interp;
2531 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2532 cannot use it to encode and decode filenames before it is loaded. Load
2533 the Python codec requires to encode at least its own filename. Use the C
2534 version of the locale codec until the codec registry is initialized and
2535 the Python codec is loaded.
2536
2537 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2538 cannot only rely on it: check also interp->fscodec_initialized for
2539 subinterpreters. */
2540 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002541 return PyUnicode_Decode(s, size,
2542 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002543 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002544 }
2545 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002546 /* locale encoding with surrogateescape */
2547 wchar_t *wchar;
2548 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002549 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002550
2551 if (s[size] != '\0' || size != strlen(s)) {
2552 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2553 return NULL;
2554 }
2555
Victor Stinner168e1172010-10-16 23:16:16 +00002556 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002557 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002558 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002559
Victor Stinner168e1172010-10-16 23:16:16 +00002560 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002561 PyMem_Free(wchar);
2562 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002563 }
Victor Stinnerad158722010-10-27 00:25:46 +00002564#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002565}
2566
Martin v. Löwis011e8422009-05-05 04:43:17 +00002567
2568int
2569PyUnicode_FSConverter(PyObject* arg, void* addr)
2570{
2571 PyObject *output = NULL;
2572 Py_ssize_t size;
2573 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002574 if (arg == NULL) {
2575 Py_DECREF(*(PyObject**)addr);
2576 return 1;
2577 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002578 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002579 output = arg;
2580 Py_INCREF(output);
2581 }
2582 else {
2583 arg = PyUnicode_FromObject(arg);
2584 if (!arg)
2585 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002586 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002587 Py_DECREF(arg);
2588 if (!output)
2589 return 0;
2590 if (!PyBytes_Check(output)) {
2591 Py_DECREF(output);
2592 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2593 return 0;
2594 }
2595 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002596 size = PyBytes_GET_SIZE(output);
2597 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002598 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002599 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002600 Py_DECREF(output);
2601 return 0;
2602 }
2603 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002604 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002605}
2606
2607
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002608int
2609PyUnicode_FSDecoder(PyObject* arg, void* addr)
2610{
2611 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002612 if (arg == NULL) {
2613 Py_DECREF(*(PyObject**)addr);
2614 return 1;
2615 }
2616 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 if (PyUnicode_READY(arg))
2618 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002619 output = arg;
2620 Py_INCREF(output);
2621 }
2622 else {
2623 arg = PyBytes_FromObject(arg);
2624 if (!arg)
2625 return 0;
2626 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2627 PyBytes_GET_SIZE(arg));
2628 Py_DECREF(arg);
2629 if (!output)
2630 return 0;
2631 if (!PyUnicode_Check(output)) {
2632 Py_DECREF(output);
2633 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2634 return 0;
2635 }
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2638 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002639 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2640 Py_DECREF(output);
2641 return 0;
2642 }
2643 *(PyObject**)addr = output;
2644 return Py_CLEANUP_SUPPORTED;
2645}
2646
2647
Martin v. Löwis5b222132007-06-10 09:51:05 +00002648char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002650{
Christian Heimesf3863112007-11-22 07:46:41 +00002651 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2653
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002654 if (!PyUnicode_Check(unicode)) {
2655 PyErr_BadArgument();
2656 return NULL;
2657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002659 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002661 if (PyUnicode_UTF8(unicode) == NULL) {
2662 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2664 if (bytes == NULL)
2665 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002666 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2667 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 Py_DECREF(bytes);
2669 return NULL;
2670 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002671 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2672 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 Py_DECREF(bytes);
2674 }
2675
2676 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002677 *psize = PyUnicode_UTF8_LENGTH(unicode);
2678 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002679}
2680
2681char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2685}
2686
2687#ifdef Py_DEBUG
2688int unicode_as_unicode_calls = 0;
2689#endif
2690
2691
2692Py_UNICODE *
2693PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2694{
2695 PyUnicodeObject *u;
2696 const unsigned char *one_byte;
2697#if SIZEOF_WCHAR_T == 4
2698 const Py_UCS2 *two_bytes;
2699#else
2700 const Py_UCS4 *four_bytes;
2701 const Py_UCS4 *ucs4_end;
2702 Py_ssize_t num_surrogates;
2703#endif
2704 wchar_t *w;
2705 wchar_t *wchar_end;
2706
2707 if (!PyUnicode_Check(unicode)) {
2708 PyErr_BadArgument();
2709 return NULL;
2710 }
2711 u = (PyUnicodeObject*)unicode;
2712 if (_PyUnicode_WSTR(u) == NULL) {
2713 /* Non-ASCII compact unicode object */
2714 assert(_PyUnicode_KIND(u) != 0);
2715 assert(PyUnicode_IS_READY(u));
2716
2717#ifdef Py_DEBUG
2718 ++unicode_as_unicode_calls;
2719#endif
2720
2721 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2722#if SIZEOF_WCHAR_T == 2
2723 four_bytes = PyUnicode_4BYTE_DATA(u);
2724 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2725 num_surrogates = 0;
2726
2727 for (; four_bytes < ucs4_end; ++four_bytes) {
2728 if (*four_bytes > 0xFFFF)
2729 ++num_surrogates;
2730 }
2731
2732 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2733 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2734 if (!_PyUnicode_WSTR(u)) {
2735 PyErr_NoMemory();
2736 return NULL;
2737 }
2738 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2739
2740 w = _PyUnicode_WSTR(u);
2741 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2742 four_bytes = PyUnicode_4BYTE_DATA(u);
2743 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2744 if (*four_bytes > 0xFFFF) {
2745 /* encode surrogate pair in this case */
2746 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2747 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2748 }
2749 else
2750 *w = *four_bytes;
2751
2752 if (w > wchar_end) {
2753 assert(0 && "Miscalculated string end");
2754 }
2755 }
2756 *w = 0;
2757#else
2758 /* sizeof(wchar_t) == 4 */
2759 Py_FatalError("Impossible unicode object state, wstr and str "
2760 "should share memory already.");
2761 return NULL;
2762#endif
2763 }
2764 else {
2765 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2766 (_PyUnicode_LENGTH(u) + 1));
2767 if (!_PyUnicode_WSTR(u)) {
2768 PyErr_NoMemory();
2769 return NULL;
2770 }
2771 if (!PyUnicode_IS_COMPACT_ASCII(u))
2772 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2773 w = _PyUnicode_WSTR(u);
2774 wchar_end = w + _PyUnicode_LENGTH(u);
2775
2776 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2777 one_byte = PyUnicode_1BYTE_DATA(u);
2778 for (; w < wchar_end; ++one_byte, ++w)
2779 *w = *one_byte;
2780 /* null-terminate the wstr */
2781 *w = 0;
2782 }
2783 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2784#if SIZEOF_WCHAR_T == 4
2785 two_bytes = PyUnicode_2BYTE_DATA(u);
2786 for (; w < wchar_end; ++two_bytes, ++w)
2787 *w = *two_bytes;
2788 /* null-terminate the wstr */
2789 *w = 0;
2790#else
2791 /* sizeof(wchar_t) == 2 */
2792 PyObject_FREE(_PyUnicode_WSTR(u));
2793 _PyUnicode_WSTR(u) = NULL;
2794 Py_FatalError("Impossible unicode object state, wstr "
2795 "and str should share memory already.");
2796 return NULL;
2797#endif
2798 }
2799 else {
2800 assert(0 && "This should never happen.");
2801 }
2802 }
2803 }
2804 if (size != NULL)
2805 *size = PyUnicode_WSTR_LENGTH(u);
2806 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002807}
2808
Alexander Belopolsky40018472011-02-26 01:02:56 +00002809Py_UNICODE *
2810PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813}
2814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816Py_ssize_t
2817PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818{
2819 if (!PyUnicode_Check(unicode)) {
2820 PyErr_BadArgument();
2821 goto onError;
2822 }
2823 return PyUnicode_GET_SIZE(unicode);
2824
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 return -1;
2827}
2828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829Py_ssize_t
2830PyUnicode_GetLength(PyObject *unicode)
2831{
2832 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2833 PyErr_BadArgument();
2834 return -1;
2835 }
2836
2837 return PyUnicode_GET_LENGTH(unicode);
2838}
2839
2840Py_UCS4
2841PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2842{
2843 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2844 return PyErr_BadArgument();
2845 return (Py_UCS4)-1;
2846 }
2847 return PyUnicode_READ_CHAR(unicode, index);
2848}
2849
2850int
2851PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2852{
2853 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2854 return PyErr_BadArgument();
2855 return -1;
2856 }
2857
2858 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2859 index, ch);
2860 return 0;
2861}
2862
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2868
Victor Stinner554f3f02010-06-16 23:33:54 +00002869/* create or adjust a UnicodeDecodeError */
2870static void
2871make_decode_exception(PyObject **exceptionObject,
2872 const char *encoding,
2873 const char *input, Py_ssize_t length,
2874 Py_ssize_t startpos, Py_ssize_t endpos,
2875 const char *reason)
2876{
2877 if (*exceptionObject == NULL) {
2878 *exceptionObject = PyUnicodeDecodeError_Create(
2879 encoding, input, length, startpos, endpos, reason);
2880 }
2881 else {
2882 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2883 goto onError;
2884 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2885 goto onError;
2886 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2887 goto onError;
2888 }
2889 return;
2890
2891onError:
2892 Py_DECREF(*exceptionObject);
2893 *exceptionObject = NULL;
2894}
2895
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002896/* error handling callback helper:
2897 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002898 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 and adjust various state variables.
2900 return 0 on success, -1 on error
2901*/
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903static int
2904unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002905 const char *encoding, const char *reason,
2906 const char **input, const char **inend, Py_ssize_t *startinpos,
2907 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2908 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002910 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911
2912 PyObject *restuple = NULL;
2913 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002914 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002915 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 Py_ssize_t requiredsize;
2917 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002918 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002919 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002920 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 int res = -1;
2922
2923 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 *errorHandler = PyCodec_LookupError(errors);
2925 if (*errorHandler == NULL)
2926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927 }
2928
Victor Stinner554f3f02010-06-16 23:33:54 +00002929 make_decode_exception(exceptionObject,
2930 encoding,
2931 *input, *inend - *input,
2932 *startinpos, *endinpos,
2933 reason);
2934 if (*exceptionObject == NULL)
2935 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936
2937 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2938 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002941 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 }
2944 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002946
2947 /* Copy back the bytes variables, which might have been modified by the
2948 callback */
2949 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2950 if (!inputobj)
2951 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002952 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002954 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002955 *input = PyBytes_AS_STRING(inputobj);
2956 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002957 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002958 /* we can DECREF safely, as the exception has another reference,
2959 so the object won't go away. */
2960 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002963 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002964 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2966 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002967 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968
2969 /* need more space? (at least enough for what we
2970 have+the replacement+the rest of the string (starting
2971 at the new input position), so we won't have to check space
2972 when there are no errors in the rest of the string) */
2973 repptr = PyUnicode_AS_UNICODE(repunicode);
2974 repsize = PyUnicode_GET_SIZE(repunicode);
2975 requiredsize = *outpos + repsize + insize-newpos;
2976 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 if (requiredsize<2*outsize)
2978 requiredsize = 2*outsize;
2979 if (_PyUnicode_Resize(output, requiredsize) < 0)
2980 goto onError;
2981 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 }
2983 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002984 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 Py_UNICODE_COPY(*outptr, repptr, repsize);
2986 *outptr += repsize;
2987 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002989 /* we made it! */
2990 res = 0;
2991
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993 Py_XDECREF(restuple);
2994 return res;
2995}
2996
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002997/* --- UTF-7 Codec -------------------------------------------------------- */
2998
Antoine Pitrou244651a2009-05-04 18:56:13 +00002999/* See RFC2152 for details. We encode conservatively and decode liberally. */
3000
/* Base-64 helpers for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so only pass side-effect-free expressions. */

/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* 6-bit value of the base-64 character c.  The caller is expected to
   have checked IS_BASE64(c) first: any character that is not a letter,
   a digit or '+' yields 63, the value of '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character that encodes the low 6 bits of n; higher bits of n
   are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero when a byte met outside a shift sequence
 * stands for itself.  Decoding is permissive: every ASCII byte is taken
 * literally except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)  \
    ((c) != '+' && (c) <= 127)
3031
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; evaluated several times, so it must be
 *            free of side effects.  NUL and anything >= 128 are never
 *            direct (and never index the table).
 * directO  - non-zero if Set O characters may be written literally.
 * directWS - non-zero if whitespace may be written literally.
 *
 * The flag arguments are parenthesized so that compound expressions
 * such as `a || b` combine correctly with the inner `&&`. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 Py_ssize_t size,
3081 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003082{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003083 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3084}
3085
Antoine Pitrou244651a2009-05-04 18:56:13 +00003086/* The decoder. The only state we preserve is our read position,
3087 * i.e. how many characters we have consumed. So if we end in the
3088 * middle of a shift sequence we have to back off the read position
3089 * and the output to the beginning of the sequence, otherwise we lose
3090 * all the shift state (seen bits, number of bits seen, high
3091 * surrogate). */
3092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093PyObject *
3094PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003095 Py_ssize_t size,
3096 const char *errors,
3097 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003098{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003100 Py_ssize_t startinpos;
3101 Py_ssize_t endinpos;
3102 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003103 const char *e;
3104 PyUnicodeObject *unicode;
3105 Py_UNICODE *p;
3106 const char *errmsg = "";
3107 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003108 Py_UNICODE *shiftOutStart;
3109 unsigned int base64bits = 0;
3110 unsigned long base64buffer = 0;
3111 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 PyObject *errorHandler = NULL;
3113 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003114
3115 unicode = _PyUnicode_New(size);
3116 if (!unicode)
3117 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003118 if (size == 0) {
3119 if (consumed)
3120 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003121 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003122 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003125 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003126 e = s + size;
3127
3128 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003129 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003131 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003132
Antoine Pitrou244651a2009-05-04 18:56:13 +00003133 if (inShift) { /* in a base-64 section */
3134 if (IS_BASE64(ch)) { /* consume a base-64 character */
3135 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3136 base64bits += 6;
3137 s++;
3138 if (base64bits >= 16) {
3139 /* we have enough bits for a UTF-16 value */
3140 Py_UNICODE outCh = (Py_UNICODE)
3141 (base64buffer >> (base64bits-16));
3142 base64bits -= 16;
3143 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3144 if (surrogate) {
3145 /* expecting a second surrogate */
3146 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3147#ifdef Py_UNICODE_WIDE
3148 *p++ = (((surrogate & 0x3FF)<<10)
3149 | (outCh & 0x3FF)) + 0x10000;
3150#else
3151 *p++ = surrogate;
3152 *p++ = outCh;
3153#endif
3154 surrogate = 0;
3155 }
3156 else {
3157 surrogate = 0;
3158 errmsg = "second surrogate missing";
3159 goto utf7Error;
3160 }
3161 }
3162 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3163 /* first surrogate */
3164 surrogate = outCh;
3165 }
3166 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3167 errmsg = "unexpected second surrogate";
3168 goto utf7Error;
3169 }
3170 else {
3171 *p++ = outCh;
3172 }
3173 }
3174 }
3175 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003176 inShift = 0;
3177 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003178 if (surrogate) {
3179 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003180 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003181 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003182 if (base64bits > 0) { /* left-over bits */
3183 if (base64bits >= 6) {
3184 /* We've seen at least one base-64 character */
3185 errmsg = "partial character in shift sequence";
3186 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003187 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003188 else {
3189 /* Some bits remain; they should be zero */
3190 if (base64buffer != 0) {
3191 errmsg = "non-zero padding bits in shift sequence";
3192 goto utf7Error;
3193 }
3194 }
3195 }
3196 if (ch != '-') {
3197 /* '-' is absorbed; other terminating
3198 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003199 *p++ = ch;
3200 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003201 }
3202 }
3203 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003205 s++; /* consume '+' */
3206 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003207 s++;
3208 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003209 }
3210 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003211 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003212 shiftOutStart = p;
3213 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003214 }
3215 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003216 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 *p++ = ch;
3218 s++;
3219 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003220 else {
3221 startinpos = s-starts;
3222 s++;
3223 errmsg = "unexpected special character";
3224 goto utf7Error;
3225 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003226 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003227utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003228 outpos = p-PyUnicode_AS_UNICODE(unicode);
3229 endinpos = s-starts;
3230 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 errors, &errorHandler,
3232 "utf7", errmsg,
3233 &starts, &e, &startinpos, &endinpos, &exc, &s,
3234 &unicode, &outpos, &p))
3235 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003236 }
3237
Antoine Pitrou244651a2009-05-04 18:56:13 +00003238 /* end of string */
3239
3240 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3241 /* if we're in an inconsistent state, that's an error */
3242 if (surrogate ||
3243 (base64bits >= 6) ||
3244 (base64bits > 0 && base64buffer != 0)) {
3245 outpos = p-PyUnicode_AS_UNICODE(unicode);
3246 endinpos = size;
3247 if (unicode_decode_call_errorhandler(
3248 errors, &errorHandler,
3249 "utf7", "unterminated shift sequence",
3250 &starts, &e, &startinpos, &endinpos, &exc, &s,
3251 &unicode, &outpos, &p))
3252 goto onError;
3253 if (s < e)
3254 goto restart;
3255 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003256 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003257
3258 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003259 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003260 if (inShift) {
3261 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003262 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003263 }
3264 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003265 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003266 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003269 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003270 goto onError;
3271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003274 if (PyUnicode_READY(unicode) == -1) {
3275 Py_DECREF(unicode);
3276 return NULL;
3277 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 return (PyObject *)unicode;
3279
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 Py_XDECREF(errorHandler);
3282 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003283 Py_DECREF(unicode);
3284 return NULL;
3285}
3286
3287
Alexander Belopolsky40018472011-02-26 01:02:56 +00003288PyObject *
3289PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003290 Py_ssize_t size,
3291 int base64SetO,
3292 int base64WhiteSpace,
3293 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003294{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003295 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003296 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003297 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003298 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003299 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003300 unsigned int base64bits = 0;
3301 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003302 char * out;
3303 char * start;
3304
3305 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003307
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003308 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003309 return PyErr_NoMemory();
3310
Antoine Pitrou244651a2009-05-04 18:56:13 +00003311 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003312 if (v == NULL)
3313 return NULL;
3314
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 for (;i < size; ++i) {
3317 Py_UNICODE ch = s[i];
3318
Antoine Pitrou244651a2009-05-04 18:56:13 +00003319 if (inShift) {
3320 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3321 /* shifting out */
3322 if (base64bits) { /* output remaining bits */
3323 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3324 base64buffer = 0;
3325 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003326 }
3327 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003328 /* Characters not in the BASE64 set implicitly unshift the sequence
3329 so no '-' is required, except if the character is itself a '-' */
3330 if (IS_BASE64(ch) || ch == '-') {
3331 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003333 *out++ = (char) ch;
3334 }
3335 else {
3336 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003337 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003339 else { /* not in a shift sequence */
3340 if (ch == '+') {
3341 *out++ = '+';
3342 *out++ = '-';
3343 }
3344 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3345 *out++ = (char) ch;
3346 }
3347 else {
3348 *out++ = '+';
3349 inShift = 1;
3350 goto encode_char;
3351 }
3352 }
3353 continue;
3354encode_char:
3355#ifdef Py_UNICODE_WIDE
3356 if (ch >= 0x10000) {
3357 /* code first surrogate */
3358 base64bits += 16;
3359 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3360 while (base64bits >= 6) {
3361 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3362 base64bits -= 6;
3363 }
3364 /* prepare second surrogate */
3365 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3366 }
3367#endif
3368 base64bits += 16;
3369 base64buffer = (base64buffer << 16) | ch;
3370 while (base64bits >= 6) {
3371 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3372 base64bits -= 6;
3373 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003374 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003375 if (base64bits)
3376 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3377 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003378 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003379 if (_PyBytes_Resize(&v, out - start) < 0)
3380 return NULL;
3381 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003382}
3383
Antoine Pitrou244651a2009-05-04 18:56:13 +00003384#undef IS_BASE64
3385#undef FROM_BASE64
3386#undef TO_BASE64
3387#undef DECODE_DIRECT
3388#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003389
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390/* --- UTF-8 Codec -------------------------------------------------------- */
3391
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3413
Alexander Belopolsky40018472011-02-26 01:02:56 +00003414PyObject *
3415PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003416 Py_ssize_t size,
3417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418{
Walter Dörwald69652032004-09-07 20:24:22 +00003419 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3420}
3421
Antoine Pitrouab868312009-01-10 15:40:25 +00003422/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3423#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3424
3425/* Mask to quickly check whether a C 'long' contains a
3426 non-ASCII, UTF8-encoded char. */
3427#if (SIZEOF_LONG == 8)
3428# define ASCII_CHAR_MASK 0x8080808080808080L
3429#elif (SIZEOF_LONG == 4)
3430# define ASCII_CHAR_MASK 0x80808080L
3431#else
3432# error C 'long' size should be either 4 or 8!
3433#endif
3434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003435/* Scans a UTF-8 string and returns the maximum character to be expected,
3436 the size of the decoded unicode string and if any major errors were
3437 encountered.
3438
3439 This function does check basic UTF-8 sanity, it does however NOT CHECK
3440 if the string contains surrogates, and if all continuation bytes are
3441 within the correct ranges, these checks are performed in
3442 PyUnicode_DecodeUTF8Stateful.
3443
3444 If it sets has_errors to 1, it means the value of unicode_size and max_char
3445 will be bogus and you should not rely on useful information in them.
3446 */
3447static Py_UCS4
3448utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3449 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3450 int *has_errors)
3451{
3452 Py_ssize_t n;
3453 Py_ssize_t char_count = 0;
3454 Py_UCS4 max_char = 127, new_max;
3455 Py_UCS4 upper_bound;
3456 const unsigned char *p = (const unsigned char *)s;
3457 const unsigned char *end = p + string_size;
3458 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3459 int err = 0;
3460
3461 for (; p < end && !err; ++p, ++char_count) {
3462 /* Only check value if it's not a ASCII char... */
3463 if (*p < 0x80) {
3464 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3465 an explanation. */
3466 if (!((size_t) p & LONG_PTR_MASK)) {
3467 /* Help register allocation */
3468 register const unsigned char *_p = p;
3469 while (_p < aligned_end) {
3470 unsigned long value = *(unsigned long *) _p;
3471 if (value & ASCII_CHAR_MASK)
3472 break;
3473 _p += SIZEOF_LONG;
3474 char_count += SIZEOF_LONG;
3475 }
3476 p = _p;
3477 if (p == end)
3478 break;
3479 }
3480 }
3481 if (*p >= 0x80) {
3482 n = utf8_code_length[*p];
3483 new_max = max_char;
3484 switch (n) {
3485 /* invalid start byte */
3486 case 0:
3487 err = 1;
3488 break;
3489 case 2:
3490 /* Code points between 0x00FF and 0x07FF inclusive.
3491 Approximate the upper bound of the code point,
3492 if this flips over 255 we can be sure it will be more
3493 than 255 and the string will need 2 bytes per code coint,
3494 if it stays under or equal to 255, we can be sure 1 byte
3495 is enough.
3496 ((*p & 0b00011111) << 6) | 0b00111111 */
3497 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3498 if (max_char < upper_bound)
3499 new_max = upper_bound;
3500 /* Ensure we track at least that we left ASCII space. */
3501 if (new_max < 128)
3502 new_max = 128;
3503 break;
3504 case 3:
3505 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3506 always > 255 and <= 65535 and will always need 2 bytes. */
3507 if (max_char < 65535)
3508 new_max = 65535;
3509 break;
3510 case 4:
3511 /* Code point will be above 0xFFFF for sure in this case. */
3512 new_max = 65537;
3513 break;
3514 /* Internal error, this should be caught by the first if */
3515 case 1:
3516 default:
3517 assert(0 && "Impossible case in utf8_max_char_and_size");
3518 err = 1;
3519 }
3520 /* Instead of number of overall bytes for this code point,
3521 n containts the number of following bytes: */
3522 --n;
3523 /* Check if the follow up chars are all valid continuation bytes */
3524 if (n >= 1) {
3525 const unsigned char *cont;
3526 if ((p + n) >= end) {
3527 if (consumed == 0)
3528 /* incomplete data, non-incremental decoding */
3529 err = 1;
3530 break;
3531 }
3532 for (cont = p + 1; cont < (p + n); ++cont) {
3533 if ((*cont & 0xc0) != 0x80) {
3534 err = 1;
3535 break;
3536 }
3537 }
3538 p += n;
3539 }
3540 else
3541 err = 1;
3542 max_char = new_max;
3543 }
3544 }
3545
3546 if (unicode_size)
3547 *unicode_size = char_count;
3548 if (has_errors)
3549 *has_errors = err;
3550 return max_char;
3551}
3552
/* Store `value` at position `index` of the character buffer `buf`,
   casting to the element type selected by `kind`.  Like PyUnicode_WRITE,
   but also handles PyUnicode_WCHAR_KIND, i.e. the wstr field of the
   legacy unicode representation.  Any kind other than WCHAR, 1BYTE or
   2BYTE is written as Py_UCS4.  `kind` is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int which_ = (kind);                                      \
        switch (which_) {                                               \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3567
Alexander Belopolsky40018472011-02-26 01:02:56 +00003568PyObject *
3569PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003570 Py_ssize_t size,
3571 const char *errors,
3572 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003576 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003577 Py_ssize_t startinpos;
3578 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003579 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003581 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 PyObject *errorHandler = NULL;
3583 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584 Py_UCS4 maxchar = 0;
3585 Py_ssize_t unicode_size;
3586 Py_ssize_t i;
3587 int kind;
3588 void *data;
3589 int has_errors;
3590 Py_UNICODE *error_outptr;
3591#if SIZEOF_WCHAR_T == 2
3592 Py_ssize_t wchar_offset = 0;
3593#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594
Walter Dörwald69652032004-09-07 20:24:22 +00003595 if (size == 0) {
3596 if (consumed)
3597 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003600 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3601 consumed, &has_errors);
3602 if (has_errors) {
3603 unicode = _PyUnicode_New(size);
3604 if (!unicode)
3605 return NULL;
3606 kind = PyUnicode_WCHAR_KIND;
3607 data = PyUnicode_AS_UNICODE(unicode);
3608 assert(data != NULL);
3609 }
3610 else {
3611 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3612 if (!unicode)
3613 return NULL;
3614 /* When the string is ASCII only, just use memcpy and return.
3615 unicode_size may be != size if there is an incomplete UTF-8
3616 sequence at the end of the ASCII block. */
3617 if (maxchar < 128 && size == unicode_size) {
3618 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3619 return (PyObject *)unicode;
3620 }
3621 kind = PyUnicode_KIND(unicode);
3622 data = PyUnicode_DATA(unicode);
3623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003625 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003627 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628
3629 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003630 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
3632 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003633 /* Fast path for runs of ASCII characters. Given that common UTF-8
3634 input will consist of an overwhelming majority of ASCII
3635 characters, we try to optimize for this case by checking
3636 as many characters as a C 'long' can contain.
3637 First, check if we can do an aligned read, as most CPUs have
3638 a penalty for unaligned reads.
3639 */
3640 if (!((size_t) s & LONG_PTR_MASK)) {
3641 /* Help register allocation */
3642 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003643 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003644 while (_s < aligned_end) {
3645 /* Read a whole long at a time (either 4 or 8 bytes),
3646 and do a fast unrolled copy if it only contains ASCII
3647 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003648 unsigned long value = *(unsigned long *) _s;
3649 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003650 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003651 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3652 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3653 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3654 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003655#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3657 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3658 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3659 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003660#endif
3661 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003663 }
3664 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003666 if (s == e)
3667 break;
3668 ch = (unsigned char)*s;
3669 }
3670 }
3671
3672 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003673 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 s++;
3675 continue;
3676 }
3677
3678 n = utf8_code_length[ch];
3679
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003680 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 if (consumed)
3682 break;
3683 else {
3684 errmsg = "unexpected end of data";
3685 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003686 endinpos = startinpos+1;
3687 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3688 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 goto utf8Error;
3690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692
3693 switch (n) {
3694
3695 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003696 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 startinpos = s-starts;
3698 endinpos = startinpos+1;
3699 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700
3701 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003702 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 startinpos = s-starts;
3704 endinpos = startinpos+1;
3705 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706
3707 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003708 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003709 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003711 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 goto utf8Error;
3713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003715 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 break;
3718
3719 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003720 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3721 will result in surrogates in range d800-dfff. Surrogates are
3722 not valid UTF-8 so they are rejected.
3723 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3724 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003725 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003726 (s[2] & 0xc0) != 0x80 ||
3727 ((unsigned char)s[0] == 0xE0 &&
3728 (unsigned char)s[1] < 0xA0) ||
3729 ((unsigned char)s[0] == 0xED &&
3730 (unsigned char)s[1] > 0x9F)) {
3731 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003733 endinpos = startinpos + 1;
3734
3735 /* if s[1] first two bits are 1 and 0, then the invalid
3736 continuation byte is s[2], so increment endinpos by 1,
3737 if not, s[1] is invalid and endinpos doesn't need to
3738 be incremented. */
3739 if ((s[1] & 0xC0) == 0x80)
3740 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 goto utf8Error;
3742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003744 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003745 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003746 break;
3747
3748 case 4:
3749 if ((s[1] & 0xc0) != 0x80 ||
3750 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003751 (s[3] & 0xc0) != 0x80 ||
3752 ((unsigned char)s[0] == 0xF0 &&
3753 (unsigned char)s[1] < 0x90) ||
3754 ((unsigned char)s[0] == 0xF4 &&
3755 (unsigned char)s[1] > 0x8F)) {
3756 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003758 endinpos = startinpos + 1;
3759 if ((s[1] & 0xC0) == 0x80) {
3760 endinpos++;
3761 if ((s[2] & 0xC0) == 0x80)
3762 endinpos++;
3763 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 goto utf8Error;
3765 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003766 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003767 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3768 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 /* If the string is flexible or we have native UCS-4, write
3771 directly.. */
3772 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3773 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 else {
3776 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 /* translate from 10000..10FFFF to 0..FFFF */
3779 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 /* high surrogate = top 10 bits added to D800 */
3782 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3783 (Py_UNICODE)(0xD800 + (ch >> 10)));
3784
3785 /* low surrogate = bottom 10 bits added to DC00 */
3786 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3787 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3788 }
3789#if SIZEOF_WCHAR_T == 2
3790 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003791#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 }
3794 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003796
Benjamin Peterson29060642009-01-31 22:14:21 +00003797 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798 /* If this is not yet a resizable string, make it one.. */
3799 if (kind != PyUnicode_WCHAR_KIND) {
3800 const Py_UNICODE *u;
3801 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3802 if (!new_unicode)
3803 goto onError;
3804 u = PyUnicode_AsUnicode((PyObject *)unicode);
3805 if (!u)
3806 goto onError;
3807#if SIZEOF_WCHAR_T == 2
3808 i += wchar_offset;
3809#endif
3810 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3811 Py_DECREF(unicode);
3812 unicode = new_unicode;
3813 kind = 0;
3814 data = PyUnicode_AS_UNICODE(new_unicode);
3815 assert(data != NULL);
3816 }
3817 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 if (unicode_decode_call_errorhandler(
3819 errors, &errorHandler,
3820 "utf8", errmsg,
3821 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 /* Update data because unicode_decode_call_errorhandler might have
3825 re-created or resized the unicode object. */
3826 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 /* Ensure the unicode_size calculation above was correct: */
3830 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3831
Walter Dörwald69652032004-09-07 20:24:22 +00003832 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 /* Adjust length and ready string when it contained errors and
3836 is of the old resizable kind. */
3837 if (kind == PyUnicode_WCHAR_KIND) {
3838 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3839 PyUnicode_READY(unicode) == -1)
3840 goto onError;
3841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003843 Py_XDECREF(errorHandler);
3844 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 if (PyUnicode_READY(unicode) == -1) {
3846 Py_DECREF(unicode);
3847 return NULL;
3848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 return (PyObject *)unicode;
3850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 Py_XDECREF(errorHandler);
3853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 Py_DECREF(unicode);
3855 return NULL;
3856}
3857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003859
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003860#ifdef __APPLE__
3861
3862/* Simplified UTF-8 decoder using surrogateescape error handler,
3863 used to decode the command line arguments on Mac OS X. */
3864
3865wchar_t*
3866_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3867{
3868 int n;
3869 const char *e;
3870 wchar_t *unicode, *p;
3871
3872 /* Note: size will always be longer than the resulting Unicode
3873 character count */
3874 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3875 PyErr_NoMemory();
3876 return NULL;
3877 }
3878 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3879 if (!unicode)
3880 return NULL;
3881
3882 /* Unpack UTF-8 encoded data */
3883 p = unicode;
3884 e = s + size;
3885 while (s < e) {
3886 Py_UCS4 ch = (unsigned char)*s;
3887
3888 if (ch < 0x80) {
3889 *p++ = (wchar_t)ch;
3890 s++;
3891 continue;
3892 }
3893
3894 n = utf8_code_length[ch];
3895 if (s + n > e) {
3896 goto surrogateescape;
3897 }
3898
3899 switch (n) {
3900 case 0:
3901 case 1:
3902 goto surrogateescape;
3903
3904 case 2:
3905 if ((s[1] & 0xc0) != 0x80)
3906 goto surrogateescape;
3907 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3908 assert ((ch > 0x007F) && (ch <= 0x07FF));
3909 *p++ = (wchar_t)ch;
3910 break;
3911
3912 case 3:
3913 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3914 will result in surrogates in range d800-dfff. Surrogates are
3915 not valid UTF-8 so they are rejected.
3916 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3917 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3918 if ((s[1] & 0xc0) != 0x80 ||
3919 (s[2] & 0xc0) != 0x80 ||
3920 ((unsigned char)s[0] == 0xE0 &&
3921 (unsigned char)s[1] < 0xA0) ||
3922 ((unsigned char)s[0] == 0xED &&
3923 (unsigned char)s[1] > 0x9F)) {
3924
3925 goto surrogateescape;
3926 }
3927 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3928 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003930 break;
3931
3932 case 4:
3933 if ((s[1] & 0xc0) != 0x80 ||
3934 (s[2] & 0xc0) != 0x80 ||
3935 (s[3] & 0xc0) != 0x80 ||
3936 ((unsigned char)s[0] == 0xF0 &&
3937 (unsigned char)s[1] < 0x90) ||
3938 ((unsigned char)s[0] == 0xF4 &&
3939 (unsigned char)s[1] > 0x8F)) {
3940 goto surrogateescape;
3941 }
3942 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3943 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3944 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3945
3946#if SIZEOF_WCHAR_T == 4
3947 *p++ = (wchar_t)ch;
3948#else
3949 /* compute and append the two surrogates: */
3950
3951 /* translate from 10000..10FFFF to 0..FFFF */
3952 ch -= 0x10000;
3953
3954 /* high surrogate = top 10 bits added to D800 */
3955 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3956
3957 /* low surrogate = bottom 10 bits added to DC00 */
3958 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3959#endif
3960 break;
3961 }
3962 s += n;
3963 continue;
3964
3965 surrogateescape:
3966 *p++ = 0xDC00 + ch;
3967 s++;
3968 }
3969 *p = L'\0';
3970 return unicode;
3971}
3972
3973#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975/* Primary internal function which creates utf8 encoded bytes objects.
3976
3977 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003978 and allocate exactly as much space needed at the end. Else allocate the
3979 maximum possible needed (4 result bytes per Unicode character), and return
3980 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003981*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003982PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984{
Tim Peters602f7402002-04-27 18:03:26 +00003985#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003986
Guido van Rossum98297ee2007-11-06 21:34:58 +00003987 Py_ssize_t i; /* index into s of next input byte */
3988 PyObject *result; /* result string object */
3989 char *p; /* next free byte in output buffer */
3990 Py_ssize_t nallocated; /* number of result bytes allocated */
3991 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003992 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003993 PyObject *errorHandler = NULL;
3994 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 int kind;
3996 void *data;
3997 Py_ssize_t size;
3998 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3999#if SIZEOF_WCHAR_T == 2
4000 Py_ssize_t wchar_offset = 0;
4001#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 if (!PyUnicode_Check(unicode)) {
4004 PyErr_BadArgument();
4005 return NULL;
4006 }
4007
4008 if (PyUnicode_READY(unicode) == -1)
4009 return NULL;
4010
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004011 if (PyUnicode_UTF8(unicode))
4012 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4013 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014
4015 kind = PyUnicode_KIND(unicode);
4016 data = PyUnicode_DATA(unicode);
4017 size = PyUnicode_GET_LENGTH(unicode);
4018
Tim Peters602f7402002-04-27 18:03:26 +00004019 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020
Tim Peters602f7402002-04-27 18:03:26 +00004021 if (size <= MAX_SHORT_UNICHARS) {
4022 /* Write into the stack buffer; nallocated can't overflow.
4023 * At the end, we'll allocate exactly as much heap space as it
4024 * turns out we need.
4025 */
4026 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004027 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004028 p = stackbuf;
4029 }
4030 else {
4031 /* Overallocate on the heap, and give the excess back at the end. */
4032 nallocated = size * 4;
4033 if (nallocated / 4 != size) /* overflow! */
4034 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004035 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004036 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004037 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004038 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004039 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004040
Tim Peters602f7402002-04-27 18:03:26 +00004041 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004043
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004044 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004045 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004049 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004050 *p++ = (char)(0xc0 | (ch >> 6));
4051 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004052 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 Py_ssize_t newpos;
4054 PyObject *rep;
4055 Py_ssize_t repsize, k, startpos;
4056 startpos = i-1;
4057#if SIZEOF_WCHAR_T == 2
4058 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004059#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 rep = unicode_encode_call_errorhandler(
4061 errors, &errorHandler, "utf-8", "surrogates not allowed",
4062 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4063 &exc, startpos, startpos+1, &newpos);
4064 if (!rep)
4065 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 if (PyBytes_Check(rep))
4068 repsize = PyBytes_GET_SIZE(rep);
4069 else
4070 repsize = PyUnicode_GET_SIZE(rep);
4071
4072 if (repsize > 4) {
4073 Py_ssize_t offset;
4074
4075 if (result == NULL)
4076 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004077 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4081 /* integer overflow */
4082 PyErr_NoMemory();
4083 goto error;
4084 }
4085 nallocated += repsize - 4;
4086 if (result != NULL) {
4087 if (_PyBytes_Resize(&result, nallocated) < 0)
4088 goto error;
4089 } else {
4090 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004091 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 goto error;
4093 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4094 }
4095 p = PyBytes_AS_STRING(result) + offset;
4096 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 if (PyBytes_Check(rep)) {
4099 char *prep = PyBytes_AS_STRING(rep);
4100 for(k = repsize; k > 0; k--)
4101 *p++ = *prep++;
4102 } else /* rep is unicode */ {
4103 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4104 Py_UNICODE c;
4105
4106 for(k=0; k<repsize; k++) {
4107 c = prep[k];
4108 if (0x80 <= c) {
4109 raise_encode_exception(&exc, "utf-8",
4110 PyUnicode_AS_UNICODE(unicode),
4111 size, i-1, i,
4112 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004113 goto error;
4114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004116 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004119 } else if (ch < 0x10000) {
4120 *p++ = (char)(0xe0 | (ch >> 12));
4121 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4122 *p++ = (char)(0x80 | (ch & 0x3f));
4123 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004124 /* Encode UCS4 Unicode ordinals */
4125 *p++ = (char)(0xf0 | (ch >> 18));
4126 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4127 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4128 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129#if SIZEOF_WCHAR_T == 2
4130 wchar_offset++;
4131#endif
Tim Peters602f7402002-04-27 18:03:26 +00004132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004134
Guido van Rossum98297ee2007-11-06 21:34:58 +00004135 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004136 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004137 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004138 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004139 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004140 }
4141 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004142 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004143 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004144 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004145 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004148 Py_XDECREF(errorHandler);
4149 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004150 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004151 error:
4152 Py_XDECREF(errorHandler);
4153 Py_XDECREF(exc);
4154 Py_XDECREF(result);
4155 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004156
Tim Peters602f7402002-04-27 18:03:26 +00004157#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158}
4159
Alexander Belopolsky40018472011-02-26 01:02:56 +00004160PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4162 Py_ssize_t size,
4163 const char *errors)
4164{
4165 PyObject *v, *unicode;
4166
4167 unicode = PyUnicode_FromUnicode(s, size);
4168 if (unicode == NULL)
4169 return NULL;
4170 v = _PyUnicode_AsUTF8String(unicode, errors);
4171 Py_DECREF(unicode);
4172 return v;
4173}
4174
4175PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004176PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179}
4180
Walter Dörwald41980ca2007-08-16 21:55:45 +00004181/* --- UTF-32 Codec ------------------------------------------------------- */
4182
4183PyObject *
4184PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 Py_ssize_t size,
4186 const char *errors,
4187 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004188{
4189 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4190}
4191
4192PyObject *
4193PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 Py_ssize_t size,
4195 const char *errors,
4196 int *byteorder,
4197 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004198{
4199 const char *starts = s;
4200 Py_ssize_t startinpos;
4201 Py_ssize_t endinpos;
4202 Py_ssize_t outpos;
4203 PyUnicodeObject *unicode;
4204 Py_UNICODE *p;
4205#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004206 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004207 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004208#else
4209 const int pairs = 0;
4210#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004211 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004212 int bo = 0; /* assume native ordering by default */
4213 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004214 /* Offsets from q for retrieving bytes in the right order. */
4215#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4216 int iorder[] = {0, 1, 2, 3};
4217#else
4218 int iorder[] = {3, 2, 1, 0};
4219#endif
4220 PyObject *errorHandler = NULL;
4221 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004222
Walter Dörwald41980ca2007-08-16 21:55:45 +00004223 q = (unsigned char *)s;
4224 e = q + size;
4225
4226 if (byteorder)
4227 bo = *byteorder;
4228
4229 /* Check for BOM marks (U+FEFF) in the input and adjust current
4230 byte order setting accordingly. In native mode, the leading BOM
4231 mark is skipped, in all other modes, it is copied to the output
4232 stream as-is (giving a ZWNBSP character). */
4233 if (bo == 0) {
4234 if (size >= 4) {
4235 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004237#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 if (bom == 0x0000FEFF) {
4239 q += 4;
4240 bo = -1;
4241 }
4242 else if (bom == 0xFFFE0000) {
4243 q += 4;
4244 bo = 1;
4245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 if (bom == 0x0000FEFF) {
4248 q += 4;
4249 bo = 1;
4250 }
4251 else if (bom == 0xFFFE0000) {
4252 q += 4;
4253 bo = -1;
4254 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004255#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004257 }
4258
4259 if (bo == -1) {
4260 /* force LE */
4261 iorder[0] = 0;
4262 iorder[1] = 1;
4263 iorder[2] = 2;
4264 iorder[3] = 3;
4265 }
4266 else if (bo == 1) {
4267 /* force BE */
4268 iorder[0] = 3;
4269 iorder[1] = 2;
4270 iorder[2] = 1;
4271 iorder[3] = 0;
4272 }
4273
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004274 /* On narrow builds we split characters outside the BMP into two
4275 codepoints => count how much extra space we need. */
4276#ifndef Py_UNICODE_WIDE
4277 for (qq = q; qq < e; qq += 4)
4278 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4279 pairs++;
4280#endif
4281
4282 /* This might be one to much, because of a BOM */
4283 unicode = _PyUnicode_New((size+3)/4+pairs);
4284 if (!unicode)
4285 return NULL;
4286 if (size == 0)
4287 return (PyObject *)unicode;
4288
4289 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004291
Walter Dörwald41980ca2007-08-16 21:55:45 +00004292 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 Py_UCS4 ch;
4294 /* remaining bytes at the end? (size should be divisible by 4) */
4295 if (e-q<4) {
4296 if (consumed)
4297 break;
4298 errmsg = "truncated data";
4299 startinpos = ((const char *)q)-starts;
4300 endinpos = ((const char *)e)-starts;
4301 goto utf32Error;
4302 /* The remaining input chars are ignored if the callback
4303 chooses to skip the input */
4304 }
4305 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4306 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004307
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 if (ch >= 0x110000)
4309 {
4310 errmsg = "codepoint not in range(0x110000)";
4311 startinpos = ((const char *)q)-starts;
4312 endinpos = startinpos+4;
4313 goto utf32Error;
4314 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004315#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 if (ch >= 0x10000)
4317 {
4318 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4319 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4320 }
4321 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004322#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 *p++ = ch;
4324 q += 4;
4325 continue;
4326 utf32Error:
4327 outpos = p-PyUnicode_AS_UNICODE(unicode);
4328 if (unicode_decode_call_errorhandler(
4329 errors, &errorHandler,
4330 "utf32", errmsg,
4331 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4332 &unicode, &outpos, &p))
4333 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334 }
4335
4336 if (byteorder)
4337 *byteorder = bo;
4338
4339 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004341
4342 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004344 goto onError;
4345
4346 Py_XDECREF(errorHandler);
4347 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 if (PyUnicode_READY(unicode) == -1) {
4349 Py_DECREF(unicode);
4350 return NULL;
4351 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004352 return (PyObject *)unicode;
4353
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355 Py_DECREF(unicode);
4356 Py_XDECREF(errorHandler);
4357 Py_XDECREF(exc);
4358 return NULL;
4359}
4360
4361PyObject *
4362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 Py_ssize_t size,
4364 const char *errors,
4365 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004367 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004368 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004369 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004371 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004372#else
4373 const int pairs = 0;
4374#endif
4375 /* Offsets from p for storing byte pairs in the right order. */
4376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4377 int iorder[] = {0, 1, 2, 3};
4378#else
4379 int iorder[] = {3, 2, 1, 0};
4380#endif
4381
Benjamin Peterson29060642009-01-31 22:14:21 +00004382#define STORECHAR(CH) \
4383 do { \
4384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4387 p[iorder[0]] = (CH) & 0xff; \
4388 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004389 } while(0)
4390
4391 /* In narrow builds we can output surrogate pairs as one codepoint,
4392 so we need less space. */
4393#ifndef Py_UNICODE_WIDE
4394 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4397 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004398#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004399 nsize = (size - pairs + (byteorder == 0));
4400 bytesize = nsize * 4;
4401 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004403 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004404 if (v == NULL)
4405 return NULL;
4406
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004407 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004408 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004410 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004411 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004412
4413 if (byteorder == -1) {
4414 /* force LE */
4415 iorder[0] = 0;
4416 iorder[1] = 1;
4417 iorder[2] = 2;
4418 iorder[3] = 3;
4419 }
4420 else if (byteorder == 1) {
4421 /* force BE */
4422 iorder[0] = 3;
4423 iorder[1] = 2;
4424 iorder[2] = 1;
4425 iorder[3] = 0;
4426 }
4427
4428 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004430#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4432 Py_UCS4 ch2 = *s;
4433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4435 s++;
4436 size--;
4437 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004438 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004439#endif
4440 STORECHAR(ch);
4441 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004442
4443 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004444 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004445#undef STORECHAR
4446}
4447
Alexander Belopolsky40018472011-02-26 01:02:56 +00004448PyObject *
4449PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004450{
4451 if (!PyUnicode_Check(unicode)) {
4452 PyErr_BadArgument();
4453 return NULL;
4454 }
4455 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 PyUnicode_GET_SIZE(unicode),
4457 NULL,
4458 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004459}
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461/* --- UTF-16 Codec ------------------------------------------------------- */
4462
Tim Peters772747b2001-08-09 22:21:55 +00004463PyObject *
4464PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_ssize_t size,
4466 const char *errors,
4467 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468{
Walter Dörwald69652032004-09-07 20:24:22 +00004469 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4470}
4471
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK has bit 15 set in every 16-bit unit of the long, i.e. the
   top bit of each code unit; SWAPPED_FAST_CHAR_MASK has bit 7 set in every
   16-bit unit, which is that same top bit once the two bytes of the unit
   are exchanged.  The masks are defined for 4- and 8-byte longs only; any
   other size is a compile-time error.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4488
Walter Dörwald69652032004-09-07 20:24:22 +00004489PyObject *
4490PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 Py_ssize_t size,
4492 const char *errors,
4493 int *byteorder,
4494 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004497 Py_ssize_t startinpos;
4498 Py_ssize_t endinpos;
4499 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 PyUnicodeObject *unicode;
4501 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004502 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004503 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004504 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004505 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004506 /* Offsets from q for retrieving byte pairs in the right order. */
4507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4508 int ihi = 1, ilo = 0;
4509#else
4510 int ihi = 0, ilo = 1;
4511#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 PyObject *errorHandler = NULL;
4513 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514
4515 /* Note: size will always be longer than the resulting Unicode
4516 character count */
4517 unicode = _PyUnicode_New(size);
4518 if (!unicode)
4519 return NULL;
4520 if (size == 0)
4521 return (PyObject *)unicode;
4522
4523 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004524 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004525 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004526 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527
4528 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004529 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004531 /* Check for BOM marks (U+FEFF) in the input and adjust current
4532 byte order setting accordingly. In native mode, the leading BOM
4533 mark is skipped, in all other modes, it is copied to the output
4534 stream as-is (giving a ZWNBSP character). */
4535 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004536 if (size >= 2) {
4537 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004538#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 if (bom == 0xFEFF) {
4540 q += 2;
4541 bo = -1;
4542 }
4543 else if (bom == 0xFFFE) {
4544 q += 2;
4545 bo = 1;
4546 }
Tim Petersced69f82003-09-16 20:30:58 +00004547#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 if (bom == 0xFEFF) {
4549 q += 2;
4550 bo = 1;
4551 }
4552 else if (bom == 0xFFFE) {
4553 q += 2;
4554 bo = -1;
4555 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004556#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559
Tim Peters772747b2001-08-09 22:21:55 +00004560 if (bo == -1) {
4561 /* force LE */
4562 ihi = 1;
4563 ilo = 0;
4564 }
4565 else if (bo == 1) {
4566 /* force BE */
4567 ihi = 0;
4568 ilo = 1;
4569 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4571 native_ordering = ilo < ihi;
4572#else
4573 native_ordering = ilo > ihi;
4574#endif
Tim Peters772747b2001-08-09 22:21:55 +00004575
Antoine Pitrouab868312009-01-10 15:40:25 +00004576 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004577 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004579 /* First check for possible aligned read of a C 'long'. Unaligned
4580 reads are more expensive, better to defer to another iteration. */
4581 if (!((size_t) q & LONG_PTR_MASK)) {
4582 /* Fast path for runs of non-surrogate chars. */
4583 register const unsigned char *_q = q;
4584 Py_UNICODE *_p = p;
4585 if (native_ordering) {
4586 /* Native ordering is simple: as long as the input cannot
4587 possibly contain a surrogate char, do an unrolled copy
4588 of several 16-bit code points to the target object.
4589 The non-surrogate check is done on several input bytes
4590 at a time (as many as a C 'long' can contain). */
4591 while (_q < aligned_end) {
4592 unsigned long data = * (unsigned long *) _q;
4593 if (data & FAST_CHAR_MASK)
4594 break;
4595 _p[0] = ((unsigned short *) _q)[0];
4596 _p[1] = ((unsigned short *) _q)[1];
4597#if (SIZEOF_LONG == 8)
4598 _p[2] = ((unsigned short *) _q)[2];
4599 _p[3] = ((unsigned short *) _q)[3];
4600#endif
4601 _q += SIZEOF_LONG;
4602 _p += SIZEOF_LONG / 2;
4603 }
4604 }
4605 else {
4606 /* Byteswapped ordering is similar, but we must decompose
4607 the copy bytewise, and take care of zero'ing out the
4608 upper bytes if the target object is in 32-bit units
4609 (that is, in UCS-4 builds). */
4610 while (_q < aligned_end) {
4611 unsigned long data = * (unsigned long *) _q;
4612 if (data & SWAPPED_FAST_CHAR_MASK)
4613 break;
4614 /* Zero upper bytes in UCS-4 builds */
4615#if (Py_UNICODE_SIZE > 2)
4616 _p[0] = 0;
4617 _p[1] = 0;
4618#if (SIZEOF_LONG == 8)
4619 _p[2] = 0;
4620 _p[3] = 0;
4621#endif
4622#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004623 /* Issue #4916; UCS-4 builds on big endian machines must
4624 fill the two last bytes of each 4-byte unit. */
4625#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4626# define OFF 2
4627#else
4628# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004629#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004630 ((unsigned char *) _p)[OFF + 1] = _q[0];
4631 ((unsigned char *) _p)[OFF + 0] = _q[1];
4632 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4633 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4634#if (SIZEOF_LONG == 8)
4635 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4636 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4637 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4638 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4639#endif
4640#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004641 _q += SIZEOF_LONG;
4642 _p += SIZEOF_LONG / 2;
4643 }
4644 }
4645 p = _p;
4646 q = _q;
4647 if (q >= e)
4648 break;
4649 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651
Benjamin Peterson14339b62009-01-31 16:36:08 +00004652 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004653
4654 if (ch < 0xD800 || ch > 0xDFFF) {
4655 *p++ = ch;
4656 continue;
4657 }
4658
4659 /* UTF-16 code pair: */
4660 if (q > e) {
4661 errmsg = "unexpected end of data";
4662 startinpos = (((const char *)q) - 2) - starts;
4663 endinpos = ((const char *)e) + 1 - starts;
4664 goto utf16Error;
4665 }
4666 if (0xD800 <= ch && ch <= 0xDBFF) {
4667 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4668 q += 2;
4669 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004670#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 *p++ = ch;
4672 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004673#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004675#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 continue;
4677 }
4678 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004679 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 startinpos = (((const char *)q)-4)-starts;
4681 endinpos = startinpos+2;
4682 goto utf16Error;
4683 }
4684
Benjamin Peterson14339b62009-01-31 16:36:08 +00004685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 errmsg = "illegal encoding";
4687 startinpos = (((const char *)q)-2)-starts;
4688 endinpos = startinpos+2;
4689 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004690
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 utf16Error:
4692 outpos = p - PyUnicode_AS_UNICODE(unicode);
4693 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004694 errors,
4695 &errorHandler,
4696 "utf16", errmsg,
4697 &starts,
4698 (const char **)&e,
4699 &startinpos,
4700 &endinpos,
4701 &exc,
4702 (const char **)&q,
4703 &unicode,
4704 &outpos,
4705 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004708 /* remaining byte at the end? (size should be even) */
4709 if (e == q) {
4710 if (!consumed) {
4711 errmsg = "truncated data";
4712 startinpos = ((const char *)q) - starts;
4713 endinpos = ((const char *)e) + 1 - starts;
4714 outpos = p - PyUnicode_AS_UNICODE(unicode);
4715 if (unicode_decode_call_errorhandler(
4716 errors,
4717 &errorHandler,
4718 "utf16", errmsg,
4719 &starts,
4720 (const char **)&e,
4721 &startinpos,
4722 &endinpos,
4723 &exc,
4724 (const char **)&q,
4725 &unicode,
4726 &outpos,
4727 &p))
4728 goto onError;
4729 /* The remaining input chars are ignored if the callback
4730 chooses to skip the input */
4731 }
4732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
4734 if (byteorder)
4735 *byteorder = bo;
4736
Walter Dörwald69652032004-09-07 20:24:22 +00004737 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004739
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 goto onError;
4743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 Py_XDECREF(errorHandler);
4745 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004746 if (PyUnicode_READY(unicode) == -1) {
4747 Py_DECREF(unicode);
4748 return NULL;
4749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 return (PyObject *)unicode;
4751
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 Py_XDECREF(errorHandler);
4755 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 return NULL;
4757}
4758
Antoine Pitrouab868312009-01-10 15:40:25 +00004759#undef FAST_CHAR_MASK
4760#undef SWAPPED_FAST_CHAR_MASK
4761
Tim Peters772747b2001-08-09 22:21:55 +00004762PyObject *
4763PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 Py_ssize_t size,
4765 const char *errors,
4766 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004768 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004769 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004770 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004771#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004772 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004773#else
4774 const int pairs = 0;
4775#endif
Tim Peters772747b2001-08-09 22:21:55 +00004776 /* Offsets from p for storing byte pairs in the right order. */
4777#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4778 int ihi = 1, ilo = 0;
4779#else
4780 int ihi = 0, ilo = 1;
4781#endif
4782
Benjamin Peterson29060642009-01-31 22:14:21 +00004783#define STORECHAR(CH) \
4784 do { \
4785 p[ihi] = ((CH) >> 8) & 0xff; \
4786 p[ilo] = (CH) & 0xff; \
4787 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004788 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004790#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004791 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 if (s[i] >= 0x10000)
4793 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004794#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004795 /* 2 * (size + pairs + (byteorder == 0)) */
4796 if (size > PY_SSIZE_T_MAX ||
4797 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004799 nsize = size + pairs + (byteorder == 0);
4800 bytesize = nsize * 2;
4801 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 if (v == NULL)
4805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004810 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004811 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004812
4813 if (byteorder == -1) {
4814 /* force LE */
4815 ihi = 1;
4816 ilo = 0;
4817 }
4818 else if (byteorder == 1) {
4819 /* force BE */
4820 ihi = 0;
4821 ilo = 1;
4822 }
4823
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004824 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 Py_UNICODE ch = *s++;
4826 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004827#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 if (ch >= 0x10000) {
4829 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4830 ch = 0xD800 | ((ch-0x10000) >> 10);
4831 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004832#endif
Tim Peters772747b2001-08-09 22:21:55 +00004833 STORECHAR(ch);
4834 if (ch2)
4835 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004836 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004837
4838 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004839 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004840#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841}
4842
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyObject *
4844PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
4846 if (!PyUnicode_Check(unicode)) {
4847 PyErr_BadArgument();
4848 return NULL;
4849 }
4850 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 PyUnicode_GET_SIZE(unicode),
4852 NULL,
4853 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854}
4855
4856/* --- Unicode Escape Codec ----------------------------------------------- */
4857
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.
   Returns -1 if any escapes were found which cause the string to
   pop out of ASCII range.  Otherwise returns the length of the
   required buffer to hold the string.

   -1 is also returned for a negative size, for any byte > 127, and for a
   backslash that is the last byte of the input.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming any pointer from it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* these escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4912
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.  `buf` is indexed as unsigned char unless
   `kind` is PyUnicode_WCHAR_KIND, in which case it is indexed as
   Py_UNICODE. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                 \
        if ((kind) != PyUnicode_WCHAR_KIND)                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);  \
        else                                                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);        \
    } while (0)

/* Store `value` into a Py_UNICODE buffer.  Relies on a variable named
   `kind` being in scope at the point of use (checked by the assert).  The
   expansion is wrapped in parentheses so that it stays one expression
   wherever it is used. */
#define WRITE_WSTR(buf, index, value)                                    \
    (assert(kind == PyUnicode_WCHAR_KIND),                               \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4926
4927
Fredrik Lundh06d12682001-01-24 07:59:11 +00004928static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004929
Alexander Belopolsky40018472011-02-26 01:02:56 +00004930PyObject *
4931PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004932 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004933 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004936 Py_ssize_t startinpos;
4937 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004938 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004942 char* message;
4943 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 PyObject *errorHandler = NULL;
4945 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946 Py_ssize_t ascii_length;
4947 Py_ssize_t i;
4948 int kind;
4949 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004951 ascii_length = length_of_escaped_ascii_string(s, size);
4952
4953 /* After length_of_escaped_ascii_string() there are two alternatives,
4954 either the string is pure ASCII with named escapes like \n, etc.
4955 and we determined it's exact size (common case)
4956 or it contains \x, \u, ... escape sequences. then we create a
4957 legacy wchar string and resize it at the end of this function. */
4958 if (ascii_length >= 0) {
4959 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4960 if (!v)
4961 goto onError;
4962 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4963 kind = PyUnicode_1BYTE_KIND;
4964 data = PyUnicode_DATA(v);
4965 }
4966 else {
4967 /* Escaped strings will always be longer than the resulting
4968 Unicode string, so we start with size here and then reduce the
4969 length after conversion to the true value.
4970 (but if the error callback returns a long replacement string
4971 we'll have to allocate more space) */
4972 v = _PyUnicode_New(size);
4973 if (!v)
4974 goto onError;
4975 kind = PyUnicode_WCHAR_KIND;
4976 data = PyUnicode_AS_UNICODE(v);
4977 }
4978
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 if (size == 0)
4980 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004981 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004983
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 while (s < end) {
4985 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004986 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004989 if (kind == PyUnicode_WCHAR_KIND) {
4990 assert(i < _PyUnicode_WSTR_LENGTH(v));
4991 }
4992 else {
4993 /* The only case in which i == ascii_length is a backslash
4994 followed by a newline. */
4995 assert(i <= ascii_length);
4996 }
4997
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 /* Non-escape characters are interpreted as Unicode ordinals */
4999 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005000 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 continue;
5002 }
5003
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 /* \ - Escapes */
5006 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005007 c = *s++;
5008 if (s > end)
5009 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005010
5011 if (kind == PyUnicode_WCHAR_KIND) {
5012 assert(i < _PyUnicode_WSTR_LENGTH(v));
5013 }
5014 else {
5015 /* The only case in which i == ascii_length is a backslash
5016 followed by a newline. */
5017 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5018 }
5019
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005020 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005024 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5025 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5026 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5027 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5028 /* FF */
5029 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5030 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5031 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5032 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5033 /* VT */
5034 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5035 /* BEL, not classic C */
5036 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 case '0': case '1': case '2': case '3':
5040 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005041 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005042 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005043 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005044 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005045 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 break;
5049
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 /* hex escapes */
5051 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005053 digits = 2;
5054 message = "truncated \\xXX escape";
5055 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005059 digits = 4;
5060 message = "truncated \\uXXXX escape";
5061 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005064 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005065 digits = 8;
5066 message = "truncated \\UXXXXXXXX escape";
5067 hexescape:
5068 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 if (s+digits>end) {
5071 endinpos = size;
5072 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 errors, &errorHandler,
5074 "unicodeescape", "end of string in escape sequence",
5075 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005076 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005078 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005079 goto nextByte;
5080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005081 for (j = 0; j < digits; ++j) {
5082 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005083 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005084 endinpos = (s+j+1)-starts;
5085 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 errors, &errorHandler,
5088 "unicodeescape", message,
5089 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005090 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005091 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005094 }
5095 chr = (chr<<4) & ~0xF;
5096 if (c >= '0' && c <= '9')
5097 chr += c - '0';
5098 else if (c >= 'a' && c <= 'f')
5099 chr += 10 + c - 'a';
5100 else
5101 chr += 10 + c - 'A';
5102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005104 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 /* _decoding_error will have already written into the
5106 target buffer. */
5107 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005108 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005109 /* when we get here, chr is a 32-bit unicode character */
5110 if (chr <= 0xffff)
5111 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005113 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005114 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005115 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005116#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005118#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005119 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5121 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005122#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005123 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005125 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 errors, &errorHandler,
5128 "unicodeescape", "illegal Unicode character",
5129 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005131 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005132 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005133 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005134 break;
5135
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005137 case 'N':
5138 message = "malformed \\N character escape";
5139 if (ucnhash_CAPI == NULL) {
5140 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5142 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005143 if (ucnhash_CAPI == NULL)
5144 goto ucnhashError;
5145 }
5146 if (*s == '{') {
5147 const char *start = s+1;
5148 /* look for the closing brace */
5149 while (*s != '}' && s < end)
5150 s++;
5151 if (s > start && s < end && *s == '}') {
5152 /* found a name. look it up in the unicode database */
5153 message = "unknown Unicode character name";
5154 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5156 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005157 goto store;
5158 }
5159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 errors, &errorHandler,
5164 "unicodeescape", message,
5165 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005167 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005169 break;
5170
5171 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005172 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 message = "\\ at end of string";
5175 s--;
5176 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 errors, &errorHandler,
5180 "unicodeescape", message,
5181 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005182 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005183 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005184 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005185 }
5186 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005187 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5188 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005189 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005190 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005195 /* Ensure the length prediction worked in case of ASCII strings */
5196 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5197
5198 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5199 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005201 Py_XDECREF(errorHandler);
5202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005206 PyErr_SetString(
5207 PyExc_UnicodeError,
5208 "\\N escapes not supported (can't load unicodedata module)"
5209 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005210 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 Py_XDECREF(errorHandler);
5212 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005213 return NULL;
5214
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 Py_XDECREF(errorHandler);
5218 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 return NULL;
5220}
5221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005222#undef WRITE_ASCII_OR_WSTR
5223#undef WRITE_WSTR
5224
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225/* Return a Unicode-Escape string version of the Unicode object.
5226
5227 If quotes is true, the string is enclosed in u"" or u'' quotes as
5228 appropriate.
5229
5230*/
5231
Walter Dörwald79e913e2007-05-12 11:08:06 +00005232static const char *hexdigits = "0123456789abcdef";
5233
Alexander Belopolsky40018472011-02-26 01:02:56 +00005234PyObject *
5235PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005236 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005238 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005241#ifdef Py_UNICODE_WIDE
5242 const Py_ssize_t expandsize = 10;
5243#else
5244 const Py_ssize_t expandsize = 6;
5245#endif
5246
Thomas Wouters89f507f2006-12-13 04:49:30 +00005247 /* XXX(nnorwitz): rather than over-allocating, it would be
5248 better to choose a different scheme. Perhaps scan the
5249 first N-chars of the string and allocate based on that size.
5250 */
5251 /* Initial allocation is based on the longest-possible unichr
5252 escape.
5253
5254 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5255 unichr, so in this case it's the longest unichr escape. In
5256 narrow (UTF-16) builds this is five chars per source unichr
5257 since there are two unichrs in the surrogate pair, so in narrow
5258 (UTF-16) builds it's not the longest unichr escape.
5259
5260 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5261 so in the narrow (UTF-16) build case it's the longest unichr
5262 escape.
5263 */
5264
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 if (size == 0)
5266 return PyBytes_FromStringAndSize(NULL, 0);
5267
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005268 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005270
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005271 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 2
5273 + expandsize*size
5274 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 if (repr == NULL)
5276 return NULL;
5277
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005278 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 while (size-- > 0) {
5281 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005282
Walter Dörwald79e913e2007-05-12 11:08:06 +00005283 /* Escape backslashes */
5284 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 *p++ = '\\';
5286 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005287 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005288 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005289
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005291 /* Map 21-bit characters to '\U00xxxxxx' */
5292 else if (ch >= 0x10000) {
5293 *p++ = '\\';
5294 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005295 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5296 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5297 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5298 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5299 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5300 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5301 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5302 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005304 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005305#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5307 else if (ch >= 0xD800 && ch < 0xDC00) {
5308 Py_UNICODE ch2;
5309 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005310
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 ch2 = *s++;
5312 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005313 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5315 *p++ = '\\';
5316 *p++ = 'U';
5317 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5318 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5319 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5320 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5321 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5322 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5323 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5324 *p++ = hexdigits[ucs & 0x0000000F];
5325 continue;
5326 }
5327 /* Fall through: isolated surrogates are copied as-is */
5328 s--;
5329 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005330 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005331#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005334 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 *p++ = '\\';
5336 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005337 *p++ = hexdigits[(ch >> 12) & 0x000F];
5338 *p++ = hexdigits[(ch >> 8) & 0x000F];
5339 *p++ = hexdigits[(ch >> 4) & 0x000F];
5340 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005342
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005343 /* Map special whitespace to '\t', \n', '\r' */
5344 else if (ch == '\t') {
5345 *p++ = '\\';
5346 *p++ = 't';
5347 }
5348 else if (ch == '\n') {
5349 *p++ = '\\';
5350 *p++ = 'n';
5351 }
5352 else if (ch == '\r') {
5353 *p++ = '\\';
5354 *p++ = 'r';
5355 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005356
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005357 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005358 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005360 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005361 *p++ = hexdigits[(ch >> 4) & 0x000F];
5362 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005363 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005364
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 /* Copy everything else as-is */
5366 else
5367 *p++ = (char) ch;
5368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005370 assert(p - PyBytes_AS_STRING(repr) > 0);
5371 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5372 return NULL;
5373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374}
5375
Alexander Belopolsky40018472011-02-26 01:02:56 +00005376PyObject *
5377PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005379 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 if (!PyUnicode_Check(unicode)) {
5381 PyErr_BadArgument();
5382 return NULL;
5383 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005384 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5385 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005386 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387}
5388
5389/* --- Raw Unicode Escape Codec ------------------------------------------- */
5390
Alexander Belopolsky40018472011-02-26 01:02:56 +00005391PyObject *
5392PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005393 Py_ssize_t size,
5394 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005397 Py_ssize_t startinpos;
5398 Py_ssize_t endinpos;
5399 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 const char *end;
5403 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005404 PyObject *errorHandler = NULL;
5405 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 /* Escaped strings will always be longer than the resulting
5408 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 length after conversion to the true value. (But decoding error
5410 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 v = _PyUnicode_New(size);
5412 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 end = s + size;
5418 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 unsigned char c;
5420 Py_UCS4 x;
5421 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005422 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 /* Non-escape characters are interpreted as Unicode ordinals */
5425 if (*s != '\\') {
5426 *p++ = (unsigned char)*s++;
5427 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 startinpos = s-starts;
5430
5431 /* \u-escapes are only interpreted iff the number of leading
5432 backslashes if odd */
5433 bs = s;
5434 for (;s < end;) {
5435 if (*s != '\\')
5436 break;
5437 *p++ = (unsigned char)*s++;
5438 }
5439 if (((s - bs) & 1) == 0 ||
5440 s >= end ||
5441 (*s != 'u' && *s != 'U')) {
5442 continue;
5443 }
5444 p--;
5445 count = *s=='u' ? 4 : 8;
5446 s++;
5447
5448 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5449 outpos = p-PyUnicode_AS_UNICODE(v);
5450 for (x = 0, i = 0; i < count; ++i, ++s) {
5451 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005452 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 endinpos = s-starts;
5454 if (unicode_decode_call_errorhandler(
5455 errors, &errorHandler,
5456 "rawunicodeescape", "truncated \\uXXXX",
5457 &starts, &end, &startinpos, &endinpos, &exc, &s,
5458 &v, &outpos, &p))
5459 goto onError;
5460 goto nextByte;
5461 }
5462 x = (x<<4) & ~0xF;
5463 if (c >= '0' && c <= '9')
5464 x += c - '0';
5465 else if (c >= 'a' && c <= 'f')
5466 x += 10 + c - 'a';
5467 else
5468 x += 10 + c - 'A';
5469 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005470 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 /* UCS-2 character */
5472 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005473 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 /* UCS-4 character. Either store directly, or as
5475 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005476#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005478#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 x -= 0x10000L;
5480 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5481 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005482#endif
5483 } else {
5484 endinpos = s-starts;
5485 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005486 if (unicode_decode_call_errorhandler(
5487 errors, &errorHandler,
5488 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 &starts, &end, &startinpos, &endinpos, &exc, &s,
5490 &v, &outpos, &p))
5491 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005492 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 nextByte:
5494 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005496 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 Py_XDECREF(errorHandler);
5499 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005500 if (PyUnicode_READY(v) == -1) {
5501 Py_DECREF(v);
5502 return NULL;
5503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005505
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 Py_XDECREF(errorHandler);
5509 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 return NULL;
5511}
5512
Alexander Belopolsky40018472011-02-26 01:02:56 +00005513PyObject *
5514PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005515 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005517 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 char *p;
5519 char *q;
5520
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005521#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005522 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005523#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005524 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005525#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005526
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005527 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005529
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005530 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 if (repr == NULL)
5532 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005533 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005536 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 while (size-- > 0) {
5538 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005539#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 /* Map 32-bit characters to '\Uxxxxxxxx' */
5541 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005542 *p++ = '\\';
5543 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005544 *p++ = hexdigits[(ch >> 28) & 0xf];
5545 *p++ = hexdigits[(ch >> 24) & 0xf];
5546 *p++ = hexdigits[(ch >> 20) & 0xf];
5547 *p++ = hexdigits[(ch >> 16) & 0xf];
5548 *p++ = hexdigits[(ch >> 12) & 0xf];
5549 *p++ = hexdigits[(ch >> 8) & 0xf];
5550 *p++ = hexdigits[(ch >> 4) & 0xf];
5551 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005552 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005553 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005554#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5556 if (ch >= 0xD800 && ch < 0xDC00) {
5557 Py_UNICODE ch2;
5558 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005559
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 ch2 = *s++;
5561 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005562 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5564 *p++ = '\\';
5565 *p++ = 'U';
5566 *p++ = hexdigits[(ucs >> 28) & 0xf];
5567 *p++ = hexdigits[(ucs >> 24) & 0xf];
5568 *p++ = hexdigits[(ucs >> 20) & 0xf];
5569 *p++ = hexdigits[(ucs >> 16) & 0xf];
5570 *p++ = hexdigits[(ucs >> 12) & 0xf];
5571 *p++ = hexdigits[(ucs >> 8) & 0xf];
5572 *p++ = hexdigits[(ucs >> 4) & 0xf];
5573 *p++ = hexdigits[ucs & 0xf];
5574 continue;
5575 }
5576 /* Fall through: isolated surrogates are copied as-is */
5577 s--;
5578 size++;
5579 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005580#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 /* Map 16-bit characters to '\uxxxx' */
5582 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 *p++ = '\\';
5584 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005585 *p++ = hexdigits[(ch >> 12) & 0xf];
5586 *p++ = hexdigits[(ch >> 8) & 0xf];
5587 *p++ = hexdigits[(ch >> 4) & 0xf];
5588 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 /* Copy everything else as-is */
5591 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 *p++ = (char) ch;
5593 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005594 size = p - q;
5595
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005596 assert(size > 0);
5597 if (_PyBytes_Resize(&repr, size) < 0)
5598 return NULL;
5599 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600}
5601
Alexander Belopolsky40018472011-02-26 01:02:56 +00005602PyObject *
5603PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005605 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005607 PyErr_BadArgument();
5608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005610 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5611 PyUnicode_GET_SIZE(unicode));
5612
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005613 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614}
5615
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005616/* --- Unicode Internal Codec ------------------------------------------- */
5617
Alexander Belopolsky40018472011-02-26 01:02:56 +00005618PyObject *
5619_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005620 Py_ssize_t size,
5621 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005622{
5623 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005624 Py_ssize_t startinpos;
5625 Py_ssize_t endinpos;
5626 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005627 PyUnicodeObject *v;
5628 Py_UNICODE *p;
5629 const char *end;
5630 const char *reason;
5631 PyObject *errorHandler = NULL;
5632 PyObject *exc = NULL;
5633
Neal Norwitzd43069c2006-01-08 01:12:10 +00005634#ifdef Py_UNICODE_WIDE
5635 Py_UNICODE unimax = PyUnicode_GetMax();
5636#endif
5637
Thomas Wouters89f507f2006-12-13 04:49:30 +00005638 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005639 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5640 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5643 as string was created with the old API. */
5644 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005646 p = PyUnicode_AS_UNICODE(v);
5647 end = s + size;
5648
5649 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005650 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005651 /* We have to sanity check the raw data, otherwise doom looms for
5652 some malformed UCS-4 data. */
5653 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005654#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005655 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005656#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005657 end-s < Py_UNICODE_SIZE
5658 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005660 startinpos = s - starts;
5661 if (end-s < Py_UNICODE_SIZE) {
5662 endinpos = end-starts;
5663 reason = "truncated input";
5664 }
5665 else {
5666 endinpos = s - starts + Py_UNICODE_SIZE;
5667 reason = "illegal code point (> 0x10FFFF)";
5668 }
5669 outpos = p - PyUnicode_AS_UNICODE(v);
5670 if (unicode_decode_call_errorhandler(
5671 errors, &errorHandler,
5672 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005673 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005674 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005675 goto onError;
5676 }
5677 }
5678 else {
5679 p++;
5680 s += Py_UNICODE_SIZE;
5681 }
5682 }
5683
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005684 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005685 goto onError;
5686 Py_XDECREF(errorHandler);
5687 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 if (PyUnicode_READY(v) == -1) {
5689 Py_DECREF(v);
5690 return NULL;
5691 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005692 return (PyObject *)v;
5693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005695 Py_XDECREF(v);
5696 Py_XDECREF(errorHandler);
5697 Py_XDECREF(exc);
5698 return NULL;
5699}
5700
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701/* --- Latin-1 Codec ------------------------------------------------------ */
5702
Alexander Belopolsky40018472011-02-26 01:02:56 +00005703PyObject *
5704PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005705 Py_ssize_t size,
5706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005709 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710}
5711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005713static void
5714make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005715 const char *encoding,
5716 const Py_UNICODE *unicode, Py_ssize_t size,
5717 Py_ssize_t startpos, Py_ssize_t endpos,
5718 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 *exceptionObject = PyUnicodeEncodeError_Create(
5722 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 }
5724 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5726 goto onError;
5727 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5728 goto onError;
5729 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5730 goto onError;
5731 return;
5732 onError:
5733 Py_DECREF(*exceptionObject);
5734 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 }
5736}
5737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739static void
5740raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005741 const char *encoding,
5742 const Py_UNICODE *unicode, Py_ssize_t size,
5743 Py_ssize_t startpos, Py_ssize_t endpos,
5744 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745{
5746 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750}
5751
5752/* error handling callback helper:
5753 build arguments, call the callback and check the arguments,
5754 put the result into newpos and return the replacement string, which
5755 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005756static PyObject *
5757unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005758 PyObject **errorHandler,
5759 const char *encoding, const char *reason,
5760 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5761 Py_ssize_t startpos, Py_ssize_t endpos,
5762 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005764 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765
5766 PyObject *restuple;
5767 PyObject *resunicode;
5768
5769 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 }
5774
5775 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779
5780 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005785 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 Py_DECREF(restuple);
5787 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005789 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 &resunicode, newpos)) {
5791 Py_DECREF(restuple);
5792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005794 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5795 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5796 Py_DECREF(restuple);
5797 return NULL;
5798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005801 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5803 Py_DECREF(restuple);
5804 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 Py_INCREF(resunicode);
5807 Py_DECREF(restuple);
5808 return resunicode;
5809}
5810
Alexander Belopolsky40018472011-02-26 01:02:56 +00005811static PyObject *
5812unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005813 Py_ssize_t size,
5814 const char *errors,
5815 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816{
5817 /* output object */
5818 PyObject *res;
5819 /* pointers to the beginning and end+1 of input */
5820 const Py_UNICODE *startp = p;
5821 const Py_UNICODE *endp = p + size;
5822 /* pointer to the beginning of the unencodable characters */
5823 /* const Py_UNICODE *badp = NULL; */
5824 /* pointer into the output */
5825 char *str;
5826 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005827 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005828 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5829 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 PyObject *errorHandler = NULL;
5831 PyObject *exc = NULL;
5832 /* the following variable is used for caching string comparisons
5833 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5834 int known_errorHandler = -1;
5835
5836 /* allocate enough for a simple encoding without
5837 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005838 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005839 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005840 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005842 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005843 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 ressize = size;
5845
5846 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 /* can we encode this? */
5850 if (c<limit) {
5851 /* no overflow check, because we know that the space is enough */
5852 *str++ = (char)c;
5853 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005854 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 else {
5856 Py_ssize_t unicodepos = p-startp;
5857 Py_ssize_t requiredsize;
5858 PyObject *repunicode;
5859 Py_ssize_t repsize;
5860 Py_ssize_t newpos;
5861 Py_ssize_t respos;
5862 Py_UNICODE *uni2;
5863 /* startpos for collecting unencodable chars */
5864 const Py_UNICODE *collstart = p;
5865 const Py_UNICODE *collend = p;
5866 /* find all unecodable characters */
5867 while ((collend < endp) && ((*collend)>=limit))
5868 ++collend;
5869 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5870 if (known_errorHandler==-1) {
5871 if ((errors==NULL) || (!strcmp(errors, "strict")))
5872 known_errorHandler = 1;
5873 else if (!strcmp(errors, "replace"))
5874 known_errorHandler = 2;
5875 else if (!strcmp(errors, "ignore"))
5876 known_errorHandler = 3;
5877 else if (!strcmp(errors, "xmlcharrefreplace"))
5878 known_errorHandler = 4;
5879 else
5880 known_errorHandler = 0;
5881 }
5882 switch (known_errorHandler) {
5883 case 1: /* strict */
5884 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5885 goto onError;
5886 case 2: /* replace */
5887 while (collstart++<collend)
5888 *str++ = '?'; /* fall through */
5889 case 3: /* ignore */
5890 p = collend;
5891 break;
5892 case 4: /* xmlcharrefreplace */
5893 respos = str - PyBytes_AS_STRING(res);
5894 /* determine replacement size (temporarily (mis)uses p) */
5895 for (p = collstart, repsize = 0; p < collend; ++p) {
5896 if (*p<10)
5897 repsize += 2+1+1;
5898 else if (*p<100)
5899 repsize += 2+2+1;
5900 else if (*p<1000)
5901 repsize += 2+3+1;
5902 else if (*p<10000)
5903 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005904#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 else
5906 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005907#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 else if (*p<100000)
5909 repsize += 2+5+1;
5910 else if (*p<1000000)
5911 repsize += 2+6+1;
5912 else
5913 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005914#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 }
5916 requiredsize = respos+repsize+(endp-collend);
5917 if (requiredsize > ressize) {
5918 if (requiredsize<2*ressize)
5919 requiredsize = 2*ressize;
5920 if (_PyBytes_Resize(&res, requiredsize))
5921 goto onError;
5922 str = PyBytes_AS_STRING(res) + respos;
5923 ressize = requiredsize;
5924 }
5925 /* generate replacement (temporarily (mis)uses p) */
5926 for (p = collstart; p < collend; ++p) {
5927 str += sprintf(str, "&#%d;", (int)*p);
5928 }
5929 p = collend;
5930 break;
5931 default:
5932 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5933 encoding, reason, startp, size, &exc,
5934 collstart-startp, collend-startp, &newpos);
5935 if (repunicode == NULL)
5936 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005937 if (PyBytes_Check(repunicode)) {
5938 /* Directly copy bytes result to output. */
5939 repsize = PyBytes_Size(repunicode);
5940 if (repsize > 1) {
5941 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005942 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005943 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5944 Py_DECREF(repunicode);
5945 goto onError;
5946 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005947 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005948 ressize += repsize-1;
5949 }
5950 memcpy(str, PyBytes_AsString(repunicode), repsize);
5951 str += repsize;
5952 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005953 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005954 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005955 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 /* need more space? (at least enough for what we
5957 have+the replacement+the rest of the string, so
5958 we won't have to check space for encodable characters) */
5959 respos = str - PyBytes_AS_STRING(res);
5960 repsize = PyUnicode_GET_SIZE(repunicode);
5961 requiredsize = respos+repsize+(endp-collend);
5962 if (requiredsize > ressize) {
5963 if (requiredsize<2*ressize)
5964 requiredsize = 2*ressize;
5965 if (_PyBytes_Resize(&res, requiredsize)) {
5966 Py_DECREF(repunicode);
5967 goto onError;
5968 }
5969 str = PyBytes_AS_STRING(res) + respos;
5970 ressize = requiredsize;
5971 }
5972 /* check if there is anything unencodable in the replacement
5973 and copy it to the output */
5974 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5975 c = *uni2;
5976 if (c >= limit) {
5977 raise_encode_exception(&exc, encoding, startp, size,
5978 unicodepos, unicodepos+1, reason);
5979 Py_DECREF(repunicode);
5980 goto onError;
5981 }
5982 *str = (char)c;
5983 }
5984 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005985 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005987 }
5988 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 /* Resize if we allocated too much */
5990 size = str - PyBytes_AS_STRING(res);
5991 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005992 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005993 if (_PyBytes_Resize(&res, size) < 0)
5994 goto onError;
5995 }
5996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 Py_XDECREF(errorHandler);
5998 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005999 return res;
6000
6001 onError:
6002 Py_XDECREF(res);
6003 Py_XDECREF(errorHandler);
6004 Py_XDECREF(exc);
6005 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006}
6007
Alexander Belopolsky40018472011-02-26 01:02:56 +00006008PyObject *
6009PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006010 Py_ssize_t size,
6011 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006013 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014}
6015
Alexander Belopolsky40018472011-02-26 01:02:56 +00006016PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006017_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018{
6019 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 PyErr_BadArgument();
6021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023 if (PyUnicode_READY(unicode) == -1)
6024 return NULL;
6025 /* Fast path: if it is a one-byte string, construct
6026 bytes object directly. */
6027 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6028 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6029 PyUnicode_GET_LENGTH(unicode));
6030 /* Non-Latin-1 characters present. Defer to above function to
6031 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006034 errors);
6035}
6036
6037PyObject*
6038PyUnicode_AsLatin1String(PyObject *unicode)
6039{
6040 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- 7-bit ASCII Codec -------------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeASCII(const char *s,
6047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 PyUnicodeObject *v;
6052 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006053 Py_ssize_t startinpos;
6054 Py_ssize_t endinpos;
6055 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006057 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 PyObject *errorHandler = NULL;
6059 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006063 if (size == 1 && *(unsigned char*)s < 128)
6064 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6065
6066 /* Fast path. Assume the input actually *is* ASCII, and allocate
6067 a single-block Unicode object with that assumption. If there is
6068 an error, drop the object and start over. */
6069 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6070 if (v == NULL)
6071 goto onError;
6072 d = PyUnicode_1BYTE_DATA(v);
6073 for (i = 0; i < size; i++) {
6074 unsigned char ch = ((unsigned char*)s)[i];
6075 if (ch < 128)
6076 d[i] = ch;
6077 else
6078 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006080 if (i == size)
6081 return (PyObject*)v;
6082 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 v = _PyUnicode_New(size);
6085 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 e = s + size;
6091 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 register unsigned char c = (unsigned char)*s;
6093 if (c < 128) {
6094 *p++ = c;
6095 ++s;
6096 }
6097 else {
6098 startinpos = s-starts;
6099 endinpos = startinpos + 1;
6100 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6101 if (unicode_decode_call_errorhandler(
6102 errors, &errorHandler,
6103 "ascii", "ordinal not in range(128)",
6104 &starts, &e, &startinpos, &endinpos, &exc, &s,
6105 &v, &outpos, &p))
6106 goto onError;
6107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006109 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6111 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114 if (PyUnicode_READY(v) == -1) {
6115 Py_DECREF(v);
6116 return NULL;
6117 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 return NULL;
6125}
6126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
6128PyUnicode_EncodeASCII(const Py_UNICODE *p,
6129 Py_ssize_t size,
6130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133}
6134
Alexander Belopolsky40018472011-02-26 01:02:56 +00006135PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006136_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
6138 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 PyErr_BadArgument();
6140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006142 if (PyUnicode_READY(unicode) == -1)
6143 return NULL;
6144 /* Fast path: if it is an ASCII-only string, construct bytes object
6145 directly. Else defer to above function to raise the exception. */
6146 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6147 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6148 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006151 errors);
6152}
6153
6154PyObject *
6155PyUnicode_AsASCIIString(PyObject *unicode)
6156{
6157 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158}
6159
Victor Stinner99b95382011-07-04 14:23:54 +02006160#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006161
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006162/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006163
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006164#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006165#define NEED_RETRY
6166#endif
6167
6168/* XXX This code is limited to "true" double-byte encodings, as
6169 a) it assumes an incomplete character consists of a single byte, and
6170 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006172
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so the position
   reported by CharPrev() is used to look at the preceding character as
   well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr)) {
        return 0;
    }

    prev = CharPrev(s, curr);
    if (prev == curr) {
        /* CharPrev() did not move back from curr. */
        return 1;
    }
    if (!IsDBCSLeadByte(*prev)) {
        return 1;
    }
    return curr - prev == 2;
}
6184
6185/*
6186 * Decode MBCS string into unicode object. If 'final' is set, converts
6187 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6188 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006189static int
6190decode_mbcs(PyUnicodeObject **v,
6191 const char *s, /* MBCS string */
6192 int size, /* sizeof MBCS string */
6193 int final,
6194 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006195{
6196 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006197 Py_ssize_t n;
6198 DWORD usize;
6199 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006200
6201 assert(size >= 0);
6202
Victor Stinner554f3f02010-06-16 23:33:54 +00006203 /* check and handle 'errors' arg */
6204 if (errors==NULL || strcmp(errors, "strict")==0)
6205 flags = MB_ERR_INVALID_CHARS;
6206 else if (strcmp(errors, "ignore")==0)
6207 flags = 0;
6208 else {
6209 PyErr_Format(PyExc_ValueError,
6210 "mbcs encoding does not support errors='%s'",
6211 errors);
6212 return -1;
6213 }
6214
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006215 /* Skip trailing lead-byte unless 'final' is set */
6216 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006218
6219 /* First get the size of the result */
6220 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006221 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6222 if (usize==0)
6223 goto mbcs_decode_error;
6224 } else
6225 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006226
6227 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 /* Create unicode object */
6229 *v = _PyUnicode_New(usize);
6230 if (*v == NULL)
6231 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006232 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006233 }
6234 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 /* Extend unicode object */
6236 n = PyUnicode_GET_SIZE(*v);
6237 if (_PyUnicode_Resize(v, n + usize) < 0)
6238 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006239 }
6240
6241 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006242 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006244 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6245 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006247 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006248 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006249
6250mbcs_decode_error:
6251 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6252 we raise a UnicodeDecodeError - else it is a 'generic'
6253 windows error
6254 */
6255 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6256 /* Ideally, we should get reason from FormatMessage - this
6257 is the Windows 2000 English version of the message
6258 */
6259 PyObject *exc = NULL;
6260 const char *reason = "No mapping for the Unicode character exists "
6261 "in the target multi-byte code page.";
6262 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6263 if (exc != NULL) {
6264 PyCodec_StrictErrors(exc);
6265 Py_DECREF(exc);
6266 }
6267 } else {
6268 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6269 }
6270 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006271}
6272
Alexander Belopolsky40018472011-02-26 01:02:56 +00006273PyObject *
6274PyUnicode_DecodeMBCSStateful(const char *s,
6275 Py_ssize_t size,
6276 const char *errors,
6277 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006278{
6279 PyUnicodeObject *v = NULL;
6280 int done;
6281
6282 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006284
6285#ifdef NEED_RETRY
6286 retry:
6287 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006288 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006289 else
6290#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006291 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006292
6293 if (done < 0) {
6294 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006296 }
6297
6298 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006300
6301#ifdef NEED_RETRY
6302 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 s += done;
6304 size -= done;
6305 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006306 }
6307#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006308 if (PyUnicode_READY(v) == -1) {
6309 Py_DECREF(v);
6310 return NULL;
6311 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006312 return (PyObject *)v;
6313}
6314
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315PyObject *
6316PyUnicode_DecodeMBCS(const char *s,
6317 Py_ssize_t size,
6318 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006319{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006320 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6321}
6322
6323/*
6324 * Convert unicode into string object (MBCS).
6325 * Returns 0 if succeed, -1 otherwise.
6326 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327static int
6328encode_mbcs(PyObject **repr,
6329 const Py_UNICODE *p, /* unicode */
6330 int size, /* size of unicode */
6331 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006332{
Victor Stinner554f3f02010-06-16 23:33:54 +00006333 BOOL usedDefaultChar = FALSE;
6334 BOOL *pusedDefaultChar;
6335 int mbcssize;
6336 Py_ssize_t n;
6337 PyObject *exc = NULL;
6338 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006339
6340 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006341
Victor Stinner554f3f02010-06-16 23:33:54 +00006342 /* check and handle 'errors' arg */
6343 if (errors==NULL || strcmp(errors, "strict")==0) {
6344 flags = WC_NO_BEST_FIT_CHARS;
6345 pusedDefaultChar = &usedDefaultChar;
6346 } else if (strcmp(errors, "replace")==0) {
6347 flags = 0;
6348 pusedDefaultChar = NULL;
6349 } else {
6350 PyErr_Format(PyExc_ValueError,
6351 "mbcs encoding does not support errors='%s'",
6352 errors);
6353 return -1;
6354 }
6355
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006356 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006357 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006358 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6359 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (mbcssize == 0) {
6361 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6362 return -1;
6363 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006364 /* If we used a default char, then we failed! */
6365 if (pusedDefaultChar && *pusedDefaultChar)
6366 goto mbcs_encode_error;
6367 } else {
6368 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006369 }
6370
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006371 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 /* Create string object */
6373 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6374 if (*repr == NULL)
6375 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006376 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006377 }
6378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 /* Extend string object */
6380 n = PyBytes_Size(*repr);
6381 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6382 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006383 }
6384
6385 /* Do the conversion */
6386 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006388 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6389 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6391 return -1;
6392 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006393 if (pusedDefaultChar && *pusedDefaultChar)
6394 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006395 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006396 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006397
6398mbcs_encode_error:
6399 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6400 Py_XDECREF(exc);
6401 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006402}
6403
Alexander Belopolsky40018472011-02-26 01:02:56 +00006404PyObject *
6405PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6406 Py_ssize_t size,
6407 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006408{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006409 PyObject *repr = NULL;
6410 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006411
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006412#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006414 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006415 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006416 else
6417#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006418 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006419
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006420 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 Py_XDECREF(repr);
6422 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006423 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006424
6425#ifdef NEED_RETRY
6426 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 p += INT_MAX;
6428 size -= INT_MAX;
6429 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006430 }
6431#endif
6432
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006433 return repr;
6434}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006435
Alexander Belopolsky40018472011-02-26 01:02:56 +00006436PyObject *
6437PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006438{
6439 if (!PyUnicode_Check(unicode)) {
6440 PyErr_BadArgument();
6441 return NULL;
6442 }
6443 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 PyUnicode_GET_SIZE(unicode),
6445 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006446}
6447
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006448#undef NEED_RETRY
6449
Victor Stinner99b95382011-07-04 14:23:54 +02006450#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006451
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452/* --- Character Mapping Codec -------------------------------------------- */
6453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
6455PyUnicode_DecodeCharmap(const char *s,
6456 Py_ssize_t size,
6457 PyObject *mapping,
6458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006461 Py_ssize_t startinpos;
6462 Py_ssize_t endinpos;
6463 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 PyUnicodeObject *v;
6466 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006467 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 PyObject *errorHandler = NULL;
6469 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006470 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 /* Default to Latin-1 */
6474 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476
6477 v = _PyUnicode_New(size);
6478 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006484 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 mapstring = PyUnicode_AS_UNICODE(mapping);
6486 maplen = PyUnicode_GET_SIZE(mapping);
6487 while (s < e) {
6488 unsigned char ch = *s;
6489 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 if (ch < maplen)
6492 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 if (x == 0xfffe) {
6495 /* undefined mapping */
6496 outpos = p-PyUnicode_AS_UNICODE(v);
6497 startinpos = s-starts;
6498 endinpos = startinpos+1;
6499 if (unicode_decode_call_errorhandler(
6500 errors, &errorHandler,
6501 "charmap", "character maps to <undefined>",
6502 &starts, &e, &startinpos, &endinpos, &exc, &s,
6503 &v, &outpos, &p)) {
6504 goto onError;
6505 }
6506 continue;
6507 }
6508 *p++ = x;
6509 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006510 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006511 }
6512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 while (s < e) {
6514 unsigned char ch = *s;
6515 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006516
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6518 w = PyLong_FromLong((long)ch);
6519 if (w == NULL)
6520 goto onError;
6521 x = PyObject_GetItem(mapping, w);
6522 Py_DECREF(w);
6523 if (x == NULL) {
6524 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6525 /* No mapping found means: mapping is undefined. */
6526 PyErr_Clear();
6527 x = Py_None;
6528 Py_INCREF(x);
6529 } else
6530 goto onError;
6531 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006532
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 /* Apply mapping */
6534 if (PyLong_Check(x)) {
6535 long value = PyLong_AS_LONG(x);
6536 if (value < 0 || value > 65535) {
6537 PyErr_SetString(PyExc_TypeError,
6538 "character mapping must be in range(65536)");
6539 Py_DECREF(x);
6540 goto onError;
6541 }
6542 *p++ = (Py_UNICODE)value;
6543 }
6544 else if (x == Py_None) {
6545 /* undefined mapping */
6546 outpos = p-PyUnicode_AS_UNICODE(v);
6547 startinpos = s-starts;
6548 endinpos = startinpos+1;
6549 if (unicode_decode_call_errorhandler(
6550 errors, &errorHandler,
6551 "charmap", "character maps to <undefined>",
6552 &starts, &e, &startinpos, &endinpos, &exc, &s,
6553 &v, &outpos, &p)) {
6554 Py_DECREF(x);
6555 goto onError;
6556 }
6557 Py_DECREF(x);
6558 continue;
6559 }
6560 else if (PyUnicode_Check(x)) {
6561 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006562
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 if (targetsize == 1)
6564 /* 1-1 mapping */
6565 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 else if (targetsize > 1) {
6568 /* 1-n mapping */
6569 if (targetsize > extrachars) {
6570 /* resize first */
6571 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6572 Py_ssize_t needed = (targetsize - extrachars) + \
6573 (targetsize << 2);
6574 extrachars += needed;
6575 /* XXX overflow detection missing */
6576 if (_PyUnicode_Resize(&v,
6577 PyUnicode_GET_SIZE(v) + needed) < 0) {
6578 Py_DECREF(x);
6579 goto onError;
6580 }
6581 p = PyUnicode_AS_UNICODE(v) + oldpos;
6582 }
6583 Py_UNICODE_COPY(p,
6584 PyUnicode_AS_UNICODE(x),
6585 targetsize);
6586 p += targetsize;
6587 extrachars -= targetsize;
6588 }
6589 /* 1-0 mapping: skip the character */
6590 }
6591 else {
6592 /* wrong return value */
6593 PyErr_SetString(PyExc_TypeError,
6594 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006595 Py_DECREF(x);
6596 goto onError;
6597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 Py_DECREF(x);
6599 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 }
6602 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6604 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006605 Py_XDECREF(errorHandler);
6606 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006607 if (PyUnicode_READY(v) == -1) {
6608 Py_DECREF(v);
6609 return NULL;
6610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006612
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 Py_XDECREF(errorHandler);
6615 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 Py_XDECREF(v);
6617 return NULL;
6618}
6619
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006620/* Charmap encoding: the lookup table */
6621
Alexander Belopolsky40018472011-02-26 01:02:56 +00006622struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 PyObject_HEAD
6624 unsigned char level1[32];
6625 int count2, count3;
6626 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006627};
6628
6629static PyObject*
6630encoding_map_size(PyObject *obj, PyObject* args)
6631{
6632 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006633 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006635}
6636
6637static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006638 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 PyDoc_STR("Return the size (in bytes) of this object") },
6640 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006641};
6642
6643static void
6644encoding_map_dealloc(PyObject* o)
6645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006646 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006647}
6648
6649static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006650 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 "EncodingMap", /*tp_name*/
6652 sizeof(struct encoding_map), /*tp_basicsize*/
6653 0, /*tp_itemsize*/
6654 /* methods */
6655 encoding_map_dealloc, /*tp_dealloc*/
6656 0, /*tp_print*/
6657 0, /*tp_getattr*/
6658 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006659 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 0, /*tp_repr*/
6661 0, /*tp_as_number*/
6662 0, /*tp_as_sequence*/
6663 0, /*tp_as_mapping*/
6664 0, /*tp_hash*/
6665 0, /*tp_call*/
6666 0, /*tp_str*/
6667 0, /*tp_getattro*/
6668 0, /*tp_setattro*/
6669 0, /*tp_as_buffer*/
6670 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6671 0, /*tp_doc*/
6672 0, /*tp_traverse*/
6673 0, /*tp_clear*/
6674 0, /*tp_richcompare*/
6675 0, /*tp_weaklistoffset*/
6676 0, /*tp_iter*/
6677 0, /*tp_iternext*/
6678 encoding_map_methods, /*tp_methods*/
6679 0, /*tp_members*/
6680 0, /*tp_getset*/
6681 0, /*tp_base*/
6682 0, /*tp_dict*/
6683 0, /*tp_descr_get*/
6684 0, /*tp_descr_set*/
6685 0, /*tp_dictoffset*/
6686 0, /*tp_init*/
6687 0, /*tp_alloc*/
6688 0, /*tp_new*/
6689 0, /*tp_free*/
6690 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006691};
6692
6693PyObject*
6694PyUnicode_BuildEncodingMap(PyObject* string)
6695{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006696 PyObject *result;
6697 struct encoding_map *mresult;
6698 int i;
6699 int need_dict = 0;
6700 unsigned char level1[32];
6701 unsigned char level2[512];
6702 unsigned char *mlevel1, *mlevel2, *mlevel3;
6703 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006704 int kind;
6705 void *data;
6706 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006708 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006709 PyErr_BadArgument();
6710 return NULL;
6711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006712 kind = PyUnicode_KIND(string);
6713 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006714 memset(level1, 0xFF, sizeof level1);
6715 memset(level2, 0xFF, sizeof level2);
6716
6717 /* If there isn't a one-to-one mapping of NULL to \0,
6718 or if there are non-BMP characters, we need to use
6719 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006721 need_dict = 1;
6722 for (i = 1; i < 256; i++) {
6723 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006724 ch = PyUnicode_READ(kind, data, i);
6725 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006726 need_dict = 1;
6727 break;
6728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006729 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006730 /* unmapped character */
6731 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006732 l1 = ch >> 11;
6733 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006734 if (level1[l1] == 0xFF)
6735 level1[l1] = count2++;
6736 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006737 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006738 }
6739
6740 if (count2 >= 0xFF || count3 >= 0xFF)
6741 need_dict = 1;
6742
6743 if (need_dict) {
6744 PyObject *result = PyDict_New();
6745 PyObject *key, *value;
6746 if (!result)
6747 return NULL;
6748 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006749 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006750 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006751 if (!key || !value)
6752 goto failed1;
6753 if (PyDict_SetItem(result, key, value) == -1)
6754 goto failed1;
6755 Py_DECREF(key);
6756 Py_DECREF(value);
6757 }
6758 return result;
6759 failed1:
6760 Py_XDECREF(key);
6761 Py_XDECREF(value);
6762 Py_DECREF(result);
6763 return NULL;
6764 }
6765
6766 /* Create a three-level trie */
6767 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6768 16*count2 + 128*count3 - 1);
6769 if (!result)
6770 return PyErr_NoMemory();
6771 PyObject_Init(result, &EncodingMapType);
6772 mresult = (struct encoding_map*)result;
6773 mresult->count2 = count2;
6774 mresult->count3 = count3;
6775 mlevel1 = mresult->level1;
6776 mlevel2 = mresult->level23;
6777 mlevel3 = mresult->level23 + 16*count2;
6778 memcpy(mlevel1, level1, 32);
6779 memset(mlevel2, 0xFF, 16*count2);
6780 memset(mlevel3, 0, 128*count3);
6781 count3 = 0;
6782 for (i = 1; i < 256; i++) {
6783 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006785 /* unmapped character */
6786 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787 o1 = PyUnicode_READ(kind, data, i)>>11;
6788 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006789 i2 = 16*mlevel1[o1] + o2;
6790 if (mlevel2[i2] == 0xFF)
6791 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006792 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006793 i3 = 128*mlevel2[i2] + o3;
6794 mlevel3[i3] = i;
6795 }
6796 return result;
6797}
6798
6799static int
6800encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6801{
6802 struct encoding_map *map = (struct encoding_map*)mapping;
6803 int l1 = c>>11;
6804 int l2 = (c>>7) & 0xF;
6805 int l3 = c & 0x7F;
6806 int i;
6807
6808#ifdef Py_UNICODE_WIDE
6809 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006811 }
6812#endif
6813 if (c == 0)
6814 return 0;
6815 /* level 1*/
6816 i = map->level1[l1];
6817 if (i == 0xFF) {
6818 return -1;
6819 }
6820 /* level 2*/
6821 i = map->level23[16*i+l2];
6822 if (i == 0xFF) {
6823 return -1;
6824 }
6825 /* level 3 */
6826 i = map->level23[16*map->count2 + 128*i + l3];
6827 if (i == 0) {
6828 return -1;
6829 }
6830 return i;
6831}
6832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833/* Lookup the character ch in the mapping. If the character
6834 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006835 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006836static PyObject *
6837charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838{
Christian Heimes217cfd12007-12-02 14:31:20 +00006839 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 PyObject *x;
6841
6842 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 x = PyObject_GetItem(mapping, w);
6845 Py_DECREF(w);
6846 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6848 /* No mapping found means: mapping is undefined. */
6849 PyErr_Clear();
6850 x = Py_None;
6851 Py_INCREF(x);
6852 return x;
6853 } else
6854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006856 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006858 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 long value = PyLong_AS_LONG(x);
6860 if (value < 0 || value > 255) {
6861 PyErr_SetString(PyExc_TypeError,
6862 "character mapping must be in range(256)");
6863 Py_DECREF(x);
6864 return NULL;
6865 }
6866 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006868 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 /* wrong return value */
6872 PyErr_Format(PyExc_TypeError,
6873 "character mapping must return integer, bytes or None, not %.400s",
6874 x->ob_type->tp_name);
6875 Py_DECREF(x);
6876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 }
6878}
6879
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006880static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006881charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6884 /* exponentially overallocate to minimize reallocations */
6885 if (requiredsize < 2*outsize)
6886 requiredsize = 2*outsize;
6887 if (_PyBytes_Resize(outobj, requiredsize))
6888 return -1;
6889 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006890}
6891
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,     /* the character was written to the output */
    enc_FAILED,      /* the mapping has no entry for the character */
    enc_EXCEPTION    /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006896 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897 space is available. Return a new reference to the object that
6898 was put in the output buffer, or Py_None, if the mapping was undefined
6899 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006900 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006901static charmapencode_result
6902charmapencode_output(Py_UNICODE c, PyObject *mapping,
6903 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006905 PyObject *rep;
6906 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006907 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908
Christian Heimes90aa7642007-12-19 02:45:37 +00006909 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006910 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006912 if (res == -1)
6913 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 if (outsize<requiredsize)
6915 if (charmapencode_resize(outobj, outpos, requiredsize))
6916 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 outstart[(*outpos)++] = (char)res;
6919 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006920 }
6921
6922 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006925 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 Py_DECREF(rep);
6927 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006928 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 if (PyLong_Check(rep)) {
6930 Py_ssize_t requiredsize = *outpos+1;
6931 if (outsize<requiredsize)
6932 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6933 Py_DECREF(rep);
6934 return enc_EXCEPTION;
6935 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006936 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 else {
6940 const char *repchars = PyBytes_AS_STRING(rep);
6941 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6942 Py_ssize_t requiredsize = *outpos+repsize;
6943 if (outsize<requiredsize)
6944 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6945 Py_DECREF(rep);
6946 return enc_EXCEPTION;
6947 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006948 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 memcpy(outstart + *outpos, repchars, repsize);
6950 *outpos += repsize;
6951 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006953 Py_DECREF(rep);
6954 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955}
6956
6957/* handle an error in PyUnicode_EncodeCharmap
6958 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959static int
6960charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006961 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006963 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006964 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006965{
6966 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006967 Py_ssize_t repsize;
6968 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 Py_UNICODE *uni2;
6970 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006971 Py_ssize_t collstartpos = *inpos;
6972 Py_ssize_t collendpos = *inpos+1;
6973 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006974 char *encoding = "charmap";
6975 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006976 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 /* find all unencodable characters */
6979 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006980 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006981 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 int res = encoding_map_lookup(p[collendpos], mapping);
6983 if (res != -1)
6984 break;
6985 ++collendpos;
6986 continue;
6987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 rep = charmapencode_lookup(p[collendpos], mapping);
6990 if (rep==NULL)
6991 return -1;
6992 else if (rep!=Py_None) {
6993 Py_DECREF(rep);
6994 break;
6995 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006996 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 }
6999 /* cache callback name lookup
7000 * (if not done yet, i.e. it's the first error) */
7001 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 if ((errors==NULL) || (!strcmp(errors, "strict")))
7003 *known_errorHandler = 1;
7004 else if (!strcmp(errors, "replace"))
7005 *known_errorHandler = 2;
7006 else if (!strcmp(errors, "ignore"))
7007 *known_errorHandler = 3;
7008 else if (!strcmp(errors, "xmlcharrefreplace"))
7009 *known_errorHandler = 4;
7010 else
7011 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 }
7013 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 case 1: /* strict */
7015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7016 return -1;
7017 case 2: /* replace */
7018 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 x = charmapencode_output('?', mapping, res, respos);
7020 if (x==enc_EXCEPTION) {
7021 return -1;
7022 }
7023 else if (x==enc_FAILED) {
7024 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7025 return -1;
7026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027 }
7028 /* fall through */
7029 case 3: /* ignore */
7030 *inpos = collendpos;
7031 break;
7032 case 4: /* xmlcharrefreplace */
7033 /* generate replacement (temporarily (mis)uses p) */
7034 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 char buffer[2+29+1+1];
7036 char *cp;
7037 sprintf(buffer, "&#%d;", (int)p[collpos]);
7038 for (cp = buffer; *cp; ++cp) {
7039 x = charmapencode_output(*cp, mapping, res, respos);
7040 if (x==enc_EXCEPTION)
7041 return -1;
7042 else if (x==enc_FAILED) {
7043 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7044 return -1;
7045 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007046 }
7047 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048 *inpos = collendpos;
7049 break;
7050 default:
7051 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 encoding, reason, p, size, exceptionObject,
7053 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007054 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007056 if (PyBytes_Check(repunicode)) {
7057 /* Directly copy bytes result to output. */
7058 Py_ssize_t outsize = PyBytes_Size(*res);
7059 Py_ssize_t requiredsize;
7060 repsize = PyBytes_Size(repunicode);
7061 requiredsize = *respos + repsize;
7062 if (requiredsize > outsize)
7063 /* Make room for all additional bytes. */
7064 if (charmapencode_resize(res, respos, requiredsize)) {
7065 Py_DECREF(repunicode);
7066 return -1;
7067 }
7068 memcpy(PyBytes_AsString(*res) + *respos,
7069 PyBytes_AsString(repunicode), repsize);
7070 *respos += repsize;
7071 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007072 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007073 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007075 /* generate replacement */
7076 repsize = PyUnicode_GET_SIZE(repunicode);
7077 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 x = charmapencode_output(*uni2, mapping, res, respos);
7079 if (x==enc_EXCEPTION) {
7080 return -1;
7081 }
7082 else if (x==enc_FAILED) {
7083 Py_DECREF(repunicode);
7084 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7085 return -1;
7086 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007087 }
7088 *inpos = newpos;
7089 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090 }
7091 return 0;
7092}
7093
Alexander Belopolsky40018472011-02-26 01:02:56 +00007094PyObject *
7095PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7096 Py_ssize_t size,
7097 PyObject *mapping,
7098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 /* output object */
7101 PyObject *res = NULL;
7102 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007103 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007105 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106 PyObject *errorHandler = NULL;
7107 PyObject *exc = NULL;
7108 /* the following variable is used for caching string comparisons
7109 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7110 * 3=ignore, 4=xmlcharrefreplace */
7111 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112
7113 /* Default to Latin-1 */
7114 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007117 /* allocate enough for a simple encoding without
7118 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007119 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120 if (res == NULL)
7121 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007122 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007125 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* try to encode it */
7127 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7128 if (x==enc_EXCEPTION) /* error */
7129 goto onError;
7130 if (x==enc_FAILED) { /* unencodable character */
7131 if (charmap_encoding_error(p, size, &inpos, mapping,
7132 &exc,
7133 &known_errorHandler, &errorHandler, errors,
7134 &res, &respos)) {
7135 goto onError;
7136 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 else
7139 /* done with this character => adjust input position */
7140 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007144 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007145 if (_PyBytes_Resize(&res, respos) < 0)
7146 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148 Py_XDECREF(exc);
7149 Py_XDECREF(errorHandler);
7150 return res;
7151
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 Py_XDECREF(res);
7154 Py_XDECREF(exc);
7155 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 return NULL;
7157}
7158
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159PyObject *
7160PyUnicode_AsCharmapString(PyObject *unicode,
7161 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
7163 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyErr_BadArgument();
7165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 }
7167 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 PyUnicode_GET_SIZE(unicode),
7169 mapping,
7170 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007174static void
7175make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007176 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007177 Py_ssize_t startpos, Py_ssize_t endpos,
7178 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007181 *exceptionObject = _PyUnicodeTranslateError_Create(
7182 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 }
7184 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7186 goto onError;
7187 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7188 goto onError;
7189 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7190 goto onError;
7191 return;
7192 onError:
7193 Py_DECREF(*exceptionObject);
7194 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 }
7196}
7197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007199static void
7200raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007201 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007202 Py_ssize_t startpos, Py_ssize_t endpos,
7203 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204{
7205 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007206 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007207 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209}
7210
7211/* error handling callback helper:
7212 build arguments, call the callback and check the arguments,
7213 put the result into newpos and return the replacement string, which
7214 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007215static PyObject *
7216unicode_translate_call_errorhandler(const char *errors,
7217 PyObject **errorHandler,
7218 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007219 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007220 Py_ssize_t startpos, Py_ssize_t endpos,
7221 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007223 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007225 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 PyObject *restuple;
7227 PyObject *resunicode;
7228
7229 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 }
7234
7235 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007236 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007237 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239
7240 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007244 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007245 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 Py_DECREF(restuple);
7247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007248 }
7249 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 &resunicode, &i_newpos)) {
7251 Py_DECREF(restuple);
7252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007255 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 else
7257 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007258 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7260 Py_DECREF(restuple);
7261 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263 Py_INCREF(resunicode);
7264 Py_DECREF(restuple);
7265 return resunicode;
7266}
7267
7268/* Lookup the character ch in the mapping and put the result in result,
7269 which must be decrefed by the caller.
7270 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007271static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273{
Christian Heimes217cfd12007-12-02 14:31:20 +00007274 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275 PyObject *x;
7276
7277 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007279 x = PyObject_GetItem(mapping, w);
7280 Py_DECREF(w);
7281 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7283 /* No mapping found means: use 1:1 mapping. */
7284 PyErr_Clear();
7285 *result = NULL;
7286 return 0;
7287 } else
7288 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007289 }
7290 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 *result = x;
7292 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007294 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 long value = PyLong_AS_LONG(x);
7296 long max = PyUnicode_GetMax();
7297 if (value < 0 || value > max) {
7298 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007299 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 Py_DECREF(x);
7301 return -1;
7302 }
7303 *result = x;
7304 return 0;
7305 }
7306 else if (PyUnicode_Check(x)) {
7307 *result = x;
7308 return 0;
7309 }
7310 else {
7311 /* wrong return value */
7312 PyErr_SetString(PyExc_TypeError,
7313 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007314 Py_DECREF(x);
7315 return -1;
7316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007317}
7318/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 if not reallocate and adjust various state variables.
7320 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007321static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007322charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007326 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 /* exponentially overallocate to minimize reallocations */
7328 if (requiredsize < 2 * oldsize)
7329 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007330 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7331 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007334 }
7335 return 0;
7336}
7337/* lookup the character, put the result in the output string and adjust
7338 various state variables. Return a new reference to the object that
7339 was put in the output buffer in *result, or Py_None, if the mapping was
7340 undefined (in which case no character was written).
7341 The called must decref result.
7342 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007343static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007344charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7345 PyObject *mapping, Py_UCS4 **output,
7346 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007347 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007349 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7350 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007354 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355 }
7356 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007358 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007360 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007361 }
7362 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007363 Py_ssize_t repsize;
7364 if (PyUnicode_READY(*res) == -1)
7365 return -1;
7366 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 if (repsize==1) {
7368 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 }
7371 else if (repsize!=0) {
7372 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007373 Py_ssize_t requiredsize = *opos +
7374 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007376 Py_ssize_t i;
7377 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 for(i = 0; i < repsize; i++)
7380 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 }
7383 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 return 0;
7386}
7387
Alexander Belopolsky40018472011-02-26 01:02:56 +00007388PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389_PyUnicode_TranslateCharmap(PyObject *input,
7390 PyObject *mapping,
7391 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393 /* input object */
7394 char *idata;
7395 Py_ssize_t size, i;
7396 int kind;
7397 /* output buffer */
7398 Py_UCS4 *output = NULL;
7399 Py_ssize_t osize;
7400 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007401 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007402 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 char *reason = "character maps to <undefined>";
7404 PyObject *errorHandler = NULL;
7405 PyObject *exc = NULL;
7406 /* the following variable is used for caching string comparisons
7407 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7408 * 3=ignore, 4=xmlcharrefreplace */
7409 int known_errorHandler = -1;
7410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 PyErr_BadArgument();
7413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007416 if (PyUnicode_READY(input) == -1)
7417 return NULL;
7418 idata = (char*)PyUnicode_DATA(input);
7419 kind = PyUnicode_KIND(input);
7420 size = PyUnicode_GET_LENGTH(input);
7421 i = 0;
7422
7423 if (size == 0) {
7424 Py_INCREF(input);
7425 return input;
7426 }
7427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007428 /* allocate enough for a simple 1:1 translation without
7429 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007430 osize = size;
7431 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7432 opos = 0;
7433 if (output == NULL) {
7434 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 /* try to encode it */
7440 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 if (charmaptranslate_output(input, i, mapping,
7442 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 Py_XDECREF(x);
7444 goto onError;
7445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007446 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 else { /* untranslatable character */
7450 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7451 Py_ssize_t repsize;
7452 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007453 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007455 Py_ssize_t collstart = i;
7456 Py_ssize_t collend = i+1;
7457 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 while (collend < size) {
7461 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 goto onError;
7463 Py_XDECREF(x);
7464 if (x!=Py_None)
7465 break;
7466 ++collend;
7467 }
7468 /* cache callback name lookup
7469 * (if not done yet, i.e. it's the first error) */
7470 if (known_errorHandler==-1) {
7471 if ((errors==NULL) || (!strcmp(errors, "strict")))
7472 known_errorHandler = 1;
7473 else if (!strcmp(errors, "replace"))
7474 known_errorHandler = 2;
7475 else if (!strcmp(errors, "ignore"))
7476 known_errorHandler = 3;
7477 else if (!strcmp(errors, "xmlcharrefreplace"))
7478 known_errorHandler = 4;
7479 else
7480 known_errorHandler = 0;
7481 }
7482 switch (known_errorHandler) {
7483 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 raise_translate_exception(&exc, input, collstart,
7485 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007486 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 case 2: /* replace */
7488 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 for (coll = collstart; coll<collend; coll++)
7490 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 /* fall through */
7492 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007493 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 break;
7495 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 /* generate replacement (temporarily (mis)uses i) */
7497 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 char buffer[2+29+1+1];
7499 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007500 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7501 if (charmaptranslate_makespace(&output, &osize,
7502 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 goto onError;
7504 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007505 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 break;
7509 default:
7510 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007511 reason, input, &exc,
7512 collstart, collend, &newpos);
7513 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 goto onError;
7515 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007516 repsize = PyUnicode_GET_LENGTH(repunicode);
7517 if (charmaptranslate_makespace(&output, &osize,
7518 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 Py_DECREF(repunicode);
7520 goto onError;
7521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007522 for (uni2 = 0; repsize-->0; ++uni2)
7523 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7524 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007526 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007527 }
7528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7530 if (!res)
7531 goto onError;
7532 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007533 Py_XDECREF(exc);
7534 Py_XDECREF(errorHandler);
7535 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007538 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 Py_XDECREF(exc);
7540 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541 return NULL;
7542}
7543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007544/* Deprecated. Use PyUnicode_Translate instead. */
7545PyObject *
7546PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7547 Py_ssize_t size,
7548 PyObject *mapping,
7549 const char *errors)
7550{
7551 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7552 if (!unicode)
7553 return NULL;
7554 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7555}
7556
Alexander Belopolsky40018472011-02-26 01:02:56 +00007557PyObject *
7558PyUnicode_Translate(PyObject *str,
7559 PyObject *mapping,
7560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
7562 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007563
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 str = PyUnicode_FromObject(str);
7565 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 Py_DECREF(str);
7569 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007570
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 Py_XDECREF(str);
7573 return NULL;
7574}
Tim Petersced69f82003-09-16 20:30:58 +00007575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007576static Py_UCS4
7577fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7578{
7579 /* No need to call PyUnicode_READY(self) because this function is only
7580 called as a callback from fixup() which does it already. */
7581 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7582 const int kind = PyUnicode_KIND(self);
7583 void *data = PyUnicode_DATA(self);
7584 Py_UCS4 maxchar = 0, ch, fixed;
7585 Py_ssize_t i;
7586
7587 for (i = 0; i < len; ++i) {
7588 ch = PyUnicode_READ(kind, data, i);
7589 fixed = 0;
7590 if (ch > 127) {
7591 if (Py_UNICODE_ISSPACE(ch))
7592 fixed = ' ';
7593 else {
7594 const int decimal = Py_UNICODE_TODECIMAL(ch);
7595 if (decimal >= 0)
7596 fixed = '0' + decimal;
7597 }
7598 if (fixed != 0) {
7599 if (fixed > maxchar)
7600 maxchar = fixed;
7601 PyUnicode_WRITE(kind, data, i, fixed);
7602 }
7603 else if (ch > maxchar)
7604 maxchar = ch;
7605 }
7606 else if (ch > maxchar)
7607 maxchar = ch;
7608 }
7609
7610 return maxchar;
7611}
7612
7613PyObject *
7614_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7615{
7616 if (!PyUnicode_Check(unicode)) {
7617 PyErr_BadInternalCall();
7618 return NULL;
7619 }
7620 if (PyUnicode_READY(unicode) == -1)
7621 return NULL;
7622 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7623 /* If the string is already ASCII, just return the same string */
7624 Py_INCREF(unicode);
7625 return unicode;
7626 }
7627 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7628}
7629
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007630PyObject *
7631PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7632 Py_ssize_t length)
7633{
7634 PyObject *result;
7635 Py_UNICODE *p; /* write pointer into result */
7636 Py_ssize_t i;
7637 /* Copy to a new string */
7638 result = (PyObject *)_PyUnicode_New(length);
7639 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7640 if (result == NULL)
7641 return result;
7642 p = PyUnicode_AS_UNICODE(result);
7643 /* Iterate over code points */
7644 for (i = 0; i < length; i++) {
7645 Py_UNICODE ch =s[i];
7646 if (ch > 127) {
7647 int decimal = Py_UNICODE_TODECIMAL(ch);
7648 if (decimal >= 0)
7649 p[i] = '0' + decimal;
7650 }
7651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007652 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7653 Py_DECREF(result);
7654 return NULL;
7655 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007656 return result;
7657}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007658/* --- Decimal Encoder ---------------------------------------------------- */
7659
Alexander Belopolsky40018472011-02-26 01:02:56 +00007660int
7661PyUnicode_EncodeDecimal(Py_UNICODE *s,
7662 Py_ssize_t length,
7663 char *output,
7664 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007665{
7666 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 PyObject *errorHandler = NULL;
7668 PyObject *exc = NULL;
7669 const char *encoding = "decimal";
7670 const char *reason = "invalid decimal Unicode string";
7671 /* the following variable is used for caching string comparisons
7672 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7673 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007674
7675 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 PyErr_BadArgument();
7677 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007678 }
7679
7680 p = s;
7681 end = s + length;
7682 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 register Py_UNICODE ch = *p;
7684 int decimal;
7685 PyObject *repunicode;
7686 Py_ssize_t repsize;
7687 Py_ssize_t newpos;
7688 Py_UNICODE *uni2;
7689 Py_UNICODE *collstart;
7690 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007691
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 ++p;
7695 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 decimal = Py_UNICODE_TODECIMAL(ch);
7698 if (decimal >= 0) {
7699 *output++ = '0' + decimal;
7700 ++p;
7701 continue;
7702 }
7703 if (0 < ch && ch < 256) {
7704 *output++ = (char)ch;
7705 ++p;
7706 continue;
7707 }
7708 /* All other characters are considered unencodable */
7709 collstart = p;
7710 collend = p+1;
7711 while (collend < end) {
7712 if ((0 < *collend && *collend < 256) ||
7713 !Py_UNICODE_ISSPACE(*collend) ||
7714 Py_UNICODE_TODECIMAL(*collend))
7715 break;
7716 }
7717 /* cache callback name lookup
7718 * (if not done yet, i.e. it's the first error) */
7719 if (known_errorHandler==-1) {
7720 if ((errors==NULL) || (!strcmp(errors, "strict")))
7721 known_errorHandler = 1;
7722 else if (!strcmp(errors, "replace"))
7723 known_errorHandler = 2;
7724 else if (!strcmp(errors, "ignore"))
7725 known_errorHandler = 3;
7726 else if (!strcmp(errors, "xmlcharrefreplace"))
7727 known_errorHandler = 4;
7728 else
7729 known_errorHandler = 0;
7730 }
7731 switch (known_errorHandler) {
7732 case 1: /* strict */
7733 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7734 goto onError;
7735 case 2: /* replace */
7736 for (p = collstart; p < collend; ++p)
7737 *output++ = '?';
7738 /* fall through */
7739 case 3: /* ignore */
7740 p = collend;
7741 break;
7742 case 4: /* xmlcharrefreplace */
7743 /* generate replacement (temporarily (mis)uses p) */
7744 for (p = collstart; p < collend; ++p)
7745 output += sprintf(output, "&#%d;", (int)*p);
7746 p = collend;
7747 break;
7748 default:
7749 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7750 encoding, reason, s, length, &exc,
7751 collstart-s, collend-s, &newpos);
7752 if (repunicode == NULL)
7753 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007754 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007755 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007756 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7757 Py_DECREF(repunicode);
7758 goto onError;
7759 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 /* generate replacement */
7761 repsize = PyUnicode_GET_SIZE(repunicode);
7762 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7763 Py_UNICODE ch = *uni2;
7764 if (Py_UNICODE_ISSPACE(ch))
7765 *output++ = ' ';
7766 else {
7767 decimal = Py_UNICODE_TODECIMAL(ch);
7768 if (decimal >= 0)
7769 *output++ = '0' + decimal;
7770 else if (0 < ch && ch < 256)
7771 *output++ = (char)ch;
7772 else {
7773 Py_DECREF(repunicode);
7774 raise_encode_exception(&exc, encoding,
7775 s, length, collstart-s, collend-s, reason);
7776 goto onError;
7777 }
7778 }
7779 }
7780 p = s + newpos;
7781 Py_DECREF(repunicode);
7782 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007783 }
7784 /* 0-terminate the output string */
7785 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 Py_XDECREF(exc);
7787 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007788 return 0;
7789
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007791 Py_XDECREF(exc);
7792 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007793 return -1;
7794}
7795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796/* --- Helpers ------------------------------------------------------------ */
7797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798#include "stringlib/ucs1lib.h"
7799#include "stringlib/fastsearch.h"
7800#include "stringlib/partition.h"
7801#include "stringlib/split.h"
7802#include "stringlib/count.h"
7803#include "stringlib/find.h"
7804#include "stringlib/localeutil.h"
7805#include "stringlib/undef.h"
7806
7807#include "stringlib/ucs2lib.h"
7808#include "stringlib/fastsearch.h"
7809#include "stringlib/partition.h"
7810#include "stringlib/split.h"
7811#include "stringlib/count.h"
7812#include "stringlib/find.h"
7813#include "stringlib/localeutil.h"
7814#include "stringlib/undef.h"
7815
7816#include "stringlib/ucs4lib.h"
7817#include "stringlib/fastsearch.h"
7818#include "stringlib/partition.h"
7819#include "stringlib/split.h"
7820#include "stringlib/count.h"
7821#include "stringlib/find.h"
7822#include "stringlib/localeutil.h"
7823#include "stringlib/undef.h"
7824
7825static Py_ssize_t
7826any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7827 const Py_UCS1*, Py_ssize_t,
7828 Py_ssize_t, Py_ssize_t),
7829 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7830 const Py_UCS2*, Py_ssize_t,
7831 Py_ssize_t, Py_ssize_t),
7832 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7833 const Py_UCS4*, Py_ssize_t,
7834 Py_ssize_t, Py_ssize_t),
7835 PyObject* s1, PyObject* s2,
7836 Py_ssize_t start,
7837 Py_ssize_t end)
7838{
7839 int kind1, kind2, kind;
7840 void *buf1, *buf2;
7841 Py_ssize_t len1, len2, result;
7842
7843 kind1 = PyUnicode_KIND(s1);
7844 kind2 = PyUnicode_KIND(s2);
7845 kind = kind1 > kind2 ? kind1 : kind2;
7846 buf1 = PyUnicode_DATA(s1);
7847 buf2 = PyUnicode_DATA(s2);
7848 if (kind1 != kind)
7849 buf1 = _PyUnicode_AsKind(s1, kind);
7850 if (!buf1)
7851 return -2;
7852 if (kind2 != kind)
7853 buf2 = _PyUnicode_AsKind(s2, kind);
7854 if (!buf2) {
7855 if (kind1 != kind) PyMem_Free(buf1);
7856 return -2;
7857 }
7858 len1 = PyUnicode_GET_LENGTH(s1);
7859 len2 = PyUnicode_GET_LENGTH(s2);
7860
7861 switch(kind) {
7862 case PyUnicode_1BYTE_KIND:
7863 result = ucs1(buf1, len1, buf2, len2, start, end);
7864 break;
7865 case PyUnicode_2BYTE_KIND:
7866 result = ucs2(buf1, len1, buf2, len2, start, end);
7867 break;
7868 case PyUnicode_4BYTE_KIND:
7869 result = ucs4(buf1, len1, buf2, len2, start, end);
7870 break;
7871 default:
7872 assert(0); result = -2;
7873 }
7874
7875 if (kind1 != kind)
7876 PyMem_Free(buf1);
7877 if (kind2 != kind)
7878 PyMem_Free(buf2);
7879
7880 return result;
7881}
7882
7883Py_ssize_t
7884_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7885 Py_ssize_t n_buffer,
7886 void *digits, Py_ssize_t n_digits,
7887 Py_ssize_t min_width,
7888 const char *grouping,
7889 const char *thousands_sep)
7890{
7891 switch(kind) {
7892 case PyUnicode_1BYTE_KIND:
7893 return _PyUnicode_ucs1_InsertThousandsGrouping(
7894 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7895 min_width, grouping, thousands_sep);
7896 case PyUnicode_2BYTE_KIND:
7897 return _PyUnicode_ucs2_InsertThousandsGrouping(
7898 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7899 min_width, grouping, thousands_sep);
7900 case PyUnicode_4BYTE_KIND:
7901 return _PyUnicode_ucs4_InsertThousandsGrouping(
7902 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7903 min_width, grouping, thousands_sep);
7904 }
7905 assert(0);
7906 return -1;
7907}
7908
7909
Eric Smith8c663262007-08-25 02:26:07 +00007910#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007911#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007912
Thomas Wouters477c8d52006-05-27 19:21:47 +00007913#include "stringlib/count.h"
7914#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007915
/* Helper macro to fix up start/end slice values against a length:
   end is capped at len, negative values are taken relative to len and
   clamped at 0.  start and end must be assignable lvalues; all three
   arguments are evaluated more than once, so they must not have side
   effects.  Arguments are parenthesized so that expression arguments
   (e.g. a conditional expression for len) keep their meaning.

   The macro expands to plain statements (not a do/while block), so it must
   not be used as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007930
Alexander Belopolsky40018472011-02-26 01:02:56 +00007931Py_ssize_t
7932PyUnicode_Count(PyObject *str,
7933 PyObject *substr,
7934 Py_ssize_t start,
7935 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007937 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007938 PyUnicodeObject* str_obj;
7939 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007940 int kind1, kind2, kind;
7941 void *buf1 = NULL, *buf2 = NULL;
7942 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007943
Thomas Wouters477c8d52006-05-27 19:21:47 +00007944 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007945 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007948 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 Py_DECREF(str_obj);
7950 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 }
Tim Petersced69f82003-09-16 20:30:58 +00007952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 kind1 = PyUnicode_KIND(str_obj);
7954 kind2 = PyUnicode_KIND(sub_obj);
7955 kind = kind1 > kind2 ? kind1 : kind2;
7956 buf1 = PyUnicode_DATA(str_obj);
7957 if (kind1 != kind)
7958 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7959 if (!buf1)
7960 goto onError;
7961 buf2 = PyUnicode_DATA(sub_obj);
7962 if (kind2 != kind)
7963 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7964 if (!buf2)
7965 goto onError;
7966 len1 = PyUnicode_GET_LENGTH(str_obj);
7967 len2 = PyUnicode_GET_LENGTH(sub_obj);
7968
7969 ADJUST_INDICES(start, end, len1);
7970 switch(kind) {
7971 case PyUnicode_1BYTE_KIND:
7972 result = ucs1lib_count(
7973 ((Py_UCS1*)buf1) + start, end - start,
7974 buf2, len2, PY_SSIZE_T_MAX
7975 );
7976 break;
7977 case PyUnicode_2BYTE_KIND:
7978 result = ucs2lib_count(
7979 ((Py_UCS2*)buf1) + start, end - start,
7980 buf2, len2, PY_SSIZE_T_MAX
7981 );
7982 break;
7983 case PyUnicode_4BYTE_KIND:
7984 result = ucs4lib_count(
7985 ((Py_UCS4*)buf1) + start, end - start,
7986 buf2, len2, PY_SSIZE_T_MAX
7987 );
7988 break;
7989 default:
7990 assert(0); result = 0;
7991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007992
7993 Py_DECREF(sub_obj);
7994 Py_DECREF(str_obj);
7995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007996 if (kind1 != kind)
7997 PyMem_Free(buf1);
7998 if (kind2 != kind)
7999 PyMem_Free(buf2);
8000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008002 onError:
8003 Py_DECREF(sub_obj);
8004 Py_DECREF(str_obj);
8005 if (kind1 != kind && buf1)
8006 PyMem_Free(buf1);
8007 if (kind2 != kind && buf2)
8008 PyMem_Free(buf2);
8009 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010}
8011
Alexander Belopolsky40018472011-02-26 01:02:56 +00008012Py_ssize_t
8013PyUnicode_Find(PyObject *str,
8014 PyObject *sub,
8015 Py_ssize_t start,
8016 Py_ssize_t end,
8017 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008024 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008025 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 Py_DECREF(str);
8027 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Tim Petersced69f82003-09-16 20:30:58 +00008029
Thomas Wouters477c8d52006-05-27 19:21:47 +00008030 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 result = any_find_slice(
8032 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8033 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008034 );
8035 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 result = any_find_slice(
8037 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8038 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008039 );
8040
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008042 Py_DECREF(sub);
8043
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 return result;
8045}
8046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047Py_ssize_t
8048PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8049 Py_ssize_t start, Py_ssize_t end,
8050 int direction)
8051{
8052 char *result;
8053 int kind;
8054 if (PyUnicode_READY(str) == -1)
8055 return -2;
8056 if (end > PyUnicode_GET_LENGTH(str))
8057 end = PyUnicode_GET_LENGTH(str);
8058 kind = PyUnicode_KIND(str);
8059 result = findchar(PyUnicode_1BYTE_DATA(str)
8060 + PyUnicode_KIND_SIZE(kind, start),
8061 kind,
8062 end-start, ch, direction);
8063 if (!result)
8064 return -1;
8065 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8066}
8067
Alexander Belopolsky40018472011-02-26 01:02:56 +00008068static int
8069tailmatch(PyUnicodeObject *self,
8070 PyUnicodeObject *substring,
8071 Py_ssize_t start,
8072 Py_ssize_t end,
8073 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 int kind_self;
8076 int kind_sub;
8077 void *data_self;
8078 void *data_sub;
8079 Py_ssize_t offset;
8080 Py_ssize_t i;
8081 Py_ssize_t end_sub;
8082
8083 if (PyUnicode_READY(self) == -1 ||
8084 PyUnicode_READY(substring) == -1)
8085 return 0;
8086
8087 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 return 1;
8089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8091 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095 kind_self = PyUnicode_KIND(self);
8096 data_self = PyUnicode_DATA(self);
8097 kind_sub = PyUnicode_KIND(substring);
8098 data_sub = PyUnicode_DATA(substring);
8099 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8100
8101 if (direction > 0)
8102 offset = end;
8103 else
8104 offset = start;
8105
8106 if (PyUnicode_READ(kind_self, data_self, offset) ==
8107 PyUnicode_READ(kind_sub, data_sub, 0) &&
8108 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8109 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8110 /* If both are of the same kind, memcmp is sufficient */
8111 if (kind_self == kind_sub) {
8112 return ! memcmp((char *)data_self +
8113 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8114 data_sub,
8115 PyUnicode_GET_LENGTH(substring) *
8116 PyUnicode_CHARACTER_SIZE(substring));
8117 }
8118 /* otherwise we have to compare each character by first accesing it */
8119 else {
8120 /* We do not need to compare 0 and len(substring)-1 because
8121 the if statement above ensured already that they are equal
8122 when we end up here. */
8123 // TODO: honor direction and do a forward or backwards search
8124 for (i = 1; i < end_sub; ++i) {
8125 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8126 PyUnicode_READ(kind_sub, data_sub, i))
8127 return 0;
8128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 }
8132
8133 return 0;
8134}
8135
Alexander Belopolsky40018472011-02-26 01:02:56 +00008136Py_ssize_t
8137PyUnicode_Tailmatch(PyObject *str,
8138 PyObject *substr,
8139 Py_ssize_t start,
8140 Py_ssize_t end,
8141 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008144
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 str = PyUnicode_FromObject(str);
8146 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 substr = PyUnicode_FromObject(substr);
8149 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 Py_DECREF(str);
8151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 }
Tim Petersced69f82003-09-16 20:30:58 +00008153
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 (PyUnicodeObject *)substr,
8156 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 Py_DECREF(str);
8158 Py_DECREF(substr);
8159 return result;
8160}
8161
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162/* Apply fixfct filter to the Unicode object self and return a
8163 reference to the modified object */
8164
Alexander Belopolsky40018472011-02-26 01:02:56 +00008165static PyObject *
8166fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 PyObject *u;
8170 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172 if (PyUnicode_READY(self) == -1)
8173 return NULL;
8174 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8175 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8176 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8181 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 /* fix functions return the new maximum character in a string,
8184 if the kind of the resulting unicode object does not change,
8185 everything is fine. Otherwise we need to change the string kind
8186 and re-run the fix function. */
8187 maxchar_new = fixfct((PyUnicodeObject*)u);
8188 if (maxchar_new == 0)
8189 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8190 else if (maxchar_new <= 127)
8191 maxchar_new = 127;
8192 else if (maxchar_new <= 255)
8193 maxchar_new = 255;
8194 else if (maxchar_new <= 65535)
8195 maxchar_new = 65535;
8196 else
8197 maxchar_new = 1114111; /* 0x10ffff */
8198
8199 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 /* fixfct should return TRUE if it modified the buffer. If
8201 FALSE, return a reference to the original buffer instead
8202 (to save space, not time) */
8203 Py_INCREF(self);
8204 Py_DECREF(u);
8205 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 else if (maxchar_new == maxchar_old) {
8208 return u;
8209 }
8210 else {
8211 /* In case the maximum character changed, we need to
8212 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008213 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008214 if (v == NULL) {
8215 Py_DECREF(u);
8216 return NULL;
8217 }
8218 if (maxchar_new > maxchar_old) {
8219 /* If the maxchar increased so that the kind changed, not all
8220 characters are representable anymore and we need to fix the
8221 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008222 if (PyUnicode_CopyCharacters(v, 0,
8223 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008224 PyUnicode_GET_LENGTH(self)) < 0)
8225 {
8226 Py_DECREF(u);
8227 return NULL;
8228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 maxchar_old = fixfct((PyUnicodeObject*)v);
8230 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8231 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008232 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008233 if (PyUnicode_CopyCharacters(v, 0,
8234 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008235 PyUnicode_GET_LENGTH(self)) < 0)
8236 {
8237 Py_DECREF(u);
8238 return NULL;
8239 }
8240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241
8242 Py_DECREF(u);
8243 return v;
8244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245}
8246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008248fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 /* No need to call PyUnicode_READY(self) because this function is only
8251 called as a callback from fixup() which does it already. */
8252 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8253 const int kind = PyUnicode_KIND(self);
8254 void *data = PyUnicode_DATA(self);
8255 int touched = 0;
8256 Py_UCS4 maxchar = 0;
8257 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 for (i = 0; i < len; ++i) {
8260 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8261 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8262 if (up != ch) {
8263 if (up > maxchar)
8264 maxchar = up;
8265 PyUnicode_WRITE(kind, data, i, up);
8266 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 else if (ch > maxchar)
8269 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 }
8271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 if (touched)
8273 return maxchar;
8274 else
8275 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276}
8277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8282 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8283 const int kind = PyUnicode_KIND(self);
8284 void *data = PyUnicode_DATA(self);
8285 int touched = 0;
8286 Py_UCS4 maxchar = 0;
8287 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 for(i = 0; i < len; ++i) {
8290 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8291 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8292 if (lo != ch) {
8293 if (lo > maxchar)
8294 maxchar = lo;
8295 PyUnicode_WRITE(kind, data, i, lo);
8296 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 else if (ch > maxchar)
8299 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
8301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 if (touched)
8303 return maxchar;
8304 else
8305 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306}
8307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8312 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8313 const int kind = PyUnicode_KIND(self);
8314 void *data = PyUnicode_DATA(self);
8315 int touched = 0;
8316 Py_UCS4 maxchar = 0;
8317 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 for(i = 0; i < len; ++i) {
8320 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8321 Py_UCS4 nu = 0;
8322
8323 if (Py_UNICODE_ISUPPER(ch))
8324 nu = Py_UNICODE_TOLOWER(ch);
8325 else if (Py_UNICODE_ISLOWER(ch))
8326 nu = Py_UNICODE_TOUPPER(ch);
8327
8328 if (nu != 0) {
8329 if (nu > maxchar)
8330 maxchar = nu;
8331 PyUnicode_WRITE(kind, data, i, nu);
8332 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 else if (ch > maxchar)
8335 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 }
8337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 if (touched)
8339 return maxchar;
8340 else
8341 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342}
8343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8348 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8349 const int kind = PyUnicode_KIND(self);
8350 void *data = PyUnicode_DATA(self);
8351 int touched = 0;
8352 Py_UCS4 maxchar = 0;
8353 Py_ssize_t i = 0;
8354 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008355
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008356 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358
8359 ch = PyUnicode_READ(kind, data, i);
8360 if (!Py_UNICODE_ISUPPER(ch)) {
8361 maxchar = Py_UNICODE_TOUPPER(ch);
8362 PyUnicode_WRITE(kind, data, i, maxchar);
8363 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 ++i;
8366 for(; i < len; ++i) {
8367 ch = PyUnicode_READ(kind, data, i);
8368 if (!Py_UNICODE_ISLOWER(ch)) {
8369 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8370 if (lo > maxchar)
8371 maxchar = lo;
8372 PyUnicode_WRITE(kind, data, i, lo);
8373 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 else if (ch > maxchar)
8376 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378
8379 if (touched)
8380 return maxchar;
8381 else
8382 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383}
8384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8389 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8390 const int kind = PyUnicode_KIND(self);
8391 void *data = PyUnicode_DATA(self);
8392 Py_UCS4 maxchar = 0;
8393 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 int previous_is_cased;
8395
8396 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 if (len == 1) {
8398 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8399 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8400 if (ti != ch) {
8401 PyUnicode_WRITE(kind, data, i, ti);
8402 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 }
8404 else
8405 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 for(; i < len; ++i) {
8409 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8410 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008411
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 nu = Py_UNICODE_TOTITLE(ch);
8416
8417 if (nu > maxchar)
8418 maxchar = nu;
8419 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008420
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (Py_UNICODE_ISLOWER(ch) ||
8422 Py_UNICODE_ISUPPER(ch) ||
8423 Py_UNICODE_ISTITLE(ch))
8424 previous_is_cased = 1;
8425 else
8426 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429}
8430
Tim Peters8ce9f162004-08-27 01:49:32 +00008431PyObject *
8432PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008435 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008437 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008438 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8439 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008440 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441 Py_ssize_t sz, i, res_offset;
8442 Py_UCS4 maxchar = 0;
8443 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444
Tim Peters05eba1f2004-08-27 21:32:02 +00008445 fseq = PySequence_Fast(seq, "");
8446 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008448 }
8449
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008450 /* NOTE: the following code can't call back into Python code,
8451 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008452 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008453
Tim Peters05eba1f2004-08-27 21:32:02 +00008454 seqlen = PySequence_Fast_GET_SIZE(fseq);
8455 /* If empty sequence, return u"". */
8456 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008459 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008460 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008461 /* If singleton sequence with an exact Unicode, return that. */
8462 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 item = items[0];
8464 if (PyUnicode_CheckExact(item)) {
8465 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 goto Done;
8468 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008469 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008470 else {
8471 /* Set up sep and seplen */
8472 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 /* fall back to a blank space separator */
8474 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008475 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008477 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008478 else {
8479 if (!PyUnicode_Check(separator)) {
8480 PyErr_Format(PyExc_TypeError,
8481 "separator: expected str instance,"
8482 " %.80s found",
8483 Py_TYPE(separator)->tp_name);
8484 goto onError;
8485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 if (PyUnicode_READY(separator) == -1)
8487 goto onError;
8488 sep = separator;
8489 seplen = PyUnicode_GET_LENGTH(separator);
8490 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8491 /* inc refcount to keep this code path symetric with the
8492 above case of a blank separator */
8493 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008494 }
8495 }
8496
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008497 /* There are at least two things to join, or else we have a subclass
8498 * of str in the sequence.
8499 * Do a pre-pass to figure out the total amount of space we'll
8500 * need (sz), and see whether all argument are strings.
8501 */
8502 sz = 0;
8503 for (i = 0; i < seqlen; i++) {
8504 const Py_ssize_t old_sz = sz;
8505 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 if (!PyUnicode_Check(item)) {
8507 PyErr_Format(PyExc_TypeError,
8508 "sequence item %zd: expected str instance,"
8509 " %.80s found",
8510 i, Py_TYPE(item)->tp_name);
8511 goto onError;
8512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 if (PyUnicode_READY(item) == -1)
8514 goto onError;
8515 sz += PyUnicode_GET_LENGTH(item);
8516 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8517 if (item_maxchar > maxchar)
8518 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008519 if (i != 0)
8520 sz += seplen;
8521 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8522 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008524 goto onError;
8525 }
8526 }
Tim Petersced69f82003-09-16 20:30:58 +00008527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008529 if (res == NULL)
8530 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008531
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008532 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008534 Py_ssize_t itemlen;
8535 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* Copy item, and maybe the separator. */
8538 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008539 if (PyUnicode_CopyCharacters(res, res_offset,
8540 sep, 0, seplen) < 0)
8541 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008544 if (PyUnicode_CopyCharacters(res, res_offset,
8545 item, 0, itemlen) < 0)
8546 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008550
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008552 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 Py_XDECREF(sep);
8554 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008557 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008559 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560 return NULL;
8561}
8562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563#define FILL(kind, data, value, start, length) \
8564 do { \
8565 Py_ssize_t i_ = 0; \
8566 assert(kind != PyUnicode_WCHAR_KIND); \
8567 switch ((kind)) { \
8568 case PyUnicode_1BYTE_KIND: { \
8569 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8570 memset(to_, (unsigned char)value, length); \
8571 break; \
8572 } \
8573 case PyUnicode_2BYTE_KIND: { \
8574 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8575 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8576 break; \
8577 } \
8578 default: { \
8579 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8580 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8581 break; \
8582 } \
8583 } \
8584 } while (0)
8585
Alexander Belopolsky40018472011-02-26 01:02:56 +00008586static PyUnicodeObject *
8587pad(PyUnicodeObject *self,
8588 Py_ssize_t left,
8589 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 PyObject *u;
8593 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008594 int kind;
8595 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
8597 if (left < 0)
8598 left = 0;
8599 if (right < 0)
8600 right = 0;
8601
Tim Peters7a29bd52001-09-12 03:03:31 +00008602 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 Py_INCREF(self);
8604 return self;
8605 }
8606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8608 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008609 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8610 return NULL;
8611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8613 if (fill > maxchar)
8614 maxchar = fill;
8615 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008616 if (!u)
8617 return NULL;
8618
8619 kind = PyUnicode_KIND(u);
8620 data = PyUnicode_DATA(u);
8621 if (left)
8622 FILL(kind, data, fill, 0, left);
8623 if (right)
8624 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008625 if (PyUnicode_CopyCharacters(u, left,
8626 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008627 _PyUnicode_LENGTH(self)) < 0)
8628 {
8629 Py_DECREF(u);
8630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 }
8632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637PyObject *
8638PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641
8642 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 switch(PyUnicode_KIND(string)) {
8647 case PyUnicode_1BYTE_KIND:
8648 list = ucs1lib_splitlines(
8649 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8650 PyUnicode_GET_LENGTH(string), keepends);
8651 break;
8652 case PyUnicode_2BYTE_KIND:
8653 list = ucs2lib_splitlines(
8654 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8655 PyUnicode_GET_LENGTH(string), keepends);
8656 break;
8657 case PyUnicode_4BYTE_KIND:
8658 list = ucs4lib_splitlines(
8659 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8660 PyUnicode_GET_LENGTH(string), keepends);
8661 break;
8662 default:
8663 assert(0);
8664 list = 0;
8665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 Py_DECREF(string);
8667 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668}
8669
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670static PyObject *
8671split(PyUnicodeObject *self,
8672 PyUnicodeObject *substring,
8673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 int kind1, kind2, kind;
8676 void *buf1, *buf2;
8677 Py_ssize_t len1, len2;
8678 PyObject* out;
8679
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008681 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 if (PyUnicode_READY(self) == -1)
8684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 if (substring == NULL)
8687 switch(PyUnicode_KIND(self)) {
8688 case PyUnicode_1BYTE_KIND:
8689 return ucs1lib_split_whitespace(
8690 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8691 PyUnicode_GET_LENGTH(self), maxcount
8692 );
8693 case PyUnicode_2BYTE_KIND:
8694 return ucs2lib_split_whitespace(
8695 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8696 PyUnicode_GET_LENGTH(self), maxcount
8697 );
8698 case PyUnicode_4BYTE_KIND:
8699 return ucs4lib_split_whitespace(
8700 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8701 PyUnicode_GET_LENGTH(self), maxcount
8702 );
8703 default:
8704 assert(0);
8705 return NULL;
8706 }
8707
8708 if (PyUnicode_READY(substring) == -1)
8709 return NULL;
8710
8711 kind1 = PyUnicode_KIND(self);
8712 kind2 = PyUnicode_KIND(substring);
8713 kind = kind1 > kind2 ? kind1 : kind2;
8714 buf1 = PyUnicode_DATA(self);
8715 buf2 = PyUnicode_DATA(substring);
8716 if (kind1 != kind)
8717 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8718 if (!buf1)
8719 return NULL;
8720 if (kind2 != kind)
8721 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8722 if (!buf2) {
8723 if (kind1 != kind) PyMem_Free(buf1);
8724 return NULL;
8725 }
8726 len1 = PyUnicode_GET_LENGTH(self);
8727 len2 = PyUnicode_GET_LENGTH(substring);
8728
8729 switch(kind) {
8730 case PyUnicode_1BYTE_KIND:
8731 out = ucs1lib_split(
8732 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8733 break;
8734 case PyUnicode_2BYTE_KIND:
8735 out = ucs2lib_split(
8736 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8737 break;
8738 case PyUnicode_4BYTE_KIND:
8739 out = ucs4lib_split(
8740 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8741 break;
8742 default:
8743 out = NULL;
8744 }
8745 if (kind1 != kind)
8746 PyMem_Free(buf1);
8747 if (kind2 != kind)
8748 PyMem_Free(buf2);
8749 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750}
8751
Alexander Belopolsky40018472011-02-26 01:02:56 +00008752static PyObject *
8753rsplit(PyUnicodeObject *self,
8754 PyUnicodeObject *substring,
8755 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 int kind1, kind2, kind;
8758 void *buf1, *buf2;
8759 Py_ssize_t len1, len2;
8760 PyObject* out;
8761
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008762 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008763 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 if (PyUnicode_READY(self) == -1)
8766 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 if (substring == NULL)
8769 switch(PyUnicode_KIND(self)) {
8770 case PyUnicode_1BYTE_KIND:
8771 return ucs1lib_rsplit_whitespace(
8772 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8773 PyUnicode_GET_LENGTH(self), maxcount
8774 );
8775 case PyUnicode_2BYTE_KIND:
8776 return ucs2lib_rsplit_whitespace(
8777 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8778 PyUnicode_GET_LENGTH(self), maxcount
8779 );
8780 case PyUnicode_4BYTE_KIND:
8781 return ucs4lib_rsplit_whitespace(
8782 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8783 PyUnicode_GET_LENGTH(self), maxcount
8784 );
8785 default:
8786 assert(0);
8787 return NULL;
8788 }
8789
8790 if (PyUnicode_READY(substring) == -1)
8791 return NULL;
8792
8793 kind1 = PyUnicode_KIND(self);
8794 kind2 = PyUnicode_KIND(substring);
8795 kind = kind1 > kind2 ? kind1 : kind2;
8796 buf1 = PyUnicode_DATA(self);
8797 buf2 = PyUnicode_DATA(substring);
8798 if (kind1 != kind)
8799 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8800 if (!buf1)
8801 return NULL;
8802 if (kind2 != kind)
8803 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8804 if (!buf2) {
8805 if (kind1 != kind) PyMem_Free(buf1);
8806 return NULL;
8807 }
8808 len1 = PyUnicode_GET_LENGTH(self);
8809 len2 = PyUnicode_GET_LENGTH(substring);
8810
8811 switch(kind) {
8812 case PyUnicode_1BYTE_KIND:
8813 out = ucs1lib_rsplit(
8814 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8815 break;
8816 case PyUnicode_2BYTE_KIND:
8817 out = ucs2lib_rsplit(
8818 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8819 break;
8820 case PyUnicode_4BYTE_KIND:
8821 out = ucs4lib_rsplit(
8822 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8823 break;
8824 default:
8825 out = NULL;
8826 }
8827 if (kind1 != kind)
8828 PyMem_Free(buf1);
8829 if (kind2 != kind)
8830 PyMem_Free(buf2);
8831 return out;
8832}
8833
8834static Py_ssize_t
8835anylib_find(int kind, void *buf1, Py_ssize_t len1,
8836 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8837{
8838 switch(kind) {
8839 case PyUnicode_1BYTE_KIND:
8840 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8841 case PyUnicode_2BYTE_KIND:
8842 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8843 case PyUnicode_4BYTE_KIND:
8844 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8845 }
8846 assert(0);
8847 return -1;
8848}
8849
8850static Py_ssize_t
8851anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8852 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8853{
8854 switch(kind) {
8855 case PyUnicode_1BYTE_KIND:
8856 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8857 case PyUnicode_2BYTE_KIND:
8858 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8859 case PyUnicode_4BYTE_KIND:
8860 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8861 }
8862 assert(0);
8863 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008864}
8865
Alexander Belopolsky40018472011-02-26 01:02:56 +00008866static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867replace(PyObject *self, PyObject *str1,
8868 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 PyObject *u;
8871 char *sbuf = PyUnicode_DATA(self);
8872 char *buf1 = PyUnicode_DATA(str1);
8873 char *buf2 = PyUnicode_DATA(str2);
8874 int srelease = 0, release1 = 0, release2 = 0;
8875 int skind = PyUnicode_KIND(self);
8876 int kind1 = PyUnicode_KIND(str1);
8877 int kind2 = PyUnicode_KIND(str2);
8878 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8879 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8880 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881
8882 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008885 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 if (skind < kind1)
8888 /* substring too wide to be present */
8889 goto nothing;
8890
8891 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008892 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008893 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008895 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008897 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 Py_UCS4 u1, u2, maxchar;
8899 int mayshrink, rkind;
8900 u1 = PyUnicode_READ_CHAR(str1, 0);
8901 if (!findchar(sbuf, PyUnicode_KIND(self),
8902 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008903 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 u2 = PyUnicode_READ_CHAR(str2, 0);
8905 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8906 /* Replacing u1 with u2 may cause a maxchar reduction in the
8907 result string. */
8908 mayshrink = maxchar > 127;
8909 if (u2 > maxchar) {
8910 maxchar = u2;
8911 mayshrink = 0;
8912 }
8913 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008914 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008916 if (PyUnicode_CopyCharacters(u, 0,
8917 (PyObject*)self, 0, slen) < 0)
8918 {
8919 Py_DECREF(u);
8920 return NULL;
8921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 rkind = PyUnicode_KIND(u);
8923 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8924 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008925 if (--maxcount < 0)
8926 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 if (mayshrink) {
8930 PyObject *tmp = u;
8931 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8932 PyUnicode_GET_LENGTH(tmp));
8933 Py_DECREF(tmp);
8934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 int rkind = skind;
8937 char *res;
8938 if (kind1 < rkind) {
8939 /* widen substring */
8940 buf1 = _PyUnicode_AsKind(str1, rkind);
8941 if (!buf1) goto error;
8942 release1 = 1;
8943 }
8944 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945 if (i < 0)
8946 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 if (rkind > kind2) {
8948 /* widen replacement */
8949 buf2 = _PyUnicode_AsKind(str2, rkind);
8950 if (!buf2) goto error;
8951 release2 = 1;
8952 }
8953 else if (rkind < kind2) {
8954 /* widen self and buf1 */
8955 rkind = kind2;
8956 if (release1) PyMem_Free(buf1);
8957 sbuf = _PyUnicode_AsKind(self, rkind);
8958 if (!sbuf) goto error;
8959 srelease = 1;
8960 buf1 = _PyUnicode_AsKind(str1, rkind);
8961 if (!buf1) goto error;
8962 release1 = 1;
8963 }
8964 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8965 if (!res) {
8966 PyErr_NoMemory();
8967 goto error;
8968 }
8969 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008970 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8972 buf2,
8973 PyUnicode_KIND_SIZE(rkind, len2));
8974 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008975
8976 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8978 slen-i,
8979 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008980 if (i == -1)
8981 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8983 buf2,
8984 PyUnicode_KIND_SIZE(rkind, len2));
8985 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987
8988 u = PyUnicode_FromKindAndData(rkind, res, slen);
8989 PyMem_Free(res);
8990 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 Py_ssize_t n, i, j, ires;
8995 Py_ssize_t product, new_size;
8996 int rkind = skind;
8997 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 if (kind1 < rkind) {
9000 buf1 = _PyUnicode_AsKind(str1, rkind);
9001 if (!buf1) goto error;
9002 release1 = 1;
9003 }
9004 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005 if (n == 0)
9006 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (kind2 < rkind) {
9008 buf2 = _PyUnicode_AsKind(str2, rkind);
9009 if (!buf2) goto error;
9010 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 else if (kind2 > rkind) {
9013 rkind = kind2;
9014 sbuf = _PyUnicode_AsKind(self, rkind);
9015 if (!sbuf) goto error;
9016 srelease = 1;
9017 if (release1) PyMem_Free(buf1);
9018 buf1 = _PyUnicode_AsKind(str1, rkind);
9019 if (!buf1) goto error;
9020 release1 = 1;
9021 }
9022 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9023 PyUnicode_GET_LENGTH(str1))); */
9024 product = n * (len2-len1);
9025 if ((product / (len2-len1)) != n) {
9026 PyErr_SetString(PyExc_OverflowError,
9027 "replace string is too long");
9028 goto error;
9029 }
9030 new_size = slen + product;
9031 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9032 PyErr_SetString(PyExc_OverflowError,
9033 "replace string is too long");
9034 goto error;
9035 }
9036 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9037 if (!res)
9038 goto error;
9039 ires = i = 0;
9040 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009041 while (n-- > 0) {
9042 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 j = anylib_find(rkind,
9044 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9045 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009046 if (j == -1)
9047 break;
9048 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9051 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9052 PyUnicode_KIND_SIZE(rkind, j-i));
9053 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009054 }
9055 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 if (len2 > 0) {
9057 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9058 buf2,
9059 PyUnicode_KIND_SIZE(rkind, len2));
9060 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9067 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9068 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009069 } else {
9070 /* interleave */
9071 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9073 buf2,
9074 PyUnicode_KIND_SIZE(rkind, len2));
9075 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009076 if (--n <= 0)
9077 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9079 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9080 PyUnicode_KIND_SIZE(rkind, 1));
9081 ires++;
9082 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9085 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9086 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009089 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 if (srelease)
9092 PyMem_FREE(sbuf);
9093 if (release1)
9094 PyMem_FREE(buf1);
9095 if (release2)
9096 PyMem_FREE(buf2);
9097 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009098
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009100 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 if (srelease)
9102 PyMem_FREE(sbuf);
9103 if (release1)
9104 PyMem_FREE(buf1);
9105 if (release2)
9106 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009107 if (PyUnicode_CheckExact(self)) {
9108 Py_INCREF(self);
9109 return (PyObject *) self;
9110 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009111 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 error:
9113 if (srelease && sbuf)
9114 PyMem_FREE(sbuf);
9115 if (release1 && buf1)
9116 PyMem_FREE(buf1);
9117 if (release2 && buf2)
9118 PyMem_FREE(buf2);
9119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120}
9121
9122/* --- Unicode Object Methods --------------------------------------------- */
9123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009124PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126\n\
9127Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009128characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129
9130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009131unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 return fixup(self, fixtitle);
9134}
9135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009136PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138\n\
9139Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009140have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141
9142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009143unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 return fixup(self, fixcapitalize);
9146}
9147
9148#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009149PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151\n\
9152Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009153normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154
9155static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009156unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157{
9158 PyObject *list;
9159 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009160 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 /* Split into words */
9163 list = split(self, NULL, -1);
9164 if (!list)
9165 return NULL;
9166
9167 /* Capitalize each word */
9168 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9169 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 if (item == NULL)
9172 goto onError;
9173 Py_DECREF(PyList_GET_ITEM(list, i));
9174 PyList_SET_ITEM(list, i, item);
9175 }
9176
9177 /* Join the words to form a new string */
9178 item = PyUnicode_Join(NULL, list);
9179
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 Py_DECREF(list);
9182 return (PyObject *)item;
9183}
9184#endif
9185
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009186/* Argument converter. Coerces to a single unicode character */
9187
9188static int
9189convert_uc(PyObject *obj, void *addr)
9190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009193
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 uniobj = PyUnicode_FromObject(obj);
9195 if (uniobj == NULL) {
9196 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 return 0;
9199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009201 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 Py_DECREF(uniobj);
9204 return 0;
9205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009207 Py_DECREF(uniobj);
9208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009209}
9210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009211PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009214Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009215done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216
9217static PyObject *
9218unicode_center(PyUnicodeObject *self, PyObject *args)
9219{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009220 Py_ssize_t marg, left;
9221 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 Py_UCS4 fillchar = ' ';
9223
Victor Stinnere9a29352011-10-01 02:14:59 +02009224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226
Victor Stinnere9a29352011-10-01 02:14:59 +02009227 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return NULL;
9229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 Py_INCREF(self);
9232 return (PyObject*) self;
9233 }
9234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 left = marg / 2 + (marg & width & 1);
9237
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009238 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239}
9240
Marc-André Lemburge5034372000-08-08 08:04:29 +00009241#if 0
9242
9243/* This code should go into some future Unicode collation support
9244 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009245 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009246
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009247/* speedy UTF-16 code point order comparison */
9248/* gleaned from: */
9249/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9250
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009251static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009252{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009253 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009254 0, 0, 0, 0, 0, 0, 0, 0,
9255 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009256 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009257};
9258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259static int
9260unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009262 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009263
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 Py_UNICODE *s1 = str1->str;
9265 Py_UNICODE *s2 = str2->str;
9266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 len1 = str1->_base._base.length;
9268 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009269
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009271 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009272
9273 c1 = *s1++;
9274 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009275
Benjamin Peterson29060642009-01-31 22:14:21 +00009276 if (c1 > (1<<11) * 26)
9277 c1 += utf16Fixup[c1>>11];
9278 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009279 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009280 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009281
9282 if (c1 != c2)
9283 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009284
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009285 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 }
9287
9288 return (len1 < len2) ? -1 : (len1 != len2);
9289}
9290
Marc-André Lemburge5034372000-08-08 08:04:29 +00009291#else
9292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293/* This function assumes that str1 and str2 are readied by the caller. */
9294
Marc-André Lemburge5034372000-08-08 08:04:29 +00009295static int
9296unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 int kind1, kind2;
9299 void *data1, *data2;
9300 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 kind1 = PyUnicode_KIND(str1);
9303 kind2 = PyUnicode_KIND(str2);
9304 data1 = PyUnicode_DATA(str1);
9305 data2 = PyUnicode_DATA(str2);
9306 len1 = PyUnicode_GET_LENGTH(str1);
9307 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 for (i = 0; i < len1 && i < len2; ++i) {
9310 Py_UCS4 c1, c2;
9311 c1 = PyUnicode_READ(kind1, data1, i);
9312 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009313
9314 if (c1 != c2)
9315 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009316 }
9317
9318 return (len1 < len2) ? -1 : (len1 != len2);
9319}
9320
9321#endif
9322
Alexander Belopolsky40018472011-02-26 01:02:56 +00009323int
9324PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9327 if (PyUnicode_READY(left) == -1 ||
9328 PyUnicode_READY(right) == -1)
9329 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009330 return unicode_compare((PyUnicodeObject *)left,
9331 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009333 PyErr_Format(PyExc_TypeError,
9334 "Can't compare %.100s and %.100s",
9335 left->ob_type->tp_name,
9336 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337 return -1;
9338}
9339
Martin v. Löwis5b222132007-06-10 09:51:05 +00009340int
9341PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 Py_ssize_t i;
9344 int kind;
9345 void *data;
9346 Py_UCS4 chr;
9347
Martin v. Löwis5b222132007-06-10 09:51:05 +00009348 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 if (PyUnicode_READY(uni) == -1)
9350 return -1;
9351 kind = PyUnicode_KIND(uni);
9352 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009353 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9355 if (chr != str[i])
9356 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009357 /* This check keeps Python strings that end in '\0' from comparing equal
9358 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009361 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009363 return 0;
9364}
9365
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009366
Benjamin Peterson29060642009-01-31 22:14:21 +00009367#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009369
Alexander Belopolsky40018472011-02-26 01:02:56 +00009370PyObject *
9371PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009372{
9373 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009374
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009375 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9376 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (PyUnicode_READY(left) == -1 ||
9378 PyUnicode_READY(right) == -1)
9379 return NULL;
9380 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9381 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009382 if (op == Py_EQ) {
9383 Py_INCREF(Py_False);
9384 return Py_False;
9385 }
9386 if (op == Py_NE) {
9387 Py_INCREF(Py_True);
9388 return Py_True;
9389 }
9390 }
9391 if (left == right)
9392 result = 0;
9393 else
9394 result = unicode_compare((PyUnicodeObject *)left,
9395 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009396
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009397 /* Convert the return value to a Boolean */
9398 switch (op) {
9399 case Py_EQ:
9400 v = TEST_COND(result == 0);
9401 break;
9402 case Py_NE:
9403 v = TEST_COND(result != 0);
9404 break;
9405 case Py_LE:
9406 v = TEST_COND(result <= 0);
9407 break;
9408 case Py_GE:
9409 v = TEST_COND(result >= 0);
9410 break;
9411 case Py_LT:
9412 v = TEST_COND(result == -1);
9413 break;
9414 case Py_GT:
9415 v = TEST_COND(result == 1);
9416 break;
9417 default:
9418 PyErr_BadArgument();
9419 return NULL;
9420 }
9421 Py_INCREF(v);
9422 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009424
Brian Curtindfc80e32011-08-10 20:28:54 -05009425 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009426}
9427
Alexander Belopolsky40018472011-02-26 01:02:56 +00009428int
9429PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009430{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009431 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 int kind1, kind2, kind;
9433 void *buf1, *buf2;
9434 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009435 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009436
9437 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009438 sub = PyUnicode_FromObject(element);
9439 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 PyErr_Format(PyExc_TypeError,
9441 "'in <string>' requires string as left operand, not %s",
9442 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009443 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (PyUnicode_READY(sub) == -1)
9446 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009447
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009449 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009450 Py_DECREF(sub);
9451 return -1;
9452 }
9453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 kind1 = PyUnicode_KIND(str);
9455 kind2 = PyUnicode_KIND(sub);
9456 kind = kind1 > kind2 ? kind1 : kind2;
9457 buf1 = PyUnicode_DATA(str);
9458 buf2 = PyUnicode_DATA(sub);
9459 if (kind1 != kind)
9460 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9461 if (!buf1) {
9462 Py_DECREF(sub);
9463 return -1;
9464 }
9465 if (kind2 != kind)
9466 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9467 if (!buf2) {
9468 Py_DECREF(sub);
9469 if (kind1 != kind) PyMem_Free(buf1);
9470 return -1;
9471 }
9472 len1 = PyUnicode_GET_LENGTH(str);
9473 len2 = PyUnicode_GET_LENGTH(sub);
9474
9475 switch(kind) {
9476 case PyUnicode_1BYTE_KIND:
9477 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9478 break;
9479 case PyUnicode_2BYTE_KIND:
9480 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9481 break;
9482 case PyUnicode_4BYTE_KIND:
9483 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9484 break;
9485 default:
9486 result = -1;
9487 assert(0);
9488 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009489
9490 Py_DECREF(str);
9491 Py_DECREF(sub);
9492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 if (kind1 != kind)
9494 PyMem_Free(buf1);
9495 if (kind2 != kind)
9496 PyMem_Free(buf2);
9497
Guido van Rossum403d68b2000-03-13 15:55:09 +00009498 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009499}
9500
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501/* Concat to string or Unicode object giving a new Unicode object. */
9502
Alexander Belopolsky40018472011-02-26 01:02:56 +00009503PyObject *
9504PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 PyObject *u = NULL, *v = NULL, *w;
9507 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
9509 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516
9517 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 }
9526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009528 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 w = PyUnicode_New(
9532 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9533 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009536 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9537 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009538 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009539 v, 0,
9540 PyUnicode_GET_LENGTH(v)) < 0)
9541 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542 Py_DECREF(u);
9543 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 Py_XDECREF(u);
9548 Py_XDECREF(v);
9549 return NULL;
9550}
9551
Walter Dörwald1ab83302007-05-18 17:15:44 +00009552void
9553PyUnicode_Append(PyObject **pleft, PyObject *right)
9554{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009555 PyObject *new;
9556 if (*pleft == NULL)
9557 return;
9558 if (right == NULL || !PyUnicode_Check(*pleft)) {
9559 Py_DECREF(*pleft);
9560 *pleft = NULL;
9561 return;
9562 }
9563 new = PyUnicode_Concat(*pleft, right);
9564 Py_DECREF(*pleft);
9565 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009566}
9567
9568void
9569PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9570{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 PyUnicode_Append(pleft, right);
9572 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009573}
9574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009575PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009578Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009579string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009580interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
9582static PyObject *
9583unicode_count(PyUnicodeObject *self, PyObject *args)
9584{
9585 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009586 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009587 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 int kind1, kind2, kind;
9590 void *buf1, *buf2;
9591 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592
Jesus Ceaac451502011-04-20 17:09:23 +02009593 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9594 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 kind1 = PyUnicode_KIND(self);
9598 kind2 = PyUnicode_KIND(substring);
9599 kind = kind1 > kind2 ? kind1 : kind2;
9600 buf1 = PyUnicode_DATA(self);
9601 buf2 = PyUnicode_DATA(substring);
9602 if (kind1 != kind)
9603 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9604 if (!buf1) {
9605 Py_DECREF(substring);
9606 return NULL;
9607 }
9608 if (kind2 != kind)
9609 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9610 if (!buf2) {
9611 Py_DECREF(substring);
9612 if (kind1 != kind) PyMem_Free(buf1);
9613 return NULL;
9614 }
9615 len1 = PyUnicode_GET_LENGTH(self);
9616 len2 = PyUnicode_GET_LENGTH(substring);
9617
9618 ADJUST_INDICES(start, end, len1);
9619 switch(kind) {
9620 case PyUnicode_1BYTE_KIND:
9621 iresult = ucs1lib_count(
9622 ((Py_UCS1*)buf1) + start, end - start,
9623 buf2, len2, PY_SSIZE_T_MAX
9624 );
9625 break;
9626 case PyUnicode_2BYTE_KIND:
9627 iresult = ucs2lib_count(
9628 ((Py_UCS2*)buf1) + start, end - start,
9629 buf2, len2, PY_SSIZE_T_MAX
9630 );
9631 break;
9632 case PyUnicode_4BYTE_KIND:
9633 iresult = ucs4lib_count(
9634 ((Py_UCS4*)buf1) + start, end - start,
9635 buf2, len2, PY_SSIZE_T_MAX
9636 );
9637 break;
9638 default:
9639 assert(0); iresult = 0;
9640 }
9641
9642 result = PyLong_FromSsize_t(iresult);
9643
9644 if (kind1 != kind)
9645 PyMem_Free(buf1);
9646 if (kind2 != kind)
9647 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648
9649 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009650
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 return result;
9652}
9653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009654PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009655 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009657Encode S using the codec registered for encoding. Default encoding\n\
9658is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009659handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009660a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9661'xmlcharrefreplace' as well as any other name registered with\n\
9662codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663
9664static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009665unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009667 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 char *encoding = NULL;
9669 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009670
Benjamin Peterson308d6372009-09-18 21:42:35 +00009671 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9672 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009674 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009675}
9676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009677PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679\n\
9680Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009681If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682
9683static PyObject*
9684unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9685{
9686 Py_UNICODE *e;
9687 Py_UNICODE *p;
9688 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009689 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 PyUnicodeObject *u;
9692 int tabsize = 8;
9693
9694 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9698 return NULL;
9699
Thomas Wouters7e474022000-07-16 12:04:32 +00009700 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009701 i = 0; /* chars up to and including most recent \n or \r */
9702 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9704 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 if (tabsize > 0) {
9707 incr = tabsize - (j % tabsize); /* cannot overflow */
9708 if (j > PY_SSIZE_T_MAX - incr)
9709 goto overflow1;
9710 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 if (j > PY_SSIZE_T_MAX - 1)
9715 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 j++;
9717 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 if (i > PY_SSIZE_T_MAX - j)
9719 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009721 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 }
9723 }
9724
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009725 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009727
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 /* Second pass: create output string and fill it */
9729 u = _PyUnicode_New(i + j);
9730 if (!u)
9731 return NULL;
9732
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009733 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 q = _PyUnicode_WSTR(u); /* next output char */
9735 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 if (tabsize > 0) {
9740 i = tabsize - (j % tabsize);
9741 j += i;
9742 while (i--) {
9743 if (q >= qe)
9744 goto overflow2;
9745 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009746 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009747 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009748 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 else {
9750 if (q >= qe)
9751 goto overflow2;
9752 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009753 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754 if (*p == '\n' || *p == '\r')
9755 j = 0;
9756 }
9757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (PyUnicode_READY(u) == -1) {
9759 Py_DECREF(u);
9760 return NULL;
9761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009763
9764 overflow2:
9765 Py_DECREF(u);
9766 overflow1:
9767 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769}
9770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009771PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773\n\
9774Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009775such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776arguments start and end are interpreted as in slice notation.\n\
9777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009778Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
9780static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782{
Jesus Ceaac451502011-04-20 17:09:23 +02009783 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009784 Py_ssize_t start;
9785 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009786 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787
Jesus Ceaac451502011-04-20 17:09:23 +02009788 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9789 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 if (PyUnicode_READY(self) == -1)
9793 return NULL;
9794 if (PyUnicode_READY(substring) == -1)
9795 return NULL;
9796
9797 result = any_find_slice(
9798 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9799 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009800 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801
9802 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (result == -2)
9805 return NULL;
9806
Christian Heimes217cfd12007-12-02 14:31:20 +00009807 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808}
9809
9810static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009811unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 Py_UCS4 ch;
9814
9815 if (PyUnicode_READY(self) == -1)
9816 return NULL;
9817 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818 PyErr_SetString(PyExc_IndexError, "string index out of range");
9819 return NULL;
9820 }
9821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9823 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824}
9825
Guido van Rossumc2504932007-09-18 19:42:40 +00009826/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009827 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009828static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009829unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Guido van Rossumc2504932007-09-18 19:42:40 +00009831 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009832 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 if (_PyUnicode_HASH(self) != -1)
9835 return _PyUnicode_HASH(self);
9836 if (PyUnicode_READY(self) == -1)
9837 return -1;
9838 len = PyUnicode_GET_LENGTH(self);
9839
9840 /* The hash function as a macro, gets expanded three times below. */
9841#define HASH(P) \
9842 x = (Py_uhash_t)*P << 7; \
9843 while (--len >= 0) \
9844 x = (1000003*x) ^ (Py_uhash_t)*P++;
9845
9846 switch (PyUnicode_KIND(self)) {
9847 case PyUnicode_1BYTE_KIND: {
9848 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9849 HASH(c);
9850 break;
9851 }
9852 case PyUnicode_2BYTE_KIND: {
9853 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9854 HASH(s);
9855 break;
9856 }
9857 default: {
9858 Py_UCS4 *l;
9859 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9860 "Impossible switch case in unicode_hash");
9861 l = PyUnicode_4BYTE_DATA(self);
9862 HASH(l);
9863 break;
9864 }
9865 }
9866 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9867
Guido van Rossumc2504932007-09-18 19:42:40 +00009868 if (x == -1)
9869 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009871 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009878Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879
9880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009883 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009884 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009885 Py_ssize_t start;
9886 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
Jesus Ceaac451502011-04-20 17:09:23 +02009888 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9889 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (PyUnicode_READY(self) == -1)
9893 return NULL;
9894 if (PyUnicode_READY(substring) == -1)
9895 return NULL;
9896
9897 result = any_find_slice(
9898 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9899 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901
9902 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (result == -2)
9905 return NULL;
9906
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 if (result < 0) {
9908 PyErr_SetString(PyExc_ValueError, "substring not found");
9909 return NULL;
9910 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009911
Christian Heimes217cfd12007-12-02 14:31:20 +00009912 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913}
9914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009915PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009916 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009918Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009919at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920
9921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009922unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 Py_ssize_t i, length;
9925 int kind;
9926 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927 int cased;
9928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 if (PyUnicode_READY(self) == -1)
9930 return NULL;
9931 length = PyUnicode_GET_LENGTH(self);
9932 kind = PyUnicode_KIND(self);
9933 data = PyUnicode_DATA(self);
9934
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 if (length == 1)
9937 return PyBool_FromLong(
9938 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009940 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009943
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 for (i = 0; i < length; i++) {
9946 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009947
Benjamin Peterson29060642009-01-31 22:14:21 +00009948 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9949 return PyBool_FromLong(0);
9950 else if (!cased && Py_UNICODE_ISLOWER(ch))
9951 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009953 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954}
9955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009956PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009959Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009960at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961
9962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009963unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 Py_ssize_t i, length;
9966 int kind;
9967 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968 int cased;
9969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 if (PyUnicode_READY(self) == -1)
9971 return NULL;
9972 length = PyUnicode_GET_LENGTH(self);
9973 kind = PyUnicode_KIND(self);
9974 data = PyUnicode_DATA(self);
9975
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 if (length == 1)
9978 return PyBool_FromLong(
9979 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009981 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009984
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 for (i = 0; i < length; i++) {
9987 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009988
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9990 return PyBool_FromLong(0);
9991 else if (!cased && Py_UNICODE_ISUPPER(ch))
9992 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009994 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995}
9996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009997PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010000Return True if S is a titlecased string and there is at least one\n\
10001character in S, i.e. upper- and titlecase characters may only\n\
10002follow uncased characters and lowercase characters only cased ones.\n\
10003Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004
10005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010006unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 Py_ssize_t i, length;
10009 int kind;
10010 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011 int cased, previous_is_cased;
10012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (PyUnicode_READY(self) == -1)
10014 return NULL;
10015 length = PyUnicode_GET_LENGTH(self);
10016 kind = PyUnicode_KIND(self);
10017 data = PyUnicode_DATA(self);
10018
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (length == 1) {
10021 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10022 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10023 (Py_UNICODE_ISUPPER(ch) != 0));
10024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010026 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010029
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 cased = 0;
10031 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 for (i = 0; i < length; i++) {
10033 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010034
Benjamin Peterson29060642009-01-31 22:14:21 +000010035 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10036 if (previous_is_cased)
10037 return PyBool_FromLong(0);
10038 previous_is_cased = 1;
10039 cased = 1;
10040 }
10041 else if (Py_UNICODE_ISLOWER(ch)) {
10042 if (!previous_is_cased)
10043 return PyBool_FromLong(0);
10044 previous_is_cased = 1;
10045 cased = 1;
10046 }
10047 else
10048 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010050 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051}
10052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010053PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010054 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010056Return True if all characters in S are whitespace\n\
10057and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058
10059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010060unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 Py_ssize_t i, length;
10063 int kind;
10064 void *data;
10065
10066 if (PyUnicode_READY(self) == -1)
10067 return NULL;
10068 length = PyUnicode_GET_LENGTH(self);
10069 kind = PyUnicode_KIND(self);
10070 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (length == 1)
10074 return PyBool_FromLong(
10075 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010077 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 for (i = 0; i < length; i++) {
10082 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010083 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010084 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010086 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087}
10088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010089PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010092Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010093and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094
10095static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010096unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 Py_ssize_t i, length;
10099 int kind;
10100 void *data;
10101
10102 if (PyUnicode_READY(self) == -1)
10103 return NULL;
10104 length = PyUnicode_GET_LENGTH(self);
10105 kind = PyUnicode_KIND(self);
10106 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010107
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010108 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (length == 1)
10110 return PyBool_FromLong(
10111 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010112
10113 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 for (i = 0; i < length; i++) {
10118 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010119 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010121 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010122}
10123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010124PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010127Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010128and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010129
10130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010131unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 int kind;
10134 void *data;
10135 Py_ssize_t len, i;
10136
10137 if (PyUnicode_READY(self) == -1)
10138 return NULL;
10139
10140 kind = PyUnicode_KIND(self);
10141 data = PyUnicode_DATA(self);
10142 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010143
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (len == 1) {
10146 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10147 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10148 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010149
10150 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 for (i = 0; i < len; i++) {
10155 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010156 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010158 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010159 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010160}
10161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010162PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010165Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010166False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
10168static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010169unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 Py_ssize_t i, length;
10172 int kind;
10173 void *data;
10174
10175 if (PyUnicode_READY(self) == -1)
10176 return NULL;
10177 length = PyUnicode_GET_LENGTH(self);
10178 kind = PyUnicode_KIND(self);
10179 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (length == 1)
10183 return PyBool_FromLong(
10184 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010186 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 for (i = 0; i < length; i++) {
10191 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010194 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195}
10196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010197PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010198 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010200Return True if all characters in S are digits\n\
10201and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202
10203static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010204unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 Py_ssize_t i, length;
10207 int kind;
10208 void *data;
10209
10210 if (PyUnicode_READY(self) == -1)
10211 return NULL;
10212 length = PyUnicode_GET_LENGTH(self);
10213 kind = PyUnicode_KIND(self);
10214 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (length == 1) {
10218 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10219 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010222 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 for (i = 0; i < length; i++) {
10227 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010230 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231}
10232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010233PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010234 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010236Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010237False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238
10239static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010240unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 Py_ssize_t i, length;
10243 int kind;
10244 void *data;
10245
10246 if (PyUnicode_READY(self) == -1)
10247 return NULL;
10248 length = PyUnicode_GET_LENGTH(self);
10249 kind = PyUnicode_KIND(self);
10250 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 if (length == 1)
10254 return PyBool_FromLong(
10255 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010257 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010259 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 for (i = 0; i < length; i++) {
10262 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010265 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266}
10267
Martin v. Löwis47383402007-08-15 07:32:56 +000010268int
10269PyUnicode_IsIdentifier(PyObject *self)
10270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 int kind;
10272 void *data;
10273 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010274 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 if (PyUnicode_READY(self) == -1) {
10277 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 }
10280
10281 /* Special case for empty strings */
10282 if (PyUnicode_GET_LENGTH(self) == 0)
10283 return 0;
10284 kind = PyUnicode_KIND(self);
10285 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010286
10287 /* PEP 3131 says that the first character must be in
10288 XID_Start and subsequent characters in XID_Continue,
10289 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010290 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010291 letters, digits, underscore). However, given the current
10292 definition of XID_Start and XID_Continue, it is sufficient
10293 to check just for these, except that _ must be allowed
10294 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010296 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010297 return 0;
10298
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010299 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010302 return 1;
10303}
10304
10305PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010307\n\
10308Return True if S is a valid identifier according\n\
10309to the language definition.");
10310
10311static PyObject*
10312unicode_isidentifier(PyObject *self)
10313{
10314 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10315}
10316
Georg Brandl559e5d72008-06-11 18:37:52 +000010317PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010319\n\
10320Return True if all characters in S are considered\n\
10321printable in repr() or S is empty, False otherwise.");
10322
10323static PyObject*
10324unicode_isprintable(PyObject *self)
10325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 Py_ssize_t i, length;
10327 int kind;
10328 void *data;
10329
10330 if (PyUnicode_READY(self) == -1)
10331 return NULL;
10332 length = PyUnicode_GET_LENGTH(self);
10333 kind = PyUnicode_KIND(self);
10334 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010335
10336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (length == 1)
10338 return PyBool_FromLong(
10339 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 for (i = 0; i < length; i++) {
10342 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010343 Py_RETURN_FALSE;
10344 }
10345 }
10346 Py_RETURN_TRUE;
10347}
10348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010349PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010350 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351\n\
10352Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010353iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354
10355static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010356unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010358 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359}
10360
Martin v. Löwis18e16552006-02-15 17:27:45 +000010361static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362unicode_length(PyUnicodeObject *self)
10363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 if (PyUnicode_READY(self) == -1)
10365 return -1;
10366 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367}
10368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010372Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010373done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
10375static PyObject *
10376unicode_ljust(PyUnicodeObject *self, PyObject *args)
10377{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010378 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 Py_UCS4 fillchar = ' ';
10380
10381 if (PyUnicode_READY(self) == -1)
10382 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010383
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010384 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 return NULL;
10386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 Py_INCREF(self);
10389 return (PyObject*) self;
10390 }
10391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393}
10394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399
10400static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010401unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403 return fixup(self, fixlower);
10404}
10405
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
10414
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010415/* externally visible for str.strip(unicode) */
10416PyObject *
10417_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 void *data;
10420 int kind;
10421 Py_ssize_t i, j, len;
10422 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10425 return NULL;
10426
10427 kind = PyUnicode_KIND(self);
10428 data = PyUnicode_DATA(self);
10429 len = PyUnicode_GET_LENGTH(self);
10430 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10431 PyUnicode_DATA(sepobj),
10432 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433
Benjamin Peterson14339b62009-01-31 16:36:08 +000010434 i = 0;
10435 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 while (i < len &&
10437 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010438 i++;
10439 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010440 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010441
Benjamin Peterson14339b62009-01-31 16:36:08 +000010442 j = len;
10443 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010444 do {
10445 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 } while (j >= i &&
10447 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010449 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010450
Victor Stinner12bab6d2011-10-01 01:53:49 +020010451 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452}
10453
10454PyObject*
10455PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10456{
10457 unsigned char *data;
10458 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010459 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460
Victor Stinnerde636f32011-10-01 03:55:54 +020010461 if (PyUnicode_READY(self) == -1)
10462 return NULL;
10463
10464 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10465
Victor Stinner12bab6d2011-10-01 01:53:49 +020010466 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010468 if (PyUnicode_CheckExact(self)) {
10469 Py_INCREF(self);
10470 return self;
10471 }
10472 else
10473 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 }
10475
Victor Stinner12bab6d2011-10-01 01:53:49 +020010476 length = end - start;
10477 if (length == 1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 return unicode_getitem((PyUnicodeObject*)self, start);
10479
Victor Stinnerde636f32011-10-01 03:55:54 +020010480 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010481 PyErr_SetString(PyExc_IndexError, "string index out of range");
10482 return NULL;
10483 }
10484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 kind = PyUnicode_KIND(self);
10486 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010487 return PyUnicode_FromKindAndData(kind,
10488 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010489 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010493do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 int kind;
10496 void *data;
10497 Py_ssize_t len, i, j;
10498
10499 if (PyUnicode_READY(self) == -1)
10500 return NULL;
10501
10502 kind = PyUnicode_KIND(self);
10503 data = PyUnicode_DATA(self);
10504 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010505
Benjamin Peterson14339b62009-01-31 16:36:08 +000010506 i = 0;
10507 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010509 i++;
10510 }
10511 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010512
Benjamin Peterson14339b62009-01-31 16:36:08 +000010513 j = len;
10514 if (striptype != LEFTSTRIP) {
10515 do {
10516 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010518 j++;
10519 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010520
Victor Stinner12bab6d2011-10-01 01:53:49 +020010521 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524
10525static PyObject *
10526do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10527{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010528 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10531 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532
Benjamin Peterson14339b62009-01-31 16:36:08 +000010533 if (sep != NULL && sep != Py_None) {
10534 if (PyUnicode_Check(sep))
10535 return _PyUnicode_XStrip(self, striptype, sep);
10536 else {
10537 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 "%s arg must be None or str",
10539 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540 return NULL;
10541 }
10542 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010543
Benjamin Peterson14339b62009-01-31 16:36:08 +000010544 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010545}
10546
10547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010548PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010549 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010550\n\
10551Return a copy of the string S with leading and trailing\n\
10552whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010553If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010554
10555static PyObject *
10556unicode_strip(PyUnicodeObject *self, PyObject *args)
10557{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010558 if (PyTuple_GET_SIZE(args) == 0)
10559 return do_strip(self, BOTHSTRIP); /* Common case */
10560 else
10561 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562}
10563
10564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010565PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010567\n\
10568Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010569If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010570
10571static PyObject *
10572unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10573{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 if (PyTuple_GET_SIZE(args) == 0)
10575 return do_strip(self, LEFTSTRIP); /* Common case */
10576 else
10577 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578}
10579
10580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010581PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583\n\
10584Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010585If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010586
10587static PyObject *
10588unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10589{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010590 if (PyTuple_GET_SIZE(args) == 0)
10591 return do_strip(self, RIGHTSTRIP); /* Common case */
10592 else
10593 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010594}
10595
10596
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010598unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599{
10600 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Georg Brandl222de0f2009-04-12 12:01:50 +000010603 if (len < 1) {
10604 Py_INCREF(unicode_empty);
10605 return (PyObject *)unicode_empty;
10606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Tim Peters7a29bd52001-09-12 03:03:31 +000010608 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 /* no repeat, return original string */
10610 Py_INCREF(str);
10611 return (PyObject*) str;
10612 }
Tim Peters8f422462000-09-09 06:13:41 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (PyUnicode_READY(str) == -1)
10615 return NULL;
10616
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010617 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010618 PyErr_SetString(PyExc_OverflowError,
10619 "repeated string is too long");
10620 return NULL;
10621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 if (!u)
10626 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010627 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (PyUnicode_GET_LENGTH(str) == 1) {
10630 const int kind = PyUnicode_KIND(str);
10631 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10632 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010633 if (kind == PyUnicode_1BYTE_KIND)
10634 memset(to, (unsigned char)fill_char, len);
10635 else {
10636 for (n = 0; n < len; ++n)
10637 PyUnicode_WRITE(kind, to, n, fill_char);
10638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 }
10640 else {
10641 /* number of characters copied this far */
10642 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10643 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10644 char *to = (char *) PyUnicode_DATA(u);
10645 Py_MEMCPY(to, PyUnicode_DATA(str),
10646 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 n = (done <= nchars-done) ? done : nchars-done;
10649 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 }
10653
10654 return (PyObject*) u;
10655}
10656
Alexander Belopolsky40018472011-02-26 01:02:56 +000010657PyObject *
10658PyUnicode_Replace(PyObject *obj,
10659 PyObject *subobj,
10660 PyObject *replobj,
10661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662{
10663 PyObject *self;
10664 PyObject *str1;
10665 PyObject *str2;
10666 PyObject *result;
10667
10668 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010669 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010672 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 Py_DECREF(self);
10674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675 }
10676 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010677 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010678 Py_DECREF(self);
10679 Py_DECREF(str1);
10680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 Py_DECREF(self);
10684 Py_DECREF(str1);
10685 Py_DECREF(str2);
10686 return result;
10687}
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010690 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010693old replaced by new. If the optional argument count is\n\
10694given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 PyObject *str1;
10700 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010701 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 PyObject *result;
10703
Martin v. Löwis18e16552006-02-15 17:27:45 +000010704 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 str1 = PyUnicode_FromObject(str1);
10709 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10710 return NULL;
10711 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010712 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 Py_DECREF(str1);
10714 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
10717 result = replace(self, str1, str2, maxcount);
10718
10719 Py_DECREF(str1);
10720 Py_DECREF(str2);
10721 return result;
10722}
10723
Alexander Belopolsky40018472011-02-26 01:02:56 +000010724static PyObject *
10725unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010727 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 Py_ssize_t isize;
10729 Py_ssize_t osize, squote, dquote, i, o;
10730 Py_UCS4 max, quote;
10731 int ikind, okind;
10732 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010735 return NULL;
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 isize = PyUnicode_GET_LENGTH(unicode);
10738 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 /* Compute length of output, quote characters, and
10741 maximum character */
10742 osize = 2; /* quotes */
10743 max = 127;
10744 squote = dquote = 0;
10745 ikind = PyUnicode_KIND(unicode);
10746 for (i = 0; i < isize; i++) {
10747 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10748 switch (ch) {
10749 case '\'': squote++; osize++; break;
10750 case '"': dquote++; osize++; break;
10751 case '\\': case '\t': case '\r': case '\n':
10752 osize += 2; break;
10753 default:
10754 /* Fast-path ASCII */
10755 if (ch < ' ' || ch == 0x7f)
10756 osize += 4; /* \xHH */
10757 else if (ch < 0x7f)
10758 osize++;
10759 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10760 osize++;
10761 max = ch > max ? ch : max;
10762 }
10763 else if (ch < 0x100)
10764 osize += 4; /* \xHH */
10765 else if (ch < 0x10000)
10766 osize += 6; /* \uHHHH */
10767 else
10768 osize += 10; /* \uHHHHHHHH */
10769 }
10770 }
10771
10772 quote = '\'';
10773 if (squote) {
10774 if (dquote)
10775 /* Both squote and dquote present. Use squote,
10776 and escape them */
10777 osize += squote;
10778 else
10779 quote = '"';
10780 }
10781
10782 repr = PyUnicode_New(osize, max);
10783 if (repr == NULL)
10784 return NULL;
10785 okind = PyUnicode_KIND(repr);
10786 odata = PyUnicode_DATA(repr);
10787
10788 PyUnicode_WRITE(okind, odata, 0, quote);
10789 PyUnicode_WRITE(okind, odata, osize-1, quote);
10790
10791 for (i = 0, o = 1; i < isize; i++) {
10792 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010793
10794 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if ((ch == quote) || (ch == '\\')) {
10796 PyUnicode_WRITE(okind, odata, o++, '\\');
10797 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010798 continue;
10799 }
10800
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010802 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 PyUnicode_WRITE(okind, odata, o++, '\\');
10804 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010805 }
10806 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 PyUnicode_WRITE(okind, odata, o++, '\\');
10808 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010809 }
10810 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 PyUnicode_WRITE(okind, odata, o++, '\\');
10812 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010813 }
10814
10815 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010816 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 PyUnicode_WRITE(okind, odata, o++, '\\');
10818 PyUnicode_WRITE(okind, odata, o++, 'x');
10819 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10820 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010821 }
10822
Georg Brandl559e5d72008-06-11 18:37:52 +000010823 /* Copy ASCII characters as-is */
10824 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 }
10827
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010829 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010830 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 (categories Z* and C* except ASCII space)
10832 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010834 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (ch <= 0xff) {
10836 PyUnicode_WRITE(okind, odata, o++, '\\');
10837 PyUnicode_WRITE(okind, odata, o++, 'x');
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10839 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010840 }
10841 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 else if (ch >= 0x10000) {
10843 PyUnicode_WRITE(okind, odata, o++, '\\');
10844 PyUnicode_WRITE(okind, odata, o++, 'U');
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10851 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10852 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010853 }
10854 /* Map 16-bit characters to '\uxxxx' */
10855 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 PyUnicode_WRITE(okind, odata, o++, '\\');
10857 PyUnicode_WRITE(okind, odata, o++, 'u');
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010862 }
10863 }
10864 /* Copy characters as-is */
10865 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010867 }
10868 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010871 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872}
10873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010874PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876\n\
10877Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010878such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879arguments start and end are interpreted as in slice notation.\n\
10880\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010881Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
10883static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Jesus Ceaac451502011-04-20 17:09:23 +020010886 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010887 Py_ssize_t start;
10888 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Jesus Ceaac451502011-04-20 17:09:23 +020010891 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10892 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (PyUnicode_READY(self) == -1)
10896 return NULL;
10897 if (PyUnicode_READY(substring) == -1)
10898 return NULL;
10899
10900 result = any_find_slice(
10901 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10902 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010903 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (result == -2)
10908 return NULL;
10909
Christian Heimes217cfd12007-12-02 14:31:20 +000010910 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911}
10912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010916Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920{
Jesus Ceaac451502011-04-20 17:09:23 +020010921 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010922 Py_ssize_t start;
10923 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Jesus Ceaac451502011-04-20 17:09:23 +020010926 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10927 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (PyUnicode_READY(self) == -1)
10931 return NULL;
10932 if (PyUnicode_READY(substring) == -1)
10933 return NULL;
10934
10935 result = any_find_slice(
10936 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10937 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010938 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
10940 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 if (result == -2)
10943 return NULL;
10944
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 if (result < 0) {
10946 PyErr_SetString(PyExc_ValueError, "substring not found");
10947 return NULL;
10948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949
Christian Heimes217cfd12007-12-02 14:31:20 +000010950 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951}
10952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010956Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010957done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
10960unicode_rjust(PyUnicodeObject *self, PyObject *args)
10961{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010962 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 Py_UCS4 fillchar = ' ';
10964
Victor Stinnere9a29352011-10-01 02:14:59 +020010965 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010967
Victor Stinnere9a29352011-10-01 02:14:59 +020010968 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 return NULL;
10970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 Py_INCREF(self);
10973 return (PyObject*) self;
10974 }
10975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977}
10978
Alexander Belopolsky40018472011-02-26 01:02:56 +000010979PyObject *
10980PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
10982 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010983
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984 s = PyUnicode_FromObject(s);
10985 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010986 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 if (sep != NULL) {
10988 sep = PyUnicode_FromObject(sep);
10989 if (sep == NULL) {
10990 Py_DECREF(s);
10991 return NULL;
10992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 }
10994
10995 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10996
10997 Py_DECREF(s);
10998 Py_XDECREF(sep);
10999 return result;
11000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004\n\
11005Return a list of the words in S, using sep as the\n\
11006delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011007splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011008whitespace string is a separator and empty strings are\n\
11009removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
11011static PyObject*
11012unicode_split(PyUnicodeObject *self, PyObject *args)
11013{
11014 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011015 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016
Martin v. Löwis18e16552006-02-15 17:27:45 +000011017 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 return NULL;
11019
11020 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026}
11027
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028PyObject *
11029PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11030{
11031 PyObject* str_obj;
11032 PyObject* sep_obj;
11033 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 int kind1, kind2, kind;
11035 void *buf1 = NULL, *buf2 = NULL;
11036 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011037
11038 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011039 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011043 Py_DECREF(str_obj);
11044 return NULL;
11045 }
11046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 kind1 = PyUnicode_KIND(str_in);
11048 kind2 = PyUnicode_KIND(sep_obj);
11049 kind = kind1 > kind2 ? kind1 : kind2;
11050 buf1 = PyUnicode_DATA(str_in);
11051 if (kind1 != kind)
11052 buf1 = _PyUnicode_AsKind(str_in, kind);
11053 if (!buf1)
11054 goto onError;
11055 buf2 = PyUnicode_DATA(sep_obj);
11056 if (kind2 != kind)
11057 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11058 if (!buf2)
11059 goto onError;
11060 len1 = PyUnicode_GET_LENGTH(str_obj);
11061 len2 = PyUnicode_GET_LENGTH(sep_obj);
11062
11063 switch(PyUnicode_KIND(str_in)) {
11064 case PyUnicode_1BYTE_KIND:
11065 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11066 break;
11067 case PyUnicode_2BYTE_KIND:
11068 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11069 break;
11070 case PyUnicode_4BYTE_KIND:
11071 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11072 break;
11073 default:
11074 assert(0);
11075 out = 0;
11076 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011077
11078 Py_DECREF(sep_obj);
11079 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (kind1 != kind)
11081 PyMem_Free(buf1);
11082 if (kind2 != kind)
11083 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011084
11085 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 onError:
11087 Py_DECREF(sep_obj);
11088 Py_DECREF(str_obj);
11089 if (kind1 != kind && buf1)
11090 PyMem_Free(buf1);
11091 if (kind2 != kind && buf2)
11092 PyMem_Free(buf2);
11093 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094}
11095
11096
11097PyObject *
11098PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11099{
11100 PyObject* str_obj;
11101 PyObject* sep_obj;
11102 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 int kind1, kind2, kind;
11104 void *buf1 = NULL, *buf2 = NULL;
11105 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011106
11107 str_obj = PyUnicode_FromObject(str_in);
11108 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011110 sep_obj = PyUnicode_FromObject(sep_in);
11111 if (!sep_obj) {
11112 Py_DECREF(str_obj);
11113 return NULL;
11114 }
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 kind1 = PyUnicode_KIND(str_in);
11117 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011118 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 buf1 = PyUnicode_DATA(str_in);
11120 if (kind1 != kind)
11121 buf1 = _PyUnicode_AsKind(str_in, kind);
11122 if (!buf1)
11123 goto onError;
11124 buf2 = PyUnicode_DATA(sep_obj);
11125 if (kind2 != kind)
11126 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11127 if (!buf2)
11128 goto onError;
11129 len1 = PyUnicode_GET_LENGTH(str_obj);
11130 len2 = PyUnicode_GET_LENGTH(sep_obj);
11131
11132 switch(PyUnicode_KIND(str_in)) {
11133 case PyUnicode_1BYTE_KIND:
11134 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11135 break;
11136 case PyUnicode_2BYTE_KIND:
11137 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11138 break;
11139 case PyUnicode_4BYTE_KIND:
11140 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11141 break;
11142 default:
11143 assert(0);
11144 out = 0;
11145 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146
11147 Py_DECREF(sep_obj);
11148 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (kind1 != kind)
11150 PyMem_Free(buf1);
11151 if (kind2 != kind)
11152 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
11154 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 onError:
11156 Py_DECREF(sep_obj);
11157 Py_DECREF(str_obj);
11158 if (kind1 != kind && buf1)
11159 PyMem_Free(buf1);
11160 if (kind2 != kind && buf2)
11161 PyMem_Free(buf2);
11162 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163}
11164
11165PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011168Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011170found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
11172static PyObject*
11173unicode_partition(PyUnicodeObject *self, PyObject *separator)
11174{
11175 return PyUnicode_Partition((PyObject *)self, separator);
11176}
11177
11178PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011179 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011181Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011183separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184
11185static PyObject*
11186unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11187{
11188 return PyUnicode_RPartition((PyObject *)self, separator);
11189}
11190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191PyObject *
11192PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011193{
11194 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011195
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011196 s = PyUnicode_FromObject(s);
11197 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011198 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 if (sep != NULL) {
11200 sep = PyUnicode_FromObject(sep);
11201 if (sep == NULL) {
11202 Py_DECREF(s);
11203 return NULL;
11204 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011205 }
11206
11207 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11208
11209 Py_DECREF(s);
11210 Py_XDECREF(sep);
11211 return result;
11212}
11213
11214PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216\n\
11217Return a list of the words in S, using sep as the\n\
11218delimiter string, starting at the end of the string and\n\
11219working to the front. If maxsplit is given, at most maxsplit\n\
11220splits are done. If sep is not specified, any whitespace string\n\
11221is a separator.");
11222
11223static PyObject*
11224unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11225{
11226 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011227 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011228
Martin v. Löwis18e16552006-02-15 17:27:45 +000011229 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011230 return NULL;
11231
11232 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011234 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011236 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242\n\
11243Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011244Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
11247static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011248unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011250 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011251 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011253 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11254 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 return NULL;
11256
Guido van Rossum86662912000-04-11 15:38:46 +000011257 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
11260static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011261PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Walter Dörwald346737f2007-05-31 10:44:43 +000011263 if (PyUnicode_CheckExact(self)) {
11264 Py_INCREF(self);
11265 return self;
11266 } else
11267 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011268 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269}
11270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273\n\
11274Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011275and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
11277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011278unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 return fixup(self, fixswapcase);
11281}
11282
Georg Brandlceee0772007-11-27 23:48:05 +000011283PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011285\n\
11286Return a translation table usable for str.translate().\n\
11287If there is only one argument, it must be a dictionary mapping Unicode\n\
11288ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011289Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011290If there are two arguments, they must be strings of equal length, and\n\
11291in the resulting dictionary, each character in x will be mapped to the\n\
11292character at the same position in y. If there is a third argument, it\n\
11293must be a string, whose characters will be mapped to None in the result.");
11294
11295static PyObject*
11296unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11297{
11298 PyObject *x, *y = NULL, *z = NULL;
11299 PyObject *new = NULL, *key, *value;
11300 Py_ssize_t i = 0;
11301 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011302
Georg Brandlceee0772007-11-27 23:48:05 +000011303 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11304 return NULL;
11305 new = PyDict_New();
11306 if (!new)
11307 return NULL;
11308 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 int x_kind, y_kind, z_kind;
11310 void *x_data, *y_data, *z_data;
11311
Georg Brandlceee0772007-11-27 23:48:05 +000011312 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011313 if (!PyUnicode_Check(x)) {
11314 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11315 "be a string if there is a second argument");
11316 goto err;
11317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011319 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11320 "arguments must have equal length");
11321 goto err;
11322 }
11323 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 x_kind = PyUnicode_KIND(x);
11325 y_kind = PyUnicode_KIND(y);
11326 x_data = PyUnicode_DATA(x);
11327 y_data = PyUnicode_DATA(y);
11328 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11329 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11330 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011331 if (!key || !value)
11332 goto err;
11333 res = PyDict_SetItem(new, key, value);
11334 Py_DECREF(key);
11335 Py_DECREF(value);
11336 if (res < 0)
11337 goto err;
11338 }
11339 /* create entries for deleting chars in z */
11340 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 z_kind = PyUnicode_KIND(z);
11342 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011343 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011345 if (!key)
11346 goto err;
11347 res = PyDict_SetItem(new, key, Py_None);
11348 Py_DECREF(key);
11349 if (res < 0)
11350 goto err;
11351 }
11352 }
11353 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 int kind;
11355 void *data;
11356
Georg Brandlceee0772007-11-27 23:48:05 +000011357 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011358 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011359 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11360 "to maketrans it must be a dict");
11361 goto err;
11362 }
11363 /* copy entries into the new dict, converting string keys to int keys */
11364 while (PyDict_Next(x, &i, &key, &value)) {
11365 if (PyUnicode_Check(key)) {
11366 /* convert string keys to integer keys */
11367 PyObject *newkey;
11368 if (PyUnicode_GET_SIZE(key) != 1) {
11369 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11370 "table must be of length 1");
11371 goto err;
11372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 kind = PyUnicode_KIND(key);
11374 data = PyUnicode_DATA(key);
11375 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011376 if (!newkey)
11377 goto err;
11378 res = PyDict_SetItem(new, newkey, value);
11379 Py_DECREF(newkey);
11380 if (res < 0)
11381 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011382 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011383 /* just keep integer keys */
11384 if (PyDict_SetItem(new, key, value) < 0)
11385 goto err;
11386 } else {
11387 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11388 "be strings or integers");
11389 goto err;
11390 }
11391 }
11392 }
11393 return new;
11394 err:
11395 Py_DECREF(new);
11396 return NULL;
11397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401\n\
11402Return a copy of the string S, where all characters have been mapped\n\
11403through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011404Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011405Unmapped characters are left untouched. Characters mapped to None\n\
11406are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011420unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 return fixup(self, fixupper);
11423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011428Pad a numeric string S with zeros on the left, to fill a field\n\
11429of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
11432unicode_zfill(PyUnicodeObject *self, PyObject *args)
11433{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011434 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011436 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 int kind;
11438 void *data;
11439 Py_UCS4 chr;
11440
11441 if (PyUnicode_READY(self) == -1)
11442 return NULL;
11443
Martin v. Löwis18e16552006-02-15 17:27:45 +000011444 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 return NULL;
11446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011448 if (PyUnicode_CheckExact(self)) {
11449 Py_INCREF(self);
11450 return (PyObject*) self;
11451 }
11452 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011453 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 }
11455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
11458 u = pad(self, fill, 0, '0');
11459
Walter Dörwald068325e2002-04-15 13:36:47 +000011460 if (u == NULL)
11461 return NULL;
11462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 kind = PyUnicode_KIND(u);
11464 data = PyUnicode_DATA(u);
11465 chr = PyUnicode_READ(kind, data, fill);
11466
11467 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 PyUnicode_WRITE(kind, data, 0, chr);
11470 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 }
11472
11473 return (PyObject*) u;
11474}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
11476#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011477static PyObject *
11478unicode__decimal2ascii(PyObject *self)
11479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011481}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482#endif
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011487Return True if S starts with the specified prefix, False otherwise.\n\
11488With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011489With optional end, stop comparing S at that position.\n\
11490prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
11492static PyObject *
11493unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011496 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011498 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011499 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Jesus Ceaac451502011-04-20 17:09:23 +020011502 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011504 if (PyTuple_Check(subobj)) {
11505 Py_ssize_t i;
11506 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11507 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011509 if (substring == NULL)
11510 return NULL;
11511 result = tailmatch(self, substring, start, end, -1);
11512 Py_DECREF(substring);
11513 if (result) {
11514 Py_RETURN_TRUE;
11515 }
11516 }
11517 /* nothing matched */
11518 Py_RETURN_FALSE;
11519 }
11520 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011521 if (substring == NULL) {
11522 if (PyErr_ExceptionMatches(PyExc_TypeError))
11523 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11524 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011526 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011527 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011529 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530}
11531
11532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011533PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011536Return True if S ends with the specified suffix, False otherwise.\n\
11537With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011538With optional end, stop comparing S at that position.\n\
11539suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
11541static PyObject *
11542unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011545 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011547 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011548 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550
Jesus Ceaac451502011-04-20 17:09:23 +020011551 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011553 if (PyTuple_Check(subobj)) {
11554 Py_ssize_t i;
11555 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11556 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011558 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011560 result = tailmatch(self, substring, start, end, +1);
11561 Py_DECREF(substring);
11562 if (result) {
11563 Py_RETURN_TRUE;
11564 }
11565 }
11566 Py_RETURN_FALSE;
11567 }
11568 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011569 if (substring == NULL) {
11570 if (PyErr_ExceptionMatches(PyExc_TypeError))
11571 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11572 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011574 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011575 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011577 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011581
11582PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011584\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011585Return a formatted version of S, using substitutions from args and kwargs.\n\
11586The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011587
Eric Smith27bbca62010-11-04 17:06:58 +000011588PyDoc_STRVAR(format_map__doc__,
11589 "S.format_map(mapping) -> str\n\
11590\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011591Return a formatted version of S, using substitutions from mapping.\n\
11592The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011593
Eric Smith4a7d76d2008-05-30 18:10:19 +000011594static PyObject *
11595unicode__format__(PyObject* self, PyObject* args)
11596{
11597 PyObject *format_spec;
11598
11599 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11600 return NULL;
11601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11603 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011604}
11605
Eric Smith8c663262007-08-25 02:26:07 +000011606PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011608\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011609Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011610
11611static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011612unicode__sizeof__(PyUnicodeObject *v)
11613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 Py_ssize_t size;
11615
11616 /* If it's a compact object, account for base structure +
11617 character data. */
11618 if (PyUnicode_IS_COMPACT_ASCII(v))
11619 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11620 else if (PyUnicode_IS_COMPACT(v))
11621 size = sizeof(PyCompactUnicodeObject) +
11622 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11623 else {
11624 /* If it is a two-block object, account for base object, and
11625 for character block if present. */
11626 size = sizeof(PyUnicodeObject);
11627 if (v->data.any)
11628 size += (PyUnicode_GET_LENGTH(v) + 1) *
11629 PyUnicode_CHARACTER_SIZE(v);
11630 }
11631 /* If the wstr pointer is present, account for it unless it is shared
11632 with the data pointer. Since PyUnicode_DATA will crash if the object
11633 is not ready, check whether it's either not ready (in which case the
11634 data is entirely in wstr) or if the data is not shared. */
11635 if (_PyUnicode_WSTR(v) &&
11636 (!PyUnicode_IS_READY(v) ||
11637 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11638 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011639 if (!PyUnicode_IS_COMPACT_ASCII(v)
11640 && _PyUnicode_UTF8(v)
11641 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11642 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643
11644 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011645}
11646
11647PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011649
11650static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011651unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011652{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011653 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 if (!copy)
11655 return NULL;
11656 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011657}
11658
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659static PyMethodDef unicode_methods[] = {
11660
11661 /* Order is according to common usage: often used methods should
11662 appear first, since lookup is done sequentially. */
11663
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011664 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011665 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11666 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011667 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011668 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11669 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11670 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11671 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11672 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11673 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11674 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011675 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11677 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11678 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011680 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11681 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11682 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011684 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011685 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011687 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11688 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11689 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11690 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11691 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11692 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11693 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11694 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11695 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11696 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11697 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11698 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11699 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11700 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011701 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011702 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011703 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011704 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011705 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011706 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011707 {"maketrans", (PyCFunction) unicode_maketrans,
11708 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011709 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011710#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011711 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712#endif
11713
11714#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011715 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011716 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717#endif
11718
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 {NULL, NULL}
11721};
11722
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011723static PyObject *
11724unicode_mod(PyObject *v, PyObject *w)
11725{
Brian Curtindfc80e32011-08-10 20:28:54 -050011726 if (!PyUnicode_Check(v))
11727 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011729}
11730
11731static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 0, /*nb_add*/
11733 0, /*nb_subtract*/
11734 0, /*nb_multiply*/
11735 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011736};
11737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011739 (lenfunc) unicode_length, /* sq_length */
11740 PyUnicode_Concat, /* sq_concat */
11741 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11742 (ssizeargfunc) unicode_getitem, /* sq_item */
11743 0, /* sq_slice */
11744 0, /* sq_ass_item */
11745 0, /* sq_ass_slice */
11746 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747};
11748
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011749static PyObject*
11750unicode_subscript(PyUnicodeObject* self, PyObject* item)
11751{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (PyUnicode_READY(self) == -1)
11753 return NULL;
11754
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011755 if (PyIndex_Check(item)) {
11756 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011757 if (i == -1 && PyErr_Occurred())
11758 return NULL;
11759 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011761 return unicode_getitem(self, i);
11762 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011763 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011765 Py_UNICODE* result_buf;
11766 PyObject* result;
11767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011770 return NULL;
11771 }
11772
11773 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 return PyUnicode_New(0, 0);
11775 } else if (start == 0 && step == 1 &&
11776 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011777 PyUnicode_CheckExact(self)) {
11778 Py_INCREF(self);
11779 return (PyObject *)self;
11780 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011781 return PyUnicode_Substring((PyObject*)self,
11782 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011783 } else {
11784 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011785 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11786 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 if (result_buf == NULL)
11789 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011790
11791 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11792 result_buf[i] = source_buf[cur];
11793 }
Tim Petersced69f82003-09-16 20:30:58 +000011794
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011795 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011796 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011797 return result;
11798 }
11799 } else {
11800 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11801 return NULL;
11802 }
11803}
11804
11805static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011806 (lenfunc)unicode_length, /* mp_length */
11807 (binaryfunc)unicode_subscript, /* mp_subscript */
11808 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011809};
11810
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812/* Helpers for PyUnicode_Format() */
11813
11814static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011815getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011817 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 (*p_argidx)++;
11820 if (arglen < 0)
11821 return args;
11822 else
11823 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 }
11825 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 return NULL;
11828}
11829
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011830/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011832static PyObject *
11833formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011835 char *p;
11836 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011838
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 x = PyFloat_AsDouble(v);
11840 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011841 return NULL;
11842
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011845
Eric Smith0923d1d2009-04-16 20:16:10 +000011846 p = PyOS_double_to_string(x, type, prec,
11847 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011848 if (p == NULL)
11849 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011851 PyMem_Free(p);
11852 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853}
11854
Tim Peters38fd5b62000-09-21 05:43:11 +000011855static PyObject*
11856formatlong(PyObject *val, int flags, int prec, int type)
11857{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011858 char *buf;
11859 int len;
11860 PyObject *str; /* temporary string object. */
11861 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011862
Benjamin Peterson14339b62009-01-31 16:36:08 +000011863 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11864 if (!str)
11865 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011867 Py_DECREF(str);
11868 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011869}
11870
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011873 size_t buflen,
11874 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011876 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011877 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (PyUnicode_GET_LENGTH(v) == 1) {
11879 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 buf[1] = '\0';
11881 return 1;
11882 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 goto onError;
11884 }
11885 else {
11886 /* Integer input truncated to a character */
11887 long x;
11888 x = PyLong_AsLong(v);
11889 if (x == -1 && PyErr_Occurred())
11890 goto onError;
11891
11892 if (x < 0 || x > 0x10ffff) {
11893 PyErr_SetString(PyExc_OverflowError,
11894 "%c arg not in range(0x110000)");
11895 return -1;
11896 }
11897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011899 buf[1] = '\0';
11900 return 1;
11901 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011902
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011904 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011906 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907}
11908
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that the macro acts as a single
   operand in every expression context (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011913
Alexander Belopolsky40018472011-02-26 01:02:56 +000011914PyObject *
11915PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 void *fmt;
11918 int fmtkind;
11919 PyObject *result;
11920 Py_UCS4 *res, *res0;
11921 Py_UCS4 max;
11922 int kind;
11923 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011927
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 PyErr_BadInternalCall();
11930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11933 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 fmt = PyUnicode_DATA(uformat);
11936 fmtkind = PyUnicode_KIND(uformat);
11937 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11938 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939
11940 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11942 if (res0 == NULL) {
11943 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
11947 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 arglen = PyTuple_Size(args);
11949 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 }
11951 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 arglen = -1;
11953 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011955 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011956 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 if (--rescnt < 0) {
11962 rescnt = fmtcnt + 100;
11963 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11965 if (res0 == NULL){
11966 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 }
11969 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011971 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 }
11974 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 /* Got a format specifier */
11976 int flags = 0;
11977 Py_ssize_t width = -1;
11978 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 Py_UCS4 c = '\0';
11980 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 int isnumok;
11982 PyObject *v = NULL;
11983 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 void *pbuf;
11985 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 Py_ssize_t len, len1;
11988 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 fmtpos++;
11991 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11992 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 Py_ssize_t keylen;
11994 PyObject *key;
11995 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011996
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 if (dict == NULL) {
11998 PyErr_SetString(PyExc_TypeError,
11999 "format requires a mapping");
12000 goto onError;
12001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 /* Skip over balanced parentheses */
12006 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 if (fmtcnt < 0 || pcount > 0) {
12015 PyErr_SetString(PyExc_ValueError,
12016 "incomplete format key");
12017 goto onError;
12018 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012019 key = PyUnicode_Substring((PyObject*)uformat,
12020 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 if (key == NULL)
12022 goto onError;
12023 if (args_owned) {
12024 Py_DECREF(args);
12025 args_owned = 0;
12026 }
12027 args = PyObject_GetItem(dict, key);
12028 Py_DECREF(key);
12029 if (args == NULL) {
12030 goto onError;
12031 }
12032 args_owned = 1;
12033 arglen = -1;
12034 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012035 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 case '-': flags |= F_LJUST; continue;
12039 case '+': flags |= F_SIGN; continue;
12040 case ' ': flags |= F_BLANK; continue;
12041 case '#': flags |= F_ALT; continue;
12042 case '0': flags |= F_ZERO; continue;
12043 }
12044 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012045 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 if (c == '*') {
12047 v = getnextarg(args, arglen, &argidx);
12048 if (v == NULL)
12049 goto onError;
12050 if (!PyLong_Check(v)) {
12051 PyErr_SetString(PyExc_TypeError,
12052 "* wants int");
12053 goto onError;
12054 }
12055 width = PyLong_AsLong(v);
12056 if (width == -1 && PyErr_Occurred())
12057 goto onError;
12058 if (width < 0) {
12059 flags |= F_LJUST;
12060 width = -width;
12061 }
12062 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 }
12065 else if (c >= '0' && c <= '9') {
12066 width = c - '0';
12067 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 if (c < '0' || c > '9')
12070 break;
12071 if ((width*10) / 10 != width) {
12072 PyErr_SetString(PyExc_ValueError,
12073 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 }
12076 width = width*10 + (c - '0');
12077 }
12078 }
12079 if (c == '.') {
12080 prec = 0;
12081 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 if (c == '*') {
12084 v = getnextarg(args, arglen, &argidx);
12085 if (v == NULL)
12086 goto onError;
12087 if (!PyLong_Check(v)) {
12088 PyErr_SetString(PyExc_TypeError,
12089 "* wants int");
12090 goto onError;
12091 }
12092 prec = PyLong_AsLong(v);
12093 if (prec == -1 && PyErr_Occurred())
12094 goto onError;
12095 if (prec < 0)
12096 prec = 0;
12097 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 }
12100 else if (c >= '0' && c <= '9') {
12101 prec = c - '0';
12102 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 if (c < '0' || c > '9')
12105 break;
12106 if ((prec*10) / 10 != prec) {
12107 PyErr_SetString(PyExc_ValueError,
12108 "prec too big");
12109 goto onError;
12110 }
12111 prec = prec*10 + (c - '0');
12112 }
12113 }
12114 } /* prec */
12115 if (fmtcnt >= 0) {
12116 if (c == 'h' || c == 'l' || c == 'L') {
12117 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 }
12120 }
12121 if (fmtcnt < 0) {
12122 PyErr_SetString(PyExc_ValueError,
12123 "incomplete format");
12124 goto onError;
12125 }
12126 if (c != '%') {
12127 v = getnextarg(args, arglen, &argidx);
12128 if (v == NULL)
12129 goto onError;
12130 }
12131 sign = 0;
12132 fill = ' ';
12133 switch (c) {
12134
12135 case '%':
12136 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 len = 1;
12141 break;
12142
12143 case 's':
12144 case 'r':
12145 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012146 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 temp = v;
12148 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 }
12150 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 if (c == 's')
12152 temp = PyObject_Str(v);
12153 else if (c == 'r')
12154 temp = PyObject_Repr(v);
12155 else
12156 temp = PyObject_ASCII(v);
12157 if (temp == NULL)
12158 goto onError;
12159 if (PyUnicode_Check(temp))
12160 /* nothing to do */;
12161 else {
12162 Py_DECREF(temp);
12163 PyErr_SetString(PyExc_TypeError,
12164 "%s argument has non-string str()");
12165 goto onError;
12166 }
12167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (PyUnicode_READY(temp) == -1) {
12169 Py_CLEAR(temp);
12170 goto onError;
12171 }
12172 pbuf = PyUnicode_DATA(temp);
12173 kind = PyUnicode_KIND(temp);
12174 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 if (prec >= 0 && len > prec)
12176 len = prec;
12177 break;
12178
12179 case 'i':
12180 case 'd':
12181 case 'u':
12182 case 'o':
12183 case 'x':
12184 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 isnumok = 0;
12186 if (PyNumber_Check(v)) {
12187 PyObject *iobj=NULL;
12188
12189 if (PyLong_Check(v)) {
12190 iobj = v;
12191 Py_INCREF(iobj);
12192 }
12193 else {
12194 iobj = PyNumber_Long(v);
12195 }
12196 if (iobj!=NULL) {
12197 if (PyLong_Check(iobj)) {
12198 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012199 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 Py_DECREF(iobj);
12201 if (!temp)
12202 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 if (PyUnicode_READY(temp) == -1) {
12204 Py_CLEAR(temp);
12205 goto onError;
12206 }
12207 pbuf = PyUnicode_DATA(temp);
12208 kind = PyUnicode_KIND(temp);
12209 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 sign = 1;
12211 }
12212 else {
12213 Py_DECREF(iobj);
12214 }
12215 }
12216 }
12217 if (!isnumok) {
12218 PyErr_Format(PyExc_TypeError,
12219 "%%%c format: a number is required, "
12220 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12221 goto onError;
12222 }
12223 if (flags & F_ZERO)
12224 fill = '0';
12225 break;
12226
12227 case 'e':
12228 case 'E':
12229 case 'f':
12230 case 'F':
12231 case 'g':
12232 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012233 temp = formatfloat(v, flags, prec, c);
12234 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (PyUnicode_READY(temp) == -1) {
12237 Py_CLEAR(temp);
12238 goto onError;
12239 }
12240 pbuf = PyUnicode_DATA(temp);
12241 kind = PyUnicode_KIND(temp);
12242 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 sign = 1;
12244 if (flags & F_ZERO)
12245 fill = '0';
12246 break;
12247
12248 case 'c':
12249 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012251 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 if (len < 0)
12253 goto onError;
12254 break;
12255
12256 default:
12257 PyErr_Format(PyExc_ValueError,
12258 "unsupported format character '%c' (0x%x) "
12259 "at index %zd",
12260 (31<=c && c<=126) ? (char)c : '?',
12261 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 goto onError;
12264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 /* pbuf is initialized here. */
12266 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12269 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12270 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 len--;
12272 }
12273 else if (flags & F_SIGN)
12274 sign = '+';
12275 else if (flags & F_BLANK)
12276 sign = ' ';
12277 else
12278 sign = 0;
12279 }
12280 if (width < len)
12281 width = len;
12282 if (rescnt - (sign != 0) < width) {
12283 reslen -= rescnt;
12284 rescnt = width + fmtcnt + 100;
12285 reslen += rescnt;
12286 if (reslen < 0) {
12287 Py_XDECREF(temp);
12288 PyErr_NoMemory();
12289 goto onError;
12290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12292 if (res0 == 0) {
12293 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 Py_XDECREF(temp);
12295 goto onError;
12296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 }
12299 if (sign) {
12300 if (fill != ' ')
12301 *res++ = sign;
12302 rescnt--;
12303 if (width > len)
12304 width--;
12305 }
12306 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12308 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12311 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 }
12313 rescnt -= 2;
12314 width -= 2;
12315 if (width < 0)
12316 width = 0;
12317 len -= 2;
12318 }
12319 if (width > len && !(flags & F_LJUST)) {
12320 do {
12321 --rescnt;
12322 *res++ = fill;
12323 } while (--width > len);
12324 }
12325 if (fill == ' ') {
12326 if (sign)
12327 *res++ = sign;
12328 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12330 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12331 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12332 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 }
12334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 /* Copy all characters, preserving len */
12336 len1 = len;
12337 while (len1--) {
12338 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12339 rescnt--;
12340 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 while (--width >= len) {
12342 --rescnt;
12343 *res++ = ' ';
12344 }
12345 if (dict && (argidx < arglen) && c != '%') {
12346 PyErr_SetString(PyExc_TypeError,
12347 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012348 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 goto onError;
12350 }
12351 Py_XDECREF(temp);
12352 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353 } /* until end */
12354 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 PyErr_SetString(PyExc_TypeError,
12356 "not all arguments converted during string formatting");
12357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 }
12359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360
12361 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12362 if (*res > max)
12363 max = *res;
12364 result = PyUnicode_New(reslen - rescnt, max);
12365 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 kind = PyUnicode_KIND(result);
12368 for (res = res0; res < res0+reslen-rescnt; res++)
12369 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12370 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 }
12374 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375 return (PyObject *)result;
12376
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 Py_DECREF(uformat);
12380 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 }
12383 return NULL;
12384}
12385
Jeremy Hylton938ace62002-07-17 16:30:39 +000012386static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012387unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12388
Tim Peters6d6c1a32001-08-02 04:15:00 +000012389static PyObject *
12390unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12391{
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 static char *kwlist[] = {"object", "encoding", "errors", 0};
12394 char *encoding = NULL;
12395 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012396
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 if (type != &PyUnicode_Type)
12398 return unicode_subtype_new(type, args, kwds);
12399 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 return NULL;
12402 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 if (encoding == NULL && errors == NULL)
12405 return PyObject_Str(x);
12406 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012408}
12409
Guido van Rossume023fe02001-08-30 03:12:59 +000012410static PyObject *
12411unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12412{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 PyUnicodeObject *tmp, *pnew;
12414 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012416
Benjamin Peterson14339b62009-01-31 16:36:08 +000012417 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12418 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12419 if (tmp == NULL)
12420 return NULL;
12421 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12423 // it seems kind of strange that tp_alloc gets passed the size
12424 // of the unicode string because there will follow another
12425 // malloc.
12426 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12427 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 if (pnew == NULL) {
12429 Py_DECREF(tmp);
12430 return NULL;
12431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12433 if (_PyUnicode_WSTR(pnew) == NULL) {
12434 err = PyErr_NoMemory();
12435 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12438 _PyUnicode_WSTR_LENGTH(pnew) = n;
12439 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12440 _PyUnicode_STATE(pnew).interned = 0;
12441 _PyUnicode_STATE(pnew).kind = 0;
12442 _PyUnicode_STATE(pnew).compact = 0;
12443 _PyUnicode_STATE(pnew).ready = 0;
12444 _PyUnicode_STATE(pnew).ascii = 0;
12445 pnew->data.any = NULL;
12446 _PyUnicode_LENGTH(pnew) = 0;
12447 pnew->_base.utf8 = NULL;
12448 pnew->_base.utf8_length = 0;
12449
12450 if (PyUnicode_READY(pnew) == -1) {
12451 PyObject_FREE(_PyUnicode_WSTR(pnew));
12452 goto onError;
12453 }
12454
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 Py_DECREF(tmp);
12456 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457
12458 onError:
12459 _Py_ForgetReference((PyObject *)pnew);
12460 PyObject_Del(pnew);
12461 Py_DECREF(tmp);
12462 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012463}
12464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012465PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012467\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012468Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012469encoding defaults to the current default string encoding.\n\
12470errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012471
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012472static PyObject *unicode_iter(PyObject *seq);
12473
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012475 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012476 "str", /* tp_name */
12477 sizeof(PyUnicodeObject), /* tp_size */
12478 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012480 (destructor)unicode_dealloc, /* tp_dealloc */
12481 0, /* tp_print */
12482 0, /* tp_getattr */
12483 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012484 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012485 unicode_repr, /* tp_repr */
12486 &unicode_as_number, /* tp_as_number */
12487 &unicode_as_sequence, /* tp_as_sequence */
12488 &unicode_as_mapping, /* tp_as_mapping */
12489 (hashfunc) unicode_hash, /* tp_hash*/
12490 0, /* tp_call*/
12491 (reprfunc) unicode_str, /* tp_str */
12492 PyObject_GenericGetAttr, /* tp_getattro */
12493 0, /* tp_setattro */
12494 0, /* tp_as_buffer */
12495 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 unicode_doc, /* tp_doc */
12498 0, /* tp_traverse */
12499 0, /* tp_clear */
12500 PyUnicode_RichCompare, /* tp_richcompare */
12501 0, /* tp_weaklistoffset */
12502 unicode_iter, /* tp_iter */
12503 0, /* tp_iternext */
12504 unicode_methods, /* tp_methods */
12505 0, /* tp_members */
12506 0, /* tp_getset */
12507 &PyBaseObject_Type, /* tp_base */
12508 0, /* tp_dict */
12509 0, /* tp_descr_get */
12510 0, /* tp_descr_set */
12511 0, /* tp_dictoffset */
12512 0, /* tp_init */
12513 0, /* tp_alloc */
12514 unicode_new, /* tp_new */
12515 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516};
12517
12518/* Initialize the Unicode implementation */
12519
Thomas Wouters78890102000-07-22 19:25:51 +000012520void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012522 int i;
12523
Thomas Wouters477c8d52006-05-27 19:21:47 +000012524 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012526 0x000A, /* LINE FEED */
12527 0x000D, /* CARRIAGE RETURN */
12528 0x001C, /* FILE SEPARATOR */
12529 0x001D, /* GROUP SEPARATOR */
12530 0x001E, /* RECORD SEPARATOR */
12531 0x0085, /* NEXT LINE */
12532 0x2028, /* LINE SEPARATOR */
12533 0x2029, /* PARAGRAPH SEPARATOR */
12534 };
12535
Fred Drakee4315f52000-05-09 19:53:39 +000012536 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012538 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012540
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012541 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012543 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012545
12546 /* initialize the linebreak bloom filter */
12547 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012549 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012550
12551 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552}
12553
12554/* Finalize the Unicode implementation */
12555
/* Releases nothing: this implementation has no work to do here and always
   reports zero. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12561
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562void
Thomas Wouters78890102000-07-22 19:25:51 +000012563_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012565 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012567 Py_XDECREF(unicode_empty);
12568 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012569
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012570 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 if (unicode_latin1[i]) {
12572 Py_DECREF(unicode_latin1[i]);
12573 unicode_latin1[i] = NULL;
12574 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012575 }
Christian Heimesa156e092008-02-16 07:38:31 +000012576 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012578
Walter Dörwald16807132007-05-25 13:52:07 +000012579void
12580PyUnicode_InternInPlace(PyObject **p)
12581{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012582 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12583 PyObject *t;
12584 if (s == NULL || !PyUnicode_Check(s))
12585 Py_FatalError(
12586 "PyUnicode_InternInPlace: unicode strings only please!");
12587 /* If it's a subclass, we don't really know what putting
12588 it in the interned dict might do. */
12589 if (!PyUnicode_CheckExact(s))
12590 return;
12591 if (PyUnicode_CHECK_INTERNED(s))
12592 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 if (PyUnicode_READY(s) == -1) {
12594 assert(0 && "ready fail in intern...");
12595 return;
12596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 if (interned == NULL) {
12598 interned = PyDict_New();
12599 if (interned == NULL) {
12600 PyErr_Clear(); /* Don't leave an exception */
12601 return;
12602 }
12603 }
12604 /* It might be that the GetItem call fails even
12605 though the key is present in the dictionary,
12606 namely when this happens during a stack overflow. */
12607 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012610
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 if (t) {
12612 Py_INCREF(t);
12613 Py_DECREF(*p);
12614 *p = t;
12615 return;
12616 }
Walter Dörwald16807132007-05-25 13:52:07 +000012617
Benjamin Peterson14339b62009-01-31 16:36:08 +000012618 PyThreadState_GET()->recursion_critical = 1;
12619 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12620 PyErr_Clear();
12621 PyThreadState_GET()->recursion_critical = 0;
12622 return;
12623 }
12624 PyThreadState_GET()->recursion_critical = 0;
12625 /* The two references in interned are not counted by refcnt.
12626 The deallocator will take care of this */
12627 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012629}
12630
12631void
12632PyUnicode_InternImmortal(PyObject **p)
12633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12635
Benjamin Peterson14339b62009-01-31 16:36:08 +000012636 PyUnicode_InternInPlace(p);
12637 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012639 Py_INCREF(*p);
12640 }
Walter Dörwald16807132007-05-25 13:52:07 +000012641}
12642
12643PyObject *
12644PyUnicode_InternFromString(const char *cp)
12645{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 PyObject *s = PyUnicode_FromString(cp);
12647 if (s == NULL)
12648 return NULL;
12649 PyUnicode_InternInPlace(&s);
12650 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012651}
12652
Alexander Belopolsky40018472011-02-26 01:02:56 +000012653void
12654_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012655{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012656 PyObject *keys;
12657 PyUnicodeObject *s;
12658 Py_ssize_t i, n;
12659 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012660
Benjamin Peterson14339b62009-01-31 16:36:08 +000012661 if (interned == NULL || !PyDict_Check(interned))
12662 return;
12663 keys = PyDict_Keys(interned);
12664 if (keys == NULL || !PyList_Check(keys)) {
12665 PyErr_Clear();
12666 return;
12667 }
Walter Dörwald16807132007-05-25 13:52:07 +000012668
Benjamin Peterson14339b62009-01-31 16:36:08 +000012669 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12670 detector, interned unicode strings are not forcibly deallocated;
12671 rather, we give them their stolen references back, and then clear
12672 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012673
Benjamin Peterson14339b62009-01-31 16:36:08 +000012674 n = PyList_GET_SIZE(keys);
12675 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012677 for (i = 0; i < n; i++) {
12678 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 if (PyUnicode_READY(s) == -1)
12680 fprintf(stderr, "could not ready string\n");
12681 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682 case SSTATE_NOT_INTERNED:
12683 /* XXX Shouldn't happen */
12684 break;
12685 case SSTATE_INTERNED_IMMORTAL:
12686 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 break;
12689 case SSTATE_INTERNED_MORTAL:
12690 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 break;
12693 default:
12694 Py_FatalError("Inconsistent interned string state.");
12695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 }
12698 fprintf(stderr, "total size of all interned strings: "
12699 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12700 "mortal/immortal\n", mortal_size, immortal_size);
12701 Py_DECREF(keys);
12702 PyDict_Clear(interned);
12703 Py_DECREF(interned);
12704 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012705}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012706
12707
12708/********************* Unicode Iterator **************************/
12709
12710typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 PyObject_HEAD
12712 Py_ssize_t it_index;
12713 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012714} unicodeiterobject;
12715
12716static void
12717unicodeiter_dealloc(unicodeiterobject *it)
12718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012719 _PyObject_GC_UNTRACK(it);
12720 Py_XDECREF(it->it_seq);
12721 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012722}
12723
12724static int
12725unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12726{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012727 Py_VISIT(it->it_seq);
12728 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012729}
12730
12731static PyObject *
12732unicodeiter_next(unicodeiterobject *it)
12733{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 PyUnicodeObject *seq;
12735 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012736
Benjamin Peterson14339b62009-01-31 16:36:08 +000012737 assert(it != NULL);
12738 seq = it->it_seq;
12739 if (seq == NULL)
12740 return NULL;
12741 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12744 int kind = PyUnicode_KIND(seq);
12745 void *data = PyUnicode_DATA(seq);
12746 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12747 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 if (item != NULL)
12749 ++it->it_index;
12750 return item;
12751 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012752
Benjamin Peterson14339b62009-01-31 16:36:08 +000012753 Py_DECREF(seq);
12754 it->it_seq = NULL;
12755 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012756}
12757
12758static PyObject *
12759unicodeiter_len(unicodeiterobject *it)
12760{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 Py_ssize_t len = 0;
12762 if (it->it_seq)
12763 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12764 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012765}
12766
12767PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12768
12769static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012773};
12774
12775PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12777 "str_iterator", /* tp_name */
12778 sizeof(unicodeiterobject), /* tp_basicsize */
12779 0, /* tp_itemsize */
12780 /* methods */
12781 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12782 0, /* tp_print */
12783 0, /* tp_getattr */
12784 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012785 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012786 0, /* tp_repr */
12787 0, /* tp_as_number */
12788 0, /* tp_as_sequence */
12789 0, /* tp_as_mapping */
12790 0, /* tp_hash */
12791 0, /* tp_call */
12792 0, /* tp_str */
12793 PyObject_GenericGetAttr, /* tp_getattro */
12794 0, /* tp_setattro */
12795 0, /* tp_as_buffer */
12796 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12797 0, /* tp_doc */
12798 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12799 0, /* tp_clear */
12800 0, /* tp_richcompare */
12801 0, /* tp_weaklistoffset */
12802 PyObject_SelfIter, /* tp_iter */
12803 (iternextfunc)unicodeiter_next, /* tp_iternext */
12804 unicodeiter_methods, /* tp_methods */
12805 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012806};
12807
12808static PyObject *
12809unicode_iter(PyObject *seq)
12810{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012811 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012812
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 if (!PyUnicode_Check(seq)) {
12814 PyErr_BadInternalCall();
12815 return NULL;
12816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 if (PyUnicode_READY(seq) == -1)
12818 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012819 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12820 if (it == NULL)
12821 return NULL;
12822 it->it_index = 0;
12823 Py_INCREF(seq);
12824 it->it_seq = (PyUnicodeObject *)seq;
12825 _PyObject_GC_TRACK(it);
12826 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012827}
12828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829#define UNIOP(x) Py_UNICODE_##x
12830#define UNIOP_t Py_UNICODE
12831#include "uniops.h"
12832#undef UNIOP
12833#undef UNIOP_t
12834#define UNIOP(x) Py_UCS4_##x
12835#define UNIOP_t Py_UCS4
12836#include "uniops.h"
12837#undef UNIOP
12838#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012839
Victor Stinner71133ff2010-09-01 23:43:53 +000012840Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012841PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012842{
12843 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12844 Py_UNICODE *copy;
12845 Py_ssize_t size;
12846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 if (!PyUnicode_Check(unicode)) {
12848 PyErr_BadArgument();
12849 return NULL;
12850 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012851 /* Ensure we won't overflow the size. */
12852 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12853 PyErr_NoMemory();
12854 return NULL;
12855 }
12856 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12857 size *= sizeof(Py_UNICODE);
12858 copy = PyMem_Malloc(size);
12859 if (copy == NULL) {
12860 PyErr_NoMemory();
12861 return NULL;
12862 }
12863 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12864 return copy;
12865}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012866
Georg Brandl66c221e2010-10-14 07:04:07 +000012867/* A _string module, to export formatter_parser and formatter_field_name_split
12868 to the string.Formatter class implemented in Python. */
12869
12870static PyMethodDef _string_methods[] = {
12871 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12872 METH_O, PyDoc_STR("split the argument as a field name")},
12873 {"formatter_parser", (PyCFunction) formatter_parser,
12874 METH_O, PyDoc_STR("parse the argument as a format string")},
12875 {NULL, NULL}
12876};
12877
12878static struct PyModuleDef _string_module = {
12879 PyModuleDef_HEAD_INIT,
12880 "_string",
12881 PyDoc_STR("string helper module"),
12882 0,
12883 _string_methods,
12884 NULL,
12885 NULL,
12886 NULL,
12887 NULL
12888};
12889
12890PyMODINIT_FUNC
12891PyInit__string(void)
12892{
12893 return PyModule_Create(&_string_module);
12894}
12895
12896
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012897#ifdef __cplusplus
12898}
12899#endif