blob: 0d06fcb1ecfbcf780192ff081872389edef294d8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of entries kept in the Unicode object
   free list. */
#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead
   for small Unicode objects.

   At worst this results in

       PyUnicode_MAXFREELIST * (sizeof(PyUnicodeObject)
                                + KEEPALIVE_SIZE_LIMIT
                                + malloc()-overhead)

   bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off. */
#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
/* Byte-order selection: little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */
#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic element-wise conversion between character buffers of
   different types.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters (end
   is exclusive).  to points to the buffer that receives the result; it
   is cast to "to_type *" and one converted character is stored per
   source character. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        to_type *to_ = (to_type *)(to);                              \
        while (iter_ < (end)) {                                      \
            *to_ = (to_type)*iter_;                                  \
            ++iter_;                                                 \
            ++to_;                                                   \
        }                                                            \
    } while (0)
106
/* Raw field accessors.  Each one casts op to the struct type named in
   the macro and yields the field as an lvalue; no checks are made. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors: these assert that op is a unicode object before
   reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer and length of a ready unicode object.  For a compact
   ASCII string the buffer is the memory directly following the
   PyASCIIObject header and the length is the string length; otherwise
   the utf8 / utf8_length fields are used. */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
144 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000145 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
150static PyUnicodeObject *unicode_empty;
151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
154static PyUnicodeObject *unicode_latin1[256];
155
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It has one entry per code point in 0x00-0x7F; an entry
   of 1 marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Lookup table for fast detection of line break characters.  It has
   one entry per code point in 0x00-0x7F; an entry of 1 marks a line
   break character. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits,
   using the low-order bits of each unicode character
   (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the largest of 128, 64 and 32 that does not exceed
   LONG_BIT.  It must stay a power of two because (BLOOM_WIDTH - 1) is
   used as a bit mask on the character value. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue).
   BLOOM is non-zero if the bit for ch is set in mask.
   The mask argument is parenthesized so that compound expressions such
   as BLOOM(a | b, ch) are grouped as intended. */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; any other character must pass the bloom_linebreak
   pre-filter before Py_UNICODE_ISLINEBREAK() is consulted.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* Membership test for chr in str.  The bloom mask is checked first, so
   PyUnicode_FindChar() only runs when the bit for chr is set in mask.
   NOTE(review): presumably mask is expected to be the bloom mask built
   from str's characters -- confirm against the callers. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
379/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000380 Ux0000 terminated; some code (e.g. new_identifier)
381 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382
383 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385
386*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
401 return unicode_empty;
402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
444 unicode->data.any = NULL;
445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
#ifdef Py_DEBUG
/* Debug builds only: incremented by PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact / compact-ASCII flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode)
{
    void *after_ascii = ((void*)((PyASCIIObject*)(unicode) + 1));
    void *after_compact = ((void*)((PyCompactUnicodeObject*)(unicode) + 1));

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
479
480PyObject *
481PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
482{
483 PyObject *obj;
484 PyCompactUnicodeObject *unicode;
485 void *data;
486 int kind_state;
487 int is_sharing = 0, is_ascii = 0;
488 Py_ssize_t char_size;
489 Py_ssize_t struct_size;
490
491 /* Optimization for empty strings */
492 if (size == 0 && unicode_empty != NULL) {
493 Py_INCREF(unicode_empty);
494 return (PyObject *)unicode_empty;
495 }
496
497#ifdef Py_DEBUG
498 ++unicode_new_new_calls;
499#endif
500
501 struct_size = sizeof(PyCompactUnicodeObject);
502 if (maxchar < 128) {
503 kind_state = PyUnicode_1BYTE_KIND;
504 char_size = 1;
505 is_ascii = 1;
506 struct_size = sizeof(PyASCIIObject);
507 }
508 else if (maxchar < 256) {
509 kind_state = PyUnicode_1BYTE_KIND;
510 char_size = 1;
511 }
512 else if (maxchar < 65536) {
513 kind_state = PyUnicode_2BYTE_KIND;
514 char_size = 2;
515 if (sizeof(wchar_t) == 2)
516 is_sharing = 1;
517 }
518 else {
519 kind_state = PyUnicode_4BYTE_KIND;
520 char_size = 4;
521 if (sizeof(wchar_t) == 4)
522 is_sharing = 1;
523 }
524
525 /* Ensure we won't overflow the size. */
526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
528 "Negative size passed to PyUnicode_New");
529 return NULL;
530 }
531 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
532 return PyErr_NoMemory();
533
534 /* Duplicated allocation code from _PyObject_New() instead of a call to
535 * PyObject_New() so we are able to allocate space for the object and
536 * it's data buffer.
537 */
538 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
539 if (obj == NULL)
540 return PyErr_NoMemory();
541 obj = PyObject_INIT(obj, &PyUnicode_Type);
542 if (obj == NULL)
543 return NULL;
544
545 unicode = (PyCompactUnicodeObject *)obj;
546 if (is_ascii)
547 data = ((PyASCIIObject*)obj) + 1;
548 else
549 data = unicode + 1;
550 _PyUnicode_LENGTH(unicode) = size;
551 _PyUnicode_HASH(unicode) = -1;
552 _PyUnicode_STATE(unicode).interned = 0;
553 _PyUnicode_STATE(unicode).kind = kind_state;
554 _PyUnicode_STATE(unicode).compact = 1;
555 _PyUnicode_STATE(unicode).ready = 1;
556 _PyUnicode_STATE(unicode).ascii = is_ascii;
557 if (is_ascii) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 }
561 else if (kind_state == PyUnicode_1BYTE_KIND) {
562 ((char*)data)[size] = 0;
563 _PyUnicode_WSTR(unicode) = NULL;
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 unicode->utf8_length = 0;
566 unicode->utf8 = NULL;
567 }
568 else {
569 unicode->utf8 = NULL;
570 if (kind_state == PyUnicode_2BYTE_KIND)
571 ((Py_UCS2*)data)[size] = 0;
572 else /* kind_state == PyUnicode_4BYTE_KIND */
573 ((Py_UCS4*)data)[size] = 0;
574 if (is_sharing) {
575 _PyUnicode_WSTR_LENGTH(unicode) = size;
576 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
577 }
578 else {
579 _PyUnicode_WSTR_LENGTH(unicode) = 0;
580 _PyUnicode_WSTR(unicode) = NULL;
581 }
582 }
583 return obj;
584}
585
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate directly followed by a low surrogate is decoded
   into one code point; every other unit is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* The destination must still have room for this code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* Exactly the announced number of code points was written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
624
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200625Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
627 PyObject *from, Py_ssize_t from_start,
628 Py_ssize_t how_many)
629{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200630 unsigned int from_kind, to_kind;
631 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200632
Victor Stinnerb1536152011-09-30 02:26:10 +0200633 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
634 PyErr_BadInternalCall();
635 return -1;
636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637
638 if (PyUnicode_READY(from))
639 return -1;
640 if (PyUnicode_READY(to))
641 return -1;
642
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200643 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200644 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
645 PyErr_Format(PyExc_ValueError,
646 "Cannot write %zi characters at %zi "
647 "in a string of %zi characters",
648 how_many, to_start, PyUnicode_GET_LENGTH(to));
649 return -1;
650 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200651 if (how_many == 0)
652 return 0;
653
654 if (Py_REFCNT(to) != 1) {
655 PyErr_SetString(PyExc_ValueError,
656 "Cannot modify a string having more than 1 reference");
657 return -1;
658 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200659 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200662 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665
666 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200668 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 + PyUnicode_KIND_SIZE(from_kind, from_start),
672 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200673 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200674 else if (from_kind == PyUnicode_1BYTE_KIND
675 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 {
677 _PyUnicode_CONVERT_BYTES(
678 Py_UCS1, Py_UCS2,
679 PyUnicode_1BYTE_DATA(from) + from_start,
680 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
681 PyUnicode_2BYTE_DATA(to) + to_start
682 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200683 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200684 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 && to_kind == PyUnicode_4BYTE_KIND)
686 {
687 _PyUnicode_CONVERT_BYTES(
688 Py_UCS1, Py_UCS4,
689 PyUnicode_1BYTE_DATA(from) + from_start,
690 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
691 PyUnicode_4BYTE_DATA(to) + to_start
692 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200693 }
694 else if (from_kind == PyUnicode_2BYTE_KIND
695 && to_kind == PyUnicode_4BYTE_KIND)
696 {
697 _PyUnicode_CONVERT_BYTES(
698 Py_UCS2, Py_UCS4,
699 PyUnicode_2BYTE_DATA(from) + from_start,
700 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
701 PyUnicode_4BYTE_DATA(to) + to_start
702 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200703 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200704 else {
705 int invalid_kinds;
706 if (from_kind > to_kind) {
707 /* slow path to check for character overflow */
708 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
709 Py_UCS4 ch, maxchar;
710 Py_ssize_t i;
711
712 maxchar = 0;
713 invalid_kinds = 0;
714 for (i=0; i < how_many; i++) {
715 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
716 if (ch > maxchar) {
717 maxchar = ch;
718 if (maxchar > to_maxchar) {
719 invalid_kinds = 1;
720 break;
721 }
722 }
723 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
724 }
725 }
726 else
727 invalid_kinds = 1;
728 if (invalid_kinds) {
729 PyErr_Format(PyExc_ValueError,
730 "Cannot copy UCS%u characters "
731 "into a string of UCS%u characters",
732 1 << (from_kind - 1),
733 1 << (to_kind -1));
734 return -1;
735 }
736 }
737 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738}
739
Victor Stinner17222162011-09-28 22:15:37 +0200740/* Find the maximum code point and count the number of surrogate pairs so a
741 correct string length can be computed before converting a string to UCS4.
742 This function counts single surrogates as a character and not as a pair.
743
744 Return 0 on success, or -1 on error. */
745static int
746find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
747 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748{
749 const wchar_t *iter;
750
751 if (num_surrogates == NULL || maxchar == NULL) {
752 PyErr_SetString(PyExc_SystemError,
753 "unexpected NULL arguments to "
754 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
755 return -1;
756 }
757
758 *num_surrogates = 0;
759 *maxchar = 0;
760
761 for (iter = begin; iter < end; ) {
762 if (*iter > *maxchar)
763 *maxchar = *iter;
764#if SIZEOF_WCHAR_T == 2
765 if (*iter >= 0xD800 && *iter <= 0xDBFF
766 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
767 {
768 Py_UCS4 surrogate_val;
769 surrogate_val = (((iter[0] & 0x3FF)<<10)
770 | (iter[1] & 0x3FF)) + 0x10000;
771 ++(*num_surrogates);
772 if (surrogate_val > *maxchar)
773 *maxchar = surrogate_val;
774 iter += 2;
775 }
776 else
777 iter++;
778#else
779 iter++;
780#endif
781 }
782 return 0;
783}
784
785#ifdef Py_DEBUG
786int unicode_ready_calls = 0;
787#endif
788
789int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200790_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200791{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200792 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793 wchar_t *end;
794 Py_UCS4 maxchar = 0;
795 Py_ssize_t num_surrogates;
796#if SIZEOF_WCHAR_T == 2
797 Py_ssize_t length_wo_surrogates;
798#endif
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 strings were created using _PyObject_New() and where no canonical
802 representation (the str field) has been set yet aka strings
803 which are not yet ready. */
804 assert(PyUnicode_Check(obj));
805 assert(!PyUnicode_IS_READY(obj));
806 assert(!PyUnicode_IS_COMPACT(obj));
807 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200809 assert(unicode->data.any == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200810 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200811 /* Actually, it should neither be interned nor be anything else: */
812 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813
814#ifdef Py_DEBUG
815 ++unicode_ready_calls;
816#endif
817
818 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200819 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200820 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
823 if (maxchar < 256) {
824 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
825 if (!unicode->data.any) {
826 PyErr_NoMemory();
827 return -1;
828 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200829 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 _PyUnicode_WSTR(unicode), end,
831 PyUnicode_1BYTE_DATA(unicode));
832 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
834 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
835 if (maxchar < 128) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200836 _PyUnicode_UTF8(unicode) = unicode->data.any;
837 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838 }
839 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200840 _PyUnicode_UTF8(unicode) = NULL;
841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 }
843 PyObject_FREE(_PyUnicode_WSTR(unicode));
844 _PyUnicode_WSTR(unicode) = NULL;
845 _PyUnicode_WSTR_LENGTH(unicode) = 0;
846 }
847 /* In this case we might have to convert down from 4-byte native
848 wchar_t to 2-byte unicode. */
849 else if (maxchar < 65536) {
850 assert(num_surrogates == 0 &&
851 "FindMaxCharAndNumSurrogatePairs() messed up");
852
Victor Stinner506f5922011-09-28 22:34:18 +0200853#if SIZEOF_WCHAR_T == 2
854 /* We can share representations and are done. */
855 unicode->data.any = _PyUnicode_WSTR(unicode);
856 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
857 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
858 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200861#else
862 /* sizeof(wchar_t) == 4 */
863 unicode->data.any = PyObject_MALLOC(
864 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
865 if (!unicode->data.any) {
866 PyErr_NoMemory();
867 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 }
Victor Stinner506f5922011-09-28 22:34:18 +0200869 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
870 _PyUnicode_WSTR(unicode), end,
871 PyUnicode_2BYTE_DATA(unicode));
872 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
873 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
874 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200875 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200877 PyObject_FREE(_PyUnicode_WSTR(unicode));
878 _PyUnicode_WSTR(unicode) = NULL;
879 _PyUnicode_WSTR_LENGTH(unicode) = 0;
880#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 }
882 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
883 else {
884#if SIZEOF_WCHAR_T == 2
885 /* in case the native representation is 2-bytes, we need to allocate a
886 new normalized 4-byte version. */
887 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
888 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
889 if (!unicode->data.any) {
890 PyErr_NoMemory();
891 return -1;
892 }
893 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
894 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200895 _PyUnicode_UTF8(unicode) = NULL;
896 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
898 unicode) < 0) {
899 assert(0 && "ConvertWideCharToUCS4 failed");
900 return -1;
901 }
902 PyObject_FREE(_PyUnicode_WSTR(unicode));
903 _PyUnicode_WSTR(unicode) = NULL;
904 _PyUnicode_WSTR_LENGTH(unicode) = 0;
905#else
906 assert(num_surrogates == 0);
907
908 unicode->data.any = _PyUnicode_WSTR(unicode);
909 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200910 _PyUnicode_UTF8(unicode) = NULL;
911 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
913#endif
914 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
915 }
916 _PyUnicode_STATE(unicode).ready = 1;
917 return 0;
918}
919
Alexander Belopolsky40018472011-02-26 01:02:56 +0000920static void
921unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922{
Walter Dörwald16807132007-05-25 13:52:07 +0000923 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000924 case SSTATE_NOT_INTERNED:
925 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000926
Benjamin Peterson29060642009-01-31 22:14:21 +0000927 case SSTATE_INTERNED_MORTAL:
928 /* revive dead object temporarily for DelItem */
929 Py_REFCNT(unicode) = 3;
930 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
931 Py_FatalError(
932 "deletion of interned string failed");
933 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000934
Benjamin Peterson29060642009-01-31 22:14:21 +0000935 case SSTATE_INTERNED_IMMORTAL:
936 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000937
Benjamin Peterson29060642009-01-31 22:14:21 +0000938 default:
939 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000940 }
941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200942 if (_PyUnicode_WSTR(unicode) &&
943 (!PyUnicode_IS_READY(unicode) ||
944 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
945 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200946 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
947 && _PyUnicode_UTF8(unicode)
948 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
949 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950
951 if (PyUnicode_IS_COMPACT(unicode)) {
952 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000953 }
954 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 if (unicode->data.any)
956 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000957 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000958 }
959}
960
Alexander Belopolsky40018472011-02-26 01:02:56 +0000961static int
962_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000963{
964 register PyUnicodeObject *v;
965
966 /* Argument checks */
967 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000971 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
973 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000974 PyErr_BadInternalCall();
975 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000976 }
977
978 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 possible since these are being shared.
980 The same goes for new-representation unicode objects or objects which
981 have already been readied.
982 For these, we simply return a fresh copy with the same Unicode content.
983 */
984 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
985 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
986 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000987 PyUnicodeObject *w = _PyUnicode_New(length);
988 if (w == NULL)
989 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
991 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000992 Py_DECREF(*unicode);
993 *unicode = w;
994 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000995 }
996
997 /* Note that we don't have to modify *unicode for unshared Unicode
998 objects, since we can modify them in-place. */
999 return unicode_resize(v, length);
1000}
1001
Alexander Belopolsky40018472011-02-26 01:02:56 +00001002int
1003PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001004{
1005 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1006}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008static PyObject*
1009get_latin1_char(unsigned char ch)
1010{
1011 PyUnicodeObject *unicode = unicode_latin1[ch];
1012 if (!unicode) {
1013 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1014 if (!unicode)
1015 return NULL;
1016 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1017 unicode_latin1[ch] = unicode;
1018 }
1019 Py_INCREF(unicode);
1020 return (PyObject *)unicode;
1021}
1022
Alexander Belopolsky40018472011-02-26 01:02:56 +00001023PyObject *
1024PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025{
1026 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 Py_UCS4 maxchar = 0;
1028 Py_ssize_t num_surrogates;
1029
1030 if (u == NULL)
1031 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001033 /* If the Unicode data is known at construction time, we can apply
1034 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
1039 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 /* Single character Unicode objects in the Latin-1 range are
1043 shared when using this constructor */
1044 if (size == 1 && *u < 256)
1045 return get_latin1_char((unsigned char)*u);
1046
1047 /* If not empty and not single character, copy the Unicode data
1048 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001049 if (find_maxchar_surrogates(u, u + size,
1050 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 return NULL;
1052
1053 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1054 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (!unicode)
1056 return NULL;
1057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 switch (PyUnicode_KIND(unicode)) {
1059 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001060 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1062 break;
1063 case PyUnicode_2BYTE_KIND:
1064#if Py_UNICODE_SIZE == 2
1065 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1066#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001067 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1069#endif
1070 break;
1071 case PyUnicode_4BYTE_KIND:
1072#if SIZEOF_WCHAR_T == 2
1073 /* This is the only case which has to process surrogates, thus
1074 a simple copy loop is not enough and we need a function. */
1075 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1076 Py_DECREF(unicode);
1077 return NULL;
1078 }
1079#else
1080 assert(num_surrogates == 0);
1081 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1082#endif
1083 break;
1084 default:
1085 assert(0 && "Impossible state");
1086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
1088 return (PyObject *)unicode;
1089}
1090
Alexander Belopolsky40018472011-02-26 01:02:56 +00001091PyObject *
1092PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093{
1094 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 if (size < 0) {
1097 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 return NULL;
1100 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001101
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001103 some optimizations which share commonly used objects.
1104 Also, this means the input must be UTF-8, so fall back to the
1105 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001106 if (u != NULL) {
1107
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 /* Optimization for empty strings */
1109 if (size == 0 && unicode_empty != NULL) {
1110 Py_INCREF(unicode_empty);
1111 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001113
1114 /* Single characters are shared when using this constructor.
1115 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 if (size == 1 && Py_CHARMASK(*u) < 128)
1117 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001118
1119 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 }
1121
Walter Dörwald55507312007-05-18 13:12:10 +00001122 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001123 if (!unicode)
1124 return NULL;
1125
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001126 return (PyObject *)unicode;
1127}
1128
Alexander Belopolsky40018472011-02-26 01:02:56 +00001129PyObject *
1130PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001131{
1132 size_t size = strlen(u);
1133 if (size > PY_SSIZE_T_MAX) {
1134 PyErr_SetString(PyExc_OverflowError, "input too long");
1135 return NULL;
1136 }
1137
1138 return PyUnicode_FromStringAndSize(u, size);
1139}
1140
Victor Stinnere57b1c02011-09-28 22:20:48 +02001141static PyObject*
1142_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 PyObject *res;
1145 unsigned char max = 127;
1146 Py_ssize_t i;
1147 for (i = 0; i < size; i++) {
1148 if (u[i] & 0x80) {
1149 max = 255;
1150 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001151 }
1152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 res = PyUnicode_New(size, max);
1154 if (!res)
1155 return NULL;
1156 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1157 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001158}
1159
Victor Stinnere57b1c02011-09-28 22:20:48 +02001160static PyObject*
1161_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162{
1163 PyObject *res;
1164 Py_UCS2 max = 0;
1165 Py_ssize_t i;
1166 for (i = 0; i < size; i++)
1167 if (u[i] > max)
1168 max = u[i];
1169 res = PyUnicode_New(size, max);
1170 if (!res)
1171 return NULL;
1172 if (max >= 256)
1173 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1174 else
1175 for (i = 0; i < size; i++)
1176 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1177 return res;
1178}
1179
Victor Stinnere57b1c02011-09-28 22:20:48 +02001180static PyObject*
1181_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182{
1183 PyObject *res;
1184 Py_UCS4 max = 0;
1185 Py_ssize_t i;
1186 for (i = 0; i < size; i++)
1187 if (u[i] > max)
1188 max = u[i];
1189 res = PyUnicode_New(size, max);
1190 if (!res)
1191 return NULL;
1192 if (max >= 0x10000)
1193 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1194 else {
1195 int kind = PyUnicode_KIND(res);
1196 void *data = PyUnicode_DATA(res);
1197 for (i = 0; i < size; i++)
1198 PyUnicode_WRITE(kind, data, i, u[i]);
1199 }
1200 return res;
1201}
1202
1203PyObject*
1204PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1205{
1206 switch(kind) {
1207 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001208 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001210 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001212 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001214 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001215 return NULL;
1216}
1217
Victor Stinner034f6cf2011-09-30 02:26:44 +02001218PyObject*
1219PyUnicode_Copy(PyObject *unicode)
1220{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001221 Py_ssize_t size;
1222 PyObject *copy;
1223 void *data;
1224
Victor Stinner034f6cf2011-09-30 02:26:44 +02001225 if (!PyUnicode_Check(unicode)) {
1226 PyErr_BadInternalCall();
1227 return NULL;
1228 }
1229 if (PyUnicode_READY(unicode))
1230 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001231
1232 size = PyUnicode_GET_LENGTH(unicode);
1233 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1234 if (!copy)
1235 return NULL;
1236 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1237
1238 data = PyUnicode_DATA(unicode);
1239 switch (PyUnicode_KIND(unicode))
1240 {
1241 case PyUnicode_1BYTE_KIND:
1242 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1243 break;
1244 case PyUnicode_2BYTE_KIND:
1245 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1246 break;
1247 case PyUnicode_4BYTE_KIND:
1248 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1249 break;
1250 default:
1251 assert(0);
1252 break;
1253 }
1254 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001255}
1256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257
1258/* Widen Unicode objects to larger buffers.
1259 Return NULL if the string is too wide already. */
1260
1261void*
1262_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1263{
1264 Py_ssize_t i;
1265 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1266 void *d = PyUnicode_DATA(s);
1267 unsigned int skind = PyUnicode_KIND(s);
1268 if (PyUnicode_KIND(s) >= kind) {
1269 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1270 return NULL;
1271 }
1272 switch(kind) {
1273 case PyUnicode_2BYTE_KIND: {
1274 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1275 if (!result) {
1276 PyErr_NoMemory();
1277 return 0;
1278 }
1279 for (i = 0; i < len; i++)
1280 result[i] = ((Py_UCS1*)d)[i];
1281 return result;
1282 }
1283 case PyUnicode_4BYTE_KIND: {
1284 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1285 if (!result) {
1286 PyErr_NoMemory();
1287 return 0;
1288 }
1289 for (i = 0; i < len; i++)
1290 result[i] = PyUnicode_READ(skind, d, i);
1291 return result;
1292 }
1293 }
1294 Py_FatalError("invalid kind");
1295 return NULL;
1296}
1297
1298static Py_UCS4*
1299as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1300 int copy_null)
1301{
1302 int kind;
1303 void *data;
1304 Py_ssize_t len, targetlen;
1305 if (PyUnicode_READY(string) == -1)
1306 return NULL;
1307 kind = PyUnicode_KIND(string);
1308 data = PyUnicode_DATA(string);
1309 len = PyUnicode_GET_LENGTH(string);
1310 targetlen = len;
1311 if (copy_null)
1312 targetlen++;
1313 if (!target) {
1314 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1315 PyErr_NoMemory();
1316 return NULL;
1317 }
1318 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1319 if (!target) {
1320 PyErr_NoMemory();
1321 return NULL;
1322 }
1323 }
1324 else {
1325 if (targetsize < targetlen) {
1326 PyErr_Format(PyExc_SystemError,
1327 "string is longer than the buffer");
1328 if (copy_null && 0 < targetsize)
1329 target[0] = 0;
1330 return NULL;
1331 }
1332 }
1333 if (kind != PyUnicode_4BYTE_KIND) {
1334 Py_ssize_t i;
1335 for (i = 0; i < len; i++)
1336 target[i] = PyUnicode_READ(kind, data, i);
1337 }
1338 else
1339 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1340 if (copy_null)
1341 target[len] = 0;
1342 return target;
1343}
1344
1345Py_UCS4*
1346PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1347 int copy_null)
1348{
1349 if (target == NULL || targetsize < 1) {
1350 PyErr_BadInternalCall();
1351 return NULL;
1352 }
1353 return as_ucs4(string, target, targetsize, copy_null);
1354}
1355
1356Py_UCS4*
1357PyUnicode_AsUCS4Copy(PyObject *string)
1358{
1359 return as_ucs4(string, NULL, 0, 1);
1360}
1361
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` characters.
   A size of -1 means the length is taken from wcslen(w).  With w == NULL
   only size 0 is accepted and yields a new empty string; any other size is
   reported as a bad internal call.  Return NULL on failure. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = size;

        if (nchars == -1)
            nchars = wcslen(w);
        return PyUnicode_FromUnicode(w, nchars);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001382
Walter Dörwald346737f2007-05-31 10:44:43 +00001383static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001384makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1385 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001386{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001387 *fmt++ = '%';
1388 if (width) {
1389 if (zeropad)
1390 *fmt++ = '0';
1391 fmt += sprintf(fmt, "%d", width);
1392 }
1393 if (precision)
1394 fmt += sprintf(fmt, ".%d", precision);
1395 if (longflag)
1396 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001397 else if (longlongflag) {
1398 /* longlongflag should only ever be nonzero on machines with
1399 HAVE_LONG_LONG defined */
1400#ifdef HAVE_LONG_LONG
1401 char *f = PY_FORMAT_LONG_LONG;
1402 while (*f)
1403 *fmt++ = *f++;
1404#else
1405 /* we shouldn't ever get here */
1406 assert(0);
1407 *fmt++ = 'l';
1408#endif
1409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001410 else if (size_tflag) {
1411 char *f = PY_FORMAT_SIZE_T;
1412 while (*f)
1413 *fmt++ = *f++;
1414 }
1415 *fmt++ = c;
1416 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001417}
1418
/* helper for PyUnicode_FromFormatV()

   Parse the width, precision and length-modifier part of one conversion
   specification.  On entry `f` points at the '%'.  Each result is stored
   through its output pointer unless that pointer is NULL.  A length
   modifier ('l', 'll' where available, or 'z') is only recognised when it
   is directly followed by 'd', 'u' or 'i'.

   Return the position reached after the parsed part; for a bogus format
   that ends early the position is moved back by one character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, and the size_t flag. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    /* Hand the results back through whichever pointers were supplied. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1483
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001484/* maximum number of characters required for output of %ld. 21 characters
1485 allows for 64-bit integers (in decimal) and an optional sign. */
1486#define MAX_LONG_CHARS 21
1487/* maximum number of characters required for output of %lld.
1488 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1489 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1490#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1491
Walter Dörwaldd2034312007-05-18 16:29:38 +00001492PyObject *
1493PyUnicode_FromFormatV(const char *format, va_list vargs)
1494{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001495 va_list count;
1496 Py_ssize_t callcount = 0;
1497 PyObject **callresults = NULL;
1498 PyObject **callresult = NULL;
1499 Py_ssize_t n = 0;
1500 int width = 0;
1501 int precision = 0;
1502 int zeropad;
1503 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001505 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001506 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1508 Py_UCS4 argmaxchar;
1509 Py_ssize_t numbersize = 0;
1510 char *numberresults = NULL;
1511 char *numberresult = NULL;
1512 Py_ssize_t i;
1513 int kind;
1514 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001515
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001516 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001517 /* step 1: count the number of %S/%R/%A/%s format specifications
1518 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1519 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 * result in an array)
1521 * also esimate a upper bound for all the number formats in the string,
1522 * numbers will be formated in step 3 and be keept in a '\0'-separated
1523 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001524 for (f = format; *f; f++) {
1525 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001526 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1528 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1529 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1530 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001533#ifdef HAVE_LONG_LONG
1534 if (longlongflag) {
1535 if (width < MAX_LONG_LONG_CHARS)
1536 width = MAX_LONG_LONG_CHARS;
1537 }
1538 else
1539#endif
1540 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1541 including sign. Decimal takes the most space. This
1542 isn't enough for octal. If a width is specified we
1543 need more (which we allocate later). */
1544 if (width < MAX_LONG_CHARS)
1545 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546
1547 /* account for the size + '\0' to separate numbers
1548 inside of the numberresults buffer */
1549 numbersize += (width + 1);
1550 }
1551 }
1552 else if ((unsigned char)*f > 127) {
1553 PyErr_Format(PyExc_ValueError,
1554 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1555 "string, got a non-ASCII byte: 0x%02x",
1556 (unsigned char)*f);
1557 return NULL;
1558 }
1559 }
1560 /* step 2: allocate memory for the results of
1561 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1562 if (callcount) {
1563 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1564 if (!callresults) {
1565 PyErr_NoMemory();
1566 return NULL;
1567 }
1568 callresult = callresults;
1569 }
1570 /* step 2.5: allocate memory for the results of formating numbers */
1571 if (numbersize) {
1572 numberresults = PyObject_Malloc(numbersize);
1573 if (!numberresults) {
1574 PyErr_NoMemory();
1575 goto fail;
1576 }
1577 numberresult = numberresults;
1578 }
1579
1580 /* step 3: format numbers and figure out how large a buffer we need */
1581 for (f = format; *f; f++) {
1582 if (*f == '%') {
1583 const char* p;
1584 int longflag;
1585 int longlongflag;
1586 int size_tflag;
1587 int numprinted;
1588
1589 p = f;
1590 zeropad = (f[1] == '0');
1591 f = parse_format_flags(f, &width, &precision,
1592 &longflag, &longlongflag, &size_tflag);
1593 switch (*f) {
1594 case 'c':
1595 {
1596 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001597 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001598 n++;
1599 break;
1600 }
1601 case '%':
1602 n++;
1603 break;
1604 case 'i':
1605 case 'd':
1606 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1607 width, precision, *f);
1608 if (longflag)
1609 numprinted = sprintf(numberresult, fmt,
1610 va_arg(count, long));
1611#ifdef HAVE_LONG_LONG
1612 else if (longlongflag)
1613 numprinted = sprintf(numberresult, fmt,
1614 va_arg(count, PY_LONG_LONG));
1615#endif
1616 else if (size_tflag)
1617 numprinted = sprintf(numberresult, fmt,
1618 va_arg(count, Py_ssize_t));
1619 else
1620 numprinted = sprintf(numberresult, fmt,
1621 va_arg(count, int));
1622 n += numprinted;
1623 /* advance by +1 to skip over the '\0' */
1624 numberresult += (numprinted + 1);
1625 assert(*(numberresult - 1) == '\0');
1626 assert(*(numberresult - 2) != '\0');
1627 assert(numprinted >= 0);
1628 assert(numberresult <= numberresults + numbersize);
1629 break;
1630 case 'u':
1631 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1632 width, precision, 'u');
1633 if (longflag)
1634 numprinted = sprintf(numberresult, fmt,
1635 va_arg(count, unsigned long));
1636#ifdef HAVE_LONG_LONG
1637 else if (longlongflag)
1638 numprinted = sprintf(numberresult, fmt,
1639 va_arg(count, unsigned PY_LONG_LONG));
1640#endif
1641 else if (size_tflag)
1642 numprinted = sprintf(numberresult, fmt,
1643 va_arg(count, size_t));
1644 else
1645 numprinted = sprintf(numberresult, fmt,
1646 va_arg(count, unsigned int));
1647 n += numprinted;
1648 numberresult += (numprinted + 1);
1649 assert(*(numberresult - 1) == '\0');
1650 assert(*(numberresult - 2) != '\0');
1651 assert(numprinted >= 0);
1652 assert(numberresult <= numberresults + numbersize);
1653 break;
1654 case 'x':
1655 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1656 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1657 n += numprinted;
1658 numberresult += (numprinted + 1);
1659 assert(*(numberresult - 1) == '\0');
1660 assert(*(numberresult - 2) != '\0');
1661 assert(numprinted >= 0);
1662 assert(numberresult <= numberresults + numbersize);
1663 break;
1664 case 'p':
1665 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1666 /* %p is ill-defined: ensure leading 0x. */
1667 if (numberresult[1] == 'X')
1668 numberresult[1] = 'x';
1669 else if (numberresult[1] != 'x') {
1670 memmove(numberresult + 2, numberresult,
1671 strlen(numberresult) + 1);
1672 numberresult[0] = '0';
1673 numberresult[1] = 'x';
1674 numprinted += 2;
1675 }
1676 n += numprinted;
1677 numberresult += (numprinted + 1);
1678 assert(*(numberresult - 1) == '\0');
1679 assert(*(numberresult - 2) != '\0');
1680 assert(numprinted >= 0);
1681 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 break;
1683 case 's':
1684 {
1685 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001686 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001687 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1688 if (!str)
1689 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 /* since PyUnicode_DecodeUTF8 returns already flexible
1691 unicode objects, there is no need to call ready on them */
1692 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001693 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001695 /* Remember the str and switch to the next slot */
1696 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001697 break;
1698 }
1699 case 'U':
1700 {
1701 PyObject *obj = va_arg(count, PyObject *);
1702 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001703 if (PyUnicode_READY(obj) == -1)
1704 goto fail;
1705 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001706 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001708 break;
1709 }
1710 case 'V':
1711 {
1712 PyObject *obj = va_arg(count, PyObject *);
1713 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001714 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 assert(obj || str);
1716 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001717 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 if (PyUnicode_READY(obj) == -1)
1719 goto fail;
1720 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001721 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001723 *callresult++ = NULL;
1724 }
1725 else {
1726 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1727 if (!str_obj)
1728 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001730 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001732 *callresult++ = str_obj;
1733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 break;
1735 }
1736 case 'S':
1737 {
1738 PyObject *obj = va_arg(count, PyObject *);
1739 PyObject *str;
1740 assert(obj);
1741 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001743 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001745 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001747 /* Remember the str and switch to the next slot */
1748 *callresult++ = str;
1749 break;
1750 }
1751 case 'R':
1752 {
1753 PyObject *obj = va_arg(count, PyObject *);
1754 PyObject *repr;
1755 assert(obj);
1756 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001758 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001760 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001762 /* Remember the repr and switch to the next slot */
1763 *callresult++ = repr;
1764 break;
1765 }
1766 case 'A':
1767 {
1768 PyObject *obj = va_arg(count, PyObject *);
1769 PyObject *ascii;
1770 assert(obj);
1771 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001775 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 /* Remember the repr and switch to the next slot */
1778 *callresult++ = ascii;
1779 break;
1780 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 default:
1782 /* if we stumble upon an unknown
1783 formatting code, copy the rest of
1784 the format string to the output
1785 string. (we cannot just skip the
1786 code, since there's no way to know
1787 what's in the argument list) */
1788 n += strlen(p);
1789 goto expand;
1790 }
1791 } else
1792 n++;
1793 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 we don't have to resize the string.
1798 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 if (!string)
1801 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 kind = PyUnicode_KIND(string);
1803 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001809 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001810
1811 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1813 /* checking for == because the last argument could be a empty
1814 string, which causes i to point to end, the assert at the end of
1815 the loop */
1816 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001817
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 switch (*f) {
1819 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001820 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 const int ordinal = va_arg(vargs, int);
1822 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001824 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001825 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 case 'p':
1830 /* unused, since we already have the result */
1831 if (*f == 'p')
1832 (void) va_arg(vargs, void *);
1833 else
1834 (void) va_arg(vargs, int);
1835 /* extract the result from numberresults and append. */
1836 for (; *numberresult; ++i, ++numberresult)
1837 PyUnicode_WRITE(kind, data, i, *numberresult);
1838 /* skip over the separating '\0' */
1839 assert(*numberresult == '\0');
1840 numberresult++;
1841 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 break;
1843 case 's':
1844 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001845 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001847 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 size = PyUnicode_GET_LENGTH(*callresult);
1849 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001850 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1851 *callresult, 0,
1852 size) < 0)
1853 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001855 /* We're done with the unicode()/repr() => forget it */
1856 Py_DECREF(*callresult);
1857 /* switch to next unicode()/repr() result */
1858 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001859 break;
1860 }
1861 case 'U':
1862 {
1863 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 Py_ssize_t size;
1865 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1866 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001867 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1868 obj, 0,
1869 size) < 0)
1870 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 break;
1873 }
1874 case 'V':
1875 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001878 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001879 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 size = PyUnicode_GET_LENGTH(obj);
1881 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001882 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1883 obj, 0,
1884 size) < 0)
1885 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001887 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 size = PyUnicode_GET_LENGTH(*callresult);
1889 assert(PyUnicode_KIND(*callresult) <=
1890 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001891 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1892 *callresult,
1893 0, size) < 0)
1894 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001896 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001898 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 break;
1900 }
1901 case 'S':
1902 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001903 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001905 /* unused, since we already have the result */
1906 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001908 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1909 *callresult, 0,
1910 PyUnicode_GET_LENGTH(*callresult)) < 0)
1911 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 /* We're done with the unicode()/repr() => forget it */
1914 Py_DECREF(*callresult);
1915 /* switch to next unicode()/repr() result */
1916 ++callresult;
1917 break;
1918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001919 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001921 break;
1922 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 for (; *p; ++p, ++i)
1924 PyUnicode_WRITE(kind, data, i, *p);
1925 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 goto end;
1927 }
Victor Stinner1205f272010-09-11 00:54:47 +00001928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 else {
1930 assert(i < PyUnicode_GET_LENGTH(string));
1931 PyUnicode_WRITE(kind, data, i++, *f);
1932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001935
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001937 if (callresults)
1938 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (numberresults)
1940 PyObject_Free(numberresults);
1941 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001943 if (callresults) {
1944 PyObject **callresult2 = callresults;
1945 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001946 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001947 ++callresult2;
1948 }
1949 PyObject_Free(callresults);
1950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 if (numberresults)
1952 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001954}
1955
Walter Dörwaldd2034312007-05-18 16:29:38 +00001956PyObject *
1957PyUnicode_FromFormat(const char *format, ...)
1958{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001959 PyObject* ret;
1960 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001961
1962#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001963 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001964#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001965 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001966#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 ret = PyUnicode_FromFormatV(format, vargs);
1968 va_end(vargs);
1969 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001970}
1971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972#ifdef HAVE_WCHAR_H
1973
Victor Stinner5593d8a2010-10-02 11:11:27 +00001974/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1975 convert a Unicode object to a wide character string.
1976
Victor Stinnerd88d9832011-09-06 02:00:05 +02001977 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001978 character) required to convert the unicode object. Ignore size argument.
1979
Victor Stinnerd88d9832011-09-06 02:00:05 +02001980 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001981 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001982 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001983static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001984unicode_aswidechar(PyUnicodeObject *unicode,
1985 wchar_t *w,
1986 Py_ssize_t size)
1987{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001988 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 const wchar_t *wstr;
1990
1991 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1992 if (wstr == NULL)
1993 return -1;
1994
Victor Stinner5593d8a2010-10-02 11:11:27 +00001995 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001996 if (size > res)
1997 size = res + 1;
1998 else
1999 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002001 return res;
2002 }
2003 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002005}
2006
2007Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002008PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002009 wchar_t *w,
2010 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011{
2012 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002013 PyErr_BadInternalCall();
2014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002016 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017}
2018
Victor Stinner137c34c2010-09-29 10:25:54 +00002019wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002020PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002021 Py_ssize_t *size)
2022{
2023 wchar_t* buffer;
2024 Py_ssize_t buflen;
2025
2026 if (unicode == NULL) {
2027 PyErr_BadInternalCall();
2028 return NULL;
2029 }
2030
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002031 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (buflen == -1)
2033 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002034 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002035 PyErr_NoMemory();
2036 return NULL;
2037 }
2038
Victor Stinner137c34c2010-09-29 10:25:54 +00002039 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2040 if (buffer == NULL) {
2041 PyErr_NoMemory();
2042 return NULL;
2043 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002044 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 if (buflen == -1)
2046 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002047 if (size != NULL)
2048 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002049 return buffer;
2050}
2051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053
Alexander Belopolsky40018472011-02-26 01:02:56 +00002054PyObject *
2055PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002058 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 PyErr_SetString(PyExc_ValueError,
2060 "chr() arg not in range(0x110000)");
2061 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002062 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064 if (ordinal < 256)
2065 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 v = PyUnicode_New(1, ordinal);
2068 if (v == NULL)
2069 return NULL;
2070 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2071 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002072}
2073
Alexander Belopolsky40018472011-02-26 01:02:56 +00002074PyObject *
2075PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002077 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002079 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002080 if (PyUnicode_READY(obj))
2081 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 Py_INCREF(obj);
2083 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002084 }
2085 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002086 /* For a Unicode subtype that's not a Unicode object,
2087 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002088 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002089 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002090 PyErr_Format(PyExc_TypeError,
2091 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002092 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002093 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002094}
2095
Alexander Belopolsky40018472011-02-26 01:02:56 +00002096PyObject *
2097PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002098 const char *encoding,
2099 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002100{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002101 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002102 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002103
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 PyErr_BadInternalCall();
2106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002108
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002109 /* Decoding bytes objects is the most common case and should be fast */
2110 if (PyBytes_Check(obj)) {
2111 if (PyBytes_GET_SIZE(obj) == 0) {
2112 Py_INCREF(unicode_empty);
2113 v = (PyObject *) unicode_empty;
2114 }
2115 else {
2116 v = PyUnicode_Decode(
2117 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2118 encoding, errors);
2119 }
2120 return v;
2121 }
2122
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002123 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 PyErr_SetString(PyExc_TypeError,
2125 "decoding str is not supported");
2126 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002127 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002128
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002129 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2130 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2131 PyErr_Format(PyExc_TypeError,
2132 "coercing to str: need bytes, bytearray "
2133 "or buffer-like object, %.80s found",
2134 Py_TYPE(obj)->tp_name);
2135 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002136 }
Tim Petersced69f82003-09-16 20:30:58 +00002137
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002138 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002139 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002140 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141 }
Tim Petersced69f82003-09-16 20:30:58 +00002142 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002143 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002144
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002145 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002146 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
/* Write a normalized copy of encoding into lower in order to catch
   spellings such as UTF_8: characters for which Py_ISUPPER() is true are
   mapped through Py_TOLOWER(), '_' is replaced with '-', and everything
   else is copied unchanged.  lower_len is the capacity of lower, including
   the room for the terminating '\0'.

   Return 1 on success, with lower null-terminated.
   Return 0 on error: encoding is longer than lower_len-1, or lower_len is
   0.  In that case lower is not null-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room even for the terminator, lower_len - 1 below would wrap
       around and the bound check in the loop could never trigger. */
    if (lower_len == 0)
        return 0;

    l = lower;
    l_end = &lower[lower_len - 1];  /* slot reserved for the '\0' */
    for (e = encoding; *e != '\0'; e++, l++) {
        if (l == l_end)
            return 0;
        /* The macros only see a plain read of *e; the pointers advance in
           the loop header. */
        if (Py_ISUPPER(*e))
            *l = Py_TOLOWER(*e);
        else if (*e == '_')
            *l = '-';
        else
            *l = *e;
    }
    *l = '\0';
    return 1;
}
2181
Alexander Belopolsky40018472011-02-26 01:02:56 +00002182PyObject *
2183PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002184 Py_ssize_t size,
2185 const char *encoding,
2186 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002187{
2188 PyObject *buffer = NULL, *unicode;
2189 Py_buffer info;
2190 char lower[11]; /* Enough for any encoding shortcut */
2191
2192 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002193 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002194
2195 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002196 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002197 if ((strcmp(lower, "utf-8") == 0) ||
2198 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002199 return PyUnicode_DecodeUTF8(s, size, errors);
2200 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002201 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002202 (strcmp(lower, "iso-8859-1") == 0))
2203 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002204#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002205 else if (strcmp(lower, "mbcs") == 0)
2206 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002207#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002208 else if (strcmp(lower, "ascii") == 0)
2209 return PyUnicode_DecodeASCII(s, size, errors);
2210 else if (strcmp(lower, "utf-16") == 0)
2211 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2212 else if (strcmp(lower, "utf-32") == 0)
2213 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002217 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002218 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002219 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002220 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 if (buffer == NULL)
2222 goto onError;
2223 unicode = PyCodec_Decode(buffer, encoding, errors);
2224 if (unicode == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(unicode)) {
2227 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002228 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002229 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 Py_DECREF(unicode);
2231 goto onError;
2232 }
2233 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 if (PyUnicode_READY(unicode)) {
2235 Py_DECREF(unicode);
2236 return NULL;
2237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002239
Benjamin Peterson29060642009-01-31 22:14:21 +00002240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 Py_XDECREF(buffer);
2242 return NULL;
2243}
2244
Alexander Belopolsky40018472011-02-26 01:02:56 +00002245PyObject *
2246PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002247 const char *encoding,
2248 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002249{
2250 PyObject *v;
2251
2252 if (!PyUnicode_Check(unicode)) {
2253 PyErr_BadArgument();
2254 goto onError;
2255 }
2256
2257 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002258 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002259
2260 /* Decode via the codec registry */
2261 v = PyCodec_Decode(unicode, encoding, errors);
2262 if (v == NULL)
2263 goto onError;
2264 return v;
2265
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002267 return NULL;
2268}
2269
Alexander Belopolsky40018472011-02-26 01:02:56 +00002270PyObject *
2271PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002272 const char *encoding,
2273 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002274{
2275 PyObject *v;
2276
2277 if (!PyUnicode_Check(unicode)) {
2278 PyErr_BadArgument();
2279 goto onError;
2280 }
2281
2282 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002284
2285 /* Decode via the codec registry */
2286 v = PyCodec_Decode(unicode, encoding, errors);
2287 if (v == NULL)
2288 goto onError;
2289 if (!PyUnicode_Check(v)) {
2290 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002291 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002292 Py_TYPE(v)->tp_name);
2293 Py_DECREF(v);
2294 goto onError;
2295 }
2296 return v;
2297
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002299 return NULL;
2300}
2301
Alexander Belopolsky40018472011-02-26 01:02:56 +00002302PyObject *
2303PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002304 Py_ssize_t size,
2305 const char *encoding,
2306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307{
2308 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002309
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 unicode = PyUnicode_FromUnicode(s, size);
2311 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2314 Py_DECREF(unicode);
2315 return v;
2316}
2317
Alexander Belopolsky40018472011-02-26 01:02:56 +00002318PyObject *
2319PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002320 const char *encoding,
2321 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002322{
2323 PyObject *v;
2324
2325 if (!PyUnicode_Check(unicode)) {
2326 PyErr_BadArgument();
2327 goto onError;
2328 }
2329
2330 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002332
2333 /* Encode via the codec registry */
2334 v = PyCodec_Encode(unicode, encoding, errors);
2335 if (v == NULL)
2336 goto onError;
2337 return v;
2338
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002340 return NULL;
2341}
2342
Victor Stinnerad158722010-10-27 00:25:46 +00002343PyObject *
2344PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002345{
Victor Stinner99b95382011-07-04 14:23:54 +02002346#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002347 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2348 PyUnicode_GET_SIZE(unicode),
2349 NULL);
2350#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002352#else
Victor Stinner793b5312011-04-27 00:24:21 +02002353 PyInterpreterState *interp = PyThreadState_GET()->interp;
2354 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2355 cannot use it to encode and decode filenames before it is loaded. Load
2356 the Python codec requires to encode at least its own filename. Use the C
2357 version of the locale codec until the codec registry is initialized and
2358 the Python codec is loaded.
2359
2360 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2361 cannot only rely on it: check also interp->fscodec_initialized for
2362 subinterpreters. */
2363 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002364 return PyUnicode_AsEncodedString(unicode,
2365 Py_FileSystemDefaultEncoding,
2366 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002367 }
2368 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002369 /* locale encoding with surrogateescape */
2370 wchar_t *wchar;
2371 char *bytes;
2372 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002373 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002374
2375 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2376 if (wchar == NULL)
2377 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002378 bytes = _Py_wchar2char(wchar, &error_pos);
2379 if (bytes == NULL) {
2380 if (error_pos != (size_t)-1) {
2381 char *errmsg = strerror(errno);
2382 PyObject *exc = NULL;
2383 if (errmsg == NULL)
2384 errmsg = "Py_wchar2char() failed";
2385 raise_encode_exception(&exc,
2386 "filesystemencoding",
2387 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2388 error_pos, error_pos+1,
2389 errmsg);
2390 Py_XDECREF(exc);
2391 }
2392 else
2393 PyErr_NoMemory();
2394 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002395 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002396 }
2397 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002398
2399 bytes_obj = PyBytes_FromString(bytes);
2400 PyMem_Free(bytes);
2401 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002402 }
Victor Stinnerad158722010-10-27 00:25:46 +00002403#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002404}
2405
Alexander Belopolsky40018472011-02-26 01:02:56 +00002406PyObject *
2407PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002408 const char *encoding,
2409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410{
2411 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002412 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002413
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 if (!PyUnicode_Check(unicode)) {
2415 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417 }
Fred Drakee4315f52000-05-09 19:53:39 +00002418
Victor Stinner2f283c22011-03-02 01:21:46 +00002419 if (encoding == NULL) {
2420 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002424 }
Fred Drakee4315f52000-05-09 19:53:39 +00002425
2426 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002427 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002428 if ((strcmp(lower, "utf-8") == 0) ||
2429 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002430 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002431 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002433 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002435 }
Victor Stinner37296e82010-06-10 13:36:23 +00002436 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002437 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002438 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002440#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002441 else if (strcmp(lower, "mbcs") == 0)
2442 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2443 PyUnicode_GET_SIZE(unicode),
2444 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002445#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002446 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449
2450 /* Encode via the codec registry */
2451 v = PyCodec_Encode(unicode, encoding, errors);
2452 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002453 return NULL;
2454
2455 /* The normal path */
2456 if (PyBytes_Check(v))
2457 return v;
2458
2459 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002460 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002461 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002462 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002463
2464 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2465 "encoder %s returned bytearray instead of bytes",
2466 encoding);
2467 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002468 Py_DECREF(v);
2469 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002470 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002471
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002472 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2473 Py_DECREF(v);
2474 return b;
2475 }
2476
2477 PyErr_Format(PyExc_TypeError,
2478 "encoder did not return a bytes object (type=%.400s)",
2479 Py_TYPE(v)->tp_name);
2480 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002481 return NULL;
2482}
2483
Alexander Belopolsky40018472011-02-26 01:02:56 +00002484PyObject *
2485PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002486 const char *encoding,
2487 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002488{
2489 PyObject *v;
2490
2491 if (!PyUnicode_Check(unicode)) {
2492 PyErr_BadArgument();
2493 goto onError;
2494 }
2495
2496 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002498
2499 /* Encode via the codec registry */
2500 v = PyCodec_Encode(unicode, encoding, errors);
2501 if (v == NULL)
2502 goto onError;
2503 if (!PyUnicode_Check(v)) {
2504 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002505 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002506 Py_TYPE(v)->tp_name);
2507 Py_DECREF(v);
2508 goto onError;
2509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002511
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 return NULL;
2514}
2515
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002516PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002517PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002518 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002519 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2520}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002521
Christian Heimes5894ba72007-11-04 11:43:14 +00002522PyObject*
2523PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2524{
Victor Stinner99b95382011-07-04 14:23:54 +02002525#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002526 return PyUnicode_DecodeMBCS(s, size, NULL);
2527#elif defined(__APPLE__)
2528 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2529#else
Victor Stinner793b5312011-04-27 00:24:21 +02002530 PyInterpreterState *interp = PyThreadState_GET()->interp;
2531 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2532 cannot use it to encode and decode filenames before it is loaded. Load
2533 the Python codec requires to encode at least its own filename. Use the C
2534 version of the locale codec until the codec registry is initialized and
2535 the Python codec is loaded.
2536
2537 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2538 cannot only rely on it: check also interp->fscodec_initialized for
2539 subinterpreters. */
2540 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002541 return PyUnicode_Decode(s, size,
2542 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002543 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002544 }
2545 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002546 /* locale encoding with surrogateescape */
2547 wchar_t *wchar;
2548 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002549 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002550
2551 if (s[size] != '\0' || size != strlen(s)) {
2552 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2553 return NULL;
2554 }
2555
Victor Stinner168e1172010-10-16 23:16:16 +00002556 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002557 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002558 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002559
Victor Stinner168e1172010-10-16 23:16:16 +00002560 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002561 PyMem_Free(wchar);
2562 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002563 }
Victor Stinnerad158722010-10-27 00:25:46 +00002564#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002565}
2566
Martin v. Löwis011e8422009-05-05 04:43:17 +00002567
2568int
2569PyUnicode_FSConverter(PyObject* arg, void* addr)
2570{
2571 PyObject *output = NULL;
2572 Py_ssize_t size;
2573 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002574 if (arg == NULL) {
2575 Py_DECREF(*(PyObject**)addr);
2576 return 1;
2577 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002578 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002579 output = arg;
2580 Py_INCREF(output);
2581 }
2582 else {
2583 arg = PyUnicode_FromObject(arg);
2584 if (!arg)
2585 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002586 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002587 Py_DECREF(arg);
2588 if (!output)
2589 return 0;
2590 if (!PyBytes_Check(output)) {
2591 Py_DECREF(output);
2592 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2593 return 0;
2594 }
2595 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002596 size = PyBytes_GET_SIZE(output);
2597 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002598 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002599 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002600 Py_DECREF(output);
2601 return 0;
2602 }
2603 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002604 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002605}
2606
2607
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002608int
2609PyUnicode_FSDecoder(PyObject* arg, void* addr)
2610{
2611 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002612 if (arg == NULL) {
2613 Py_DECREF(*(PyObject**)addr);
2614 return 1;
2615 }
2616 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 if (PyUnicode_READY(arg))
2618 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002619 output = arg;
2620 Py_INCREF(output);
2621 }
2622 else {
2623 arg = PyBytes_FromObject(arg);
2624 if (!arg)
2625 return 0;
2626 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2627 PyBytes_GET_SIZE(arg));
2628 Py_DECREF(arg);
2629 if (!output)
2630 return 0;
2631 if (!PyUnicode_Check(output)) {
2632 Py_DECREF(output);
2633 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2634 return 0;
2635 }
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2638 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002639 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2640 Py_DECREF(output);
2641 return 0;
2642 }
2643 *(PyObject**)addr = output;
2644 return Py_CLEANUP_SUPPORTED;
2645}
2646
2647
Martin v. Löwis5b222132007-06-10 09:51:05 +00002648char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002650{
Christian Heimesf3863112007-11-22 07:46:41 +00002651 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2653
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002654 if (!PyUnicode_Check(unicode)) {
2655 PyErr_BadArgument();
2656 return NULL;
2657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002659 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002661 if (PyUnicode_UTF8(unicode) == NULL) {
2662 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2664 if (bytes == NULL)
2665 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002666 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2667 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 Py_DECREF(bytes);
2669 return NULL;
2670 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002671 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2672 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 Py_DECREF(bytes);
2674 }
2675
2676 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002677 *psize = PyUnicode_UTF8_LENGTH(unicode);
2678 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002679}
2680
2681char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2685}
2686
2687#ifdef Py_DEBUG
2688int unicode_as_unicode_calls = 0;
2689#endif
2690
2691
2692Py_UNICODE *
2693PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2694{
2695 PyUnicodeObject *u;
2696 const unsigned char *one_byte;
2697#if SIZEOF_WCHAR_T == 4
2698 const Py_UCS2 *two_bytes;
2699#else
2700 const Py_UCS4 *four_bytes;
2701 const Py_UCS4 *ucs4_end;
2702 Py_ssize_t num_surrogates;
2703#endif
2704 wchar_t *w;
2705 wchar_t *wchar_end;
2706
2707 if (!PyUnicode_Check(unicode)) {
2708 PyErr_BadArgument();
2709 return NULL;
2710 }
2711 u = (PyUnicodeObject*)unicode;
2712 if (_PyUnicode_WSTR(u) == NULL) {
2713 /* Non-ASCII compact unicode object */
2714 assert(_PyUnicode_KIND(u) != 0);
2715 assert(PyUnicode_IS_READY(u));
2716
2717#ifdef Py_DEBUG
2718 ++unicode_as_unicode_calls;
2719#endif
2720
2721 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2722#if SIZEOF_WCHAR_T == 2
2723 four_bytes = PyUnicode_4BYTE_DATA(u);
2724 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2725 num_surrogates = 0;
2726
2727 for (; four_bytes < ucs4_end; ++four_bytes) {
2728 if (*four_bytes > 0xFFFF)
2729 ++num_surrogates;
2730 }
2731
2732 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2733 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2734 if (!_PyUnicode_WSTR(u)) {
2735 PyErr_NoMemory();
2736 return NULL;
2737 }
2738 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2739
2740 w = _PyUnicode_WSTR(u);
2741 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2742 four_bytes = PyUnicode_4BYTE_DATA(u);
2743 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2744 if (*four_bytes > 0xFFFF) {
2745 /* encode surrogate pair in this case */
2746 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2747 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2748 }
2749 else
2750 *w = *four_bytes;
2751
2752 if (w > wchar_end) {
2753 assert(0 && "Miscalculated string end");
2754 }
2755 }
2756 *w = 0;
2757#else
2758 /* sizeof(wchar_t) == 4 */
2759 Py_FatalError("Impossible unicode object state, wstr and str "
2760 "should share memory already.");
2761 return NULL;
2762#endif
2763 }
2764 else {
2765 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2766 (_PyUnicode_LENGTH(u) + 1));
2767 if (!_PyUnicode_WSTR(u)) {
2768 PyErr_NoMemory();
2769 return NULL;
2770 }
2771 if (!PyUnicode_IS_COMPACT_ASCII(u))
2772 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2773 w = _PyUnicode_WSTR(u);
2774 wchar_end = w + _PyUnicode_LENGTH(u);
2775
2776 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2777 one_byte = PyUnicode_1BYTE_DATA(u);
2778 for (; w < wchar_end; ++one_byte, ++w)
2779 *w = *one_byte;
2780 /* null-terminate the wstr */
2781 *w = 0;
2782 }
2783 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2784#if SIZEOF_WCHAR_T == 4
2785 two_bytes = PyUnicode_2BYTE_DATA(u);
2786 for (; w < wchar_end; ++two_bytes, ++w)
2787 *w = *two_bytes;
2788 /* null-terminate the wstr */
2789 *w = 0;
2790#else
2791 /* sizeof(wchar_t) == 2 */
2792 PyObject_FREE(_PyUnicode_WSTR(u));
2793 _PyUnicode_WSTR(u) = NULL;
2794 Py_FatalError("Impossible unicode object state, wstr "
2795 "and str should share memory already.");
2796 return NULL;
2797#endif
2798 }
2799 else {
2800 assert(0 && "This should never happen.");
2801 }
2802 }
2803 }
2804 if (size != NULL)
2805 *size = PyUnicode_WSTR_LENGTH(u);
2806 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002807}
2808
Alexander Belopolsky40018472011-02-26 01:02:56 +00002809Py_UNICODE *
2810PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813}
2814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816Py_ssize_t
2817PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818{
2819 if (!PyUnicode_Check(unicode)) {
2820 PyErr_BadArgument();
2821 goto onError;
2822 }
2823 return PyUnicode_GET_SIZE(unicode);
2824
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 return -1;
2827}
2828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829Py_ssize_t
2830PyUnicode_GetLength(PyObject *unicode)
2831{
2832 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2833 PyErr_BadArgument();
2834 return -1;
2835 }
2836
2837 return PyUnicode_GET_LENGTH(unicode);
2838}
2839
2840Py_UCS4
2841PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2842{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002843 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2844 PyErr_BadArgument();
2845 return (Py_UCS4)-1;
2846 }
2847 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2848 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849 return (Py_UCS4)-1;
2850 }
2851 return PyUnicode_READ_CHAR(unicode, index);
2852}
2853
2854int
2855PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2856{
2857 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2858 return PyErr_BadArgument();
2859 return -1;
2860 }
2861
2862 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2863 index, ch);
2864 return 0;
2865}
2866
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
2872
Victor Stinner554f3f02010-06-16 23:33:54 +00002873/* create or adjust a UnicodeDecodeError */
2874static void
2875make_decode_exception(PyObject **exceptionObject,
2876 const char *encoding,
2877 const char *input, Py_ssize_t length,
2878 Py_ssize_t startpos, Py_ssize_t endpos,
2879 const char *reason)
2880{
2881 if (*exceptionObject == NULL) {
2882 *exceptionObject = PyUnicodeDecodeError_Create(
2883 encoding, input, length, startpos, endpos, reason);
2884 }
2885 else {
2886 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2887 goto onError;
2888 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2889 goto onError;
2890 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2891 goto onError;
2892 }
2893 return;
2894
2895onError:
2896 Py_DECREF(*exceptionObject);
2897 *exceptionObject = NULL;
2898}
2899
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900/* error handling callback helper:
2901 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002902 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 and adjust various state variables.
2904 return 0 on success, -1 on error
2905*/
2906
Alexander Belopolsky40018472011-02-26 01:02:56 +00002907static int
2908unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002909 const char *encoding, const char *reason,
2910 const char **input, const char **inend, Py_ssize_t *startinpos,
2911 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2912 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002914 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915
2916 PyObject *restuple = NULL;
2917 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002918 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002919 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002920 Py_ssize_t requiredsize;
2921 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002923 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002924 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 int res = -1;
2926
2927 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 *errorHandler = PyCodec_LookupError(errors);
2929 if (*errorHandler == NULL)
2930 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 }
2932
Victor Stinner554f3f02010-06-16 23:33:54 +00002933 make_decode_exception(exceptionObject,
2934 encoding,
2935 *input, *inend - *input,
2936 *startinpos, *endinpos,
2937 reason);
2938 if (*exceptionObject == NULL)
2939 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940
2941 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2942 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002944 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002945 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 }
2948 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002950
2951 /* Copy back the bytes variables, which might have been modified by the
2952 callback */
2953 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2954 if (!inputobj)
2955 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002956 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002957 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002958 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002959 *input = PyBytes_AS_STRING(inputobj);
2960 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002961 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002962 /* we can DECREF safely, as the exception has another reference,
2963 so the object won't go away. */
2964 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002968 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2970 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002971 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972
2973 /* need more space? (at least enough for what we
2974 have+the replacement+the rest of the string (starting
2975 at the new input position), so we won't have to check space
2976 when there are no errors in the rest of the string) */
2977 repptr = PyUnicode_AS_UNICODE(repunicode);
2978 repsize = PyUnicode_GET_SIZE(repunicode);
2979 requiredsize = *outpos + repsize + insize-newpos;
2980 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 if (requiredsize<2*outsize)
2982 requiredsize = 2*outsize;
2983 if (_PyUnicode_Resize(output, requiredsize) < 0)
2984 goto onError;
2985 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002986 }
2987 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002988 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002989 Py_UNICODE_COPY(*outptr, repptr, repsize);
2990 *outptr += repsize;
2991 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002992
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993 /* we made it! */
2994 res = 0;
2995
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002997 Py_XDECREF(restuple);
2998 return res;
2999}
3000
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each one expands its argument
   several times, so the argument must not have side effects. */

/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value (0..63)?
   The result is meaningless when IS_BASE64(c) is false: every character
   that is not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :     \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :     \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  Higher bits of
   n are masked off, so callers may pass an unmasked bit buffer. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3027
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly.  The argument is
 * expanded twice and must not have side effects. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
3035
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index,
 * so any character value may be passed.  NUL is never encoded directly.
 * c is expanded several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 Py_ssize_t size,
3085 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003086{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003087 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3088}
3089
Antoine Pitrou244651a2009-05-04 18:56:13 +00003090/* The decoder. The only state we preserve is our read position,
3091 * i.e. how many characters we have consumed. So if we end in the
3092 * middle of a shift sequence we have to back off the read position
3093 * and the output to the beginning of the sequence, otherwise we lose
3094 * all the shift state (seen bits, number of bits seen, high
3095 * surrogate). */
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 Py_ssize_t size,
3100 const char *errors,
3101 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003102{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003104 Py_ssize_t startinpos;
3105 Py_ssize_t endinpos;
3106 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003107 const char *e;
3108 PyUnicodeObject *unicode;
3109 Py_UNICODE *p;
3110 const char *errmsg = "";
3111 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003112 Py_UNICODE *shiftOutStart;
3113 unsigned int base64bits = 0;
3114 unsigned long base64buffer = 0;
3115 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 PyObject *errorHandler = NULL;
3117 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003118
3119 unicode = _PyUnicode_New(size);
3120 if (!unicode)
3121 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003122 if (size == 0) {
3123 if (consumed)
3124 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003125 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003126 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003128 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003129 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003130 e = s + size;
3131
3132 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003135 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003136
Antoine Pitrou244651a2009-05-04 18:56:13 +00003137 if (inShift) { /* in a base-64 section */
3138 if (IS_BASE64(ch)) { /* consume a base-64 character */
3139 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3140 base64bits += 6;
3141 s++;
3142 if (base64bits >= 16) {
3143 /* we have enough bits for a UTF-16 value */
3144 Py_UNICODE outCh = (Py_UNICODE)
3145 (base64buffer >> (base64bits-16));
3146 base64bits -= 16;
3147 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3148 if (surrogate) {
3149 /* expecting a second surrogate */
3150 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3151#ifdef Py_UNICODE_WIDE
3152 *p++ = (((surrogate & 0x3FF)<<10)
3153 | (outCh & 0x3FF)) + 0x10000;
3154#else
3155 *p++ = surrogate;
3156 *p++ = outCh;
3157#endif
3158 surrogate = 0;
3159 }
3160 else {
3161 surrogate = 0;
3162 errmsg = "second surrogate missing";
3163 goto utf7Error;
3164 }
3165 }
3166 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3167 /* first surrogate */
3168 surrogate = outCh;
3169 }
3170 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3171 errmsg = "unexpected second surrogate";
3172 goto utf7Error;
3173 }
3174 else {
3175 *p++ = outCh;
3176 }
3177 }
3178 }
3179 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003180 inShift = 0;
3181 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003182 if (surrogate) {
3183 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003184 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003185 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003186 if (base64bits > 0) { /* left-over bits */
3187 if (base64bits >= 6) {
3188 /* We've seen at least one base-64 character */
3189 errmsg = "partial character in shift sequence";
3190 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003191 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003192 else {
3193 /* Some bits remain; they should be zero */
3194 if (base64buffer != 0) {
3195 errmsg = "non-zero padding bits in shift sequence";
3196 goto utf7Error;
3197 }
3198 }
3199 }
3200 if (ch != '-') {
3201 /* '-' is absorbed; other terminating
3202 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003203 *p++ = ch;
3204 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 }
3206 }
3207 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003209 s++; /* consume '+' */
3210 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003211 s++;
3212 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003213 }
3214 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003215 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003216 shiftOutStart = p;
3217 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003218 }
3219 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003220 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003221 *p++ = ch;
3222 s++;
3223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003224 else {
3225 startinpos = s-starts;
3226 s++;
3227 errmsg = "unexpected special character";
3228 goto utf7Error;
3229 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003231utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 outpos = p-PyUnicode_AS_UNICODE(unicode);
3233 endinpos = s-starts;
3234 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 errors, &errorHandler,
3236 "utf7", errmsg,
3237 &starts, &e, &startinpos, &endinpos, &exc, &s,
3238 &unicode, &outpos, &p))
3239 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003240 }
3241
Antoine Pitrou244651a2009-05-04 18:56:13 +00003242 /* end of string */
3243
3244 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3245 /* if we're in an inconsistent state, that's an error */
3246 if (surrogate ||
3247 (base64bits >= 6) ||
3248 (base64bits > 0 && base64buffer != 0)) {
3249 outpos = p-PyUnicode_AS_UNICODE(unicode);
3250 endinpos = size;
3251 if (unicode_decode_call_errorhandler(
3252 errors, &errorHandler,
3253 "utf7", "unterminated shift sequence",
3254 &starts, &e, &startinpos, &endinpos, &exc, &s,
3255 &unicode, &outpos, &p))
3256 goto onError;
3257 if (s < e)
3258 goto restart;
3259 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003260 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003261
3262 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003263 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003264 if (inShift) {
3265 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003266 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003267 }
3268 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003269 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003270 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003271 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003273 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003274 goto onError;
3275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 Py_XDECREF(errorHandler);
3277 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278 if (PyUnicode_READY(unicode) == -1) {
3279 Py_DECREF(unicode);
3280 return NULL;
3281 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003282 return (PyObject *)unicode;
3283
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 Py_XDECREF(errorHandler);
3286 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003287 Py_DECREF(unicode);
3288 return NULL;
3289}
3290
3291
Alexander Belopolsky40018472011-02-26 01:02:56 +00003292PyObject *
3293PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003294 Py_ssize_t size,
3295 int base64SetO,
3296 int base64WhiteSpace,
3297 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003298{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003299 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003300 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003301 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003302 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003303 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003304 unsigned int base64bits = 0;
3305 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003306 char * out;
3307 char * start;
3308
3309 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003311
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003312 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003313 return PyErr_NoMemory();
3314
Antoine Pitrou244651a2009-05-04 18:56:13 +00003315 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 if (v == NULL)
3317 return NULL;
3318
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003319 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320 for (;i < size; ++i) {
3321 Py_UNICODE ch = s[i];
3322
Antoine Pitrou244651a2009-05-04 18:56:13 +00003323 if (inShift) {
3324 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3325 /* shifting out */
3326 if (base64bits) { /* output remaining bits */
3327 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3328 base64buffer = 0;
3329 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003330 }
3331 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003332 /* Characters not in the BASE64 set implicitly unshift the sequence
3333 so no '-' is required, except if the character is itself a '-' */
3334 if (IS_BASE64(ch) || ch == '-') {
3335 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003336 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003337 *out++ = (char) ch;
3338 }
3339 else {
3340 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003341 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003342 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003343 else { /* not in a shift sequence */
3344 if (ch == '+') {
3345 *out++ = '+';
3346 *out++ = '-';
3347 }
3348 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3349 *out++ = (char) ch;
3350 }
3351 else {
3352 *out++ = '+';
3353 inShift = 1;
3354 goto encode_char;
3355 }
3356 }
3357 continue;
3358encode_char:
3359#ifdef Py_UNICODE_WIDE
3360 if (ch >= 0x10000) {
3361 /* code first surrogate */
3362 base64bits += 16;
3363 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3364 while (base64bits >= 6) {
3365 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3366 base64bits -= 6;
3367 }
3368 /* prepare second surrogate */
3369 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3370 }
3371#endif
3372 base64bits += 16;
3373 base64buffer = (base64buffer << 16) | ch;
3374 while (base64bits >= 6) {
3375 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3376 base64bits -= 6;
3377 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003379 if (base64bits)
3380 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3381 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003382 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003383 if (_PyBytes_Resize(&v, out - start) < 0)
3384 return NULL;
3385 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003386}
3387
/* The UTF-7 helper macros are only meant for the UTF-7 codec; undefine
   them so that they are not visible to the code that follows. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003393
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   Continuation bytes (80-BF), the overlong-only prefixes C0-C1 and the
   out-of-range prefixes F5-FF are all illegal as a first byte.  The
   table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3417
Alexander Belopolsky40018472011-02-26 01:02:56 +00003418PyObject *
3419PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003420 Py_ssize_t size,
3421 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422{
Walter Dörwald69652032004-09-07 20:24:22 +00003423 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3424}
3425
Antoine Pitrouab868312009-01-10 15:40:25 +00003426/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3427#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3428
3429/* Mask to quickly check whether a C 'long' contains a
3430 non-ASCII, UTF8-encoded char. */
3431#if (SIZEOF_LONG == 8)
3432# define ASCII_CHAR_MASK 0x8080808080808080L
3433#elif (SIZEOF_LONG == 4)
3434# define ASCII_CHAR_MASK 0x80808080L
3435#else
3436# error C 'long' size should be either 4 or 8!
3437#endif
3438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003439/* Scans a UTF-8 string and returns the maximum character to be expected,
3440 the size of the decoded unicode string and if any major errors were
3441 encountered.
3442
3443 This function does check basic UTF-8 sanity, it does however NOT CHECK
3444 if the string contains surrogates, and if all continuation bytes are
3445 within the correct ranges, these checks are performed in
3446 PyUnicode_DecodeUTF8Stateful.
3447
3448 If it sets has_errors to 1, it means the value of unicode_size and max_char
3449 will be bogus and you should not rely on useful information in them.
3450 */
3451static Py_UCS4
3452utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3453 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3454 int *has_errors)
3455{
3456 Py_ssize_t n;
3457 Py_ssize_t char_count = 0;
3458 Py_UCS4 max_char = 127, new_max;
3459 Py_UCS4 upper_bound;
3460 const unsigned char *p = (const unsigned char *)s;
3461 const unsigned char *end = p + string_size;
3462 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3463 int err = 0;
3464
3465 for (; p < end && !err; ++p, ++char_count) {
3466 /* Only check value if it's not a ASCII char... */
3467 if (*p < 0x80) {
3468 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3469 an explanation. */
3470 if (!((size_t) p & LONG_PTR_MASK)) {
3471 /* Help register allocation */
3472 register const unsigned char *_p = p;
3473 while (_p < aligned_end) {
3474 unsigned long value = *(unsigned long *) _p;
3475 if (value & ASCII_CHAR_MASK)
3476 break;
3477 _p += SIZEOF_LONG;
3478 char_count += SIZEOF_LONG;
3479 }
3480 p = _p;
3481 if (p == end)
3482 break;
3483 }
3484 }
3485 if (*p >= 0x80) {
3486 n = utf8_code_length[*p];
3487 new_max = max_char;
3488 switch (n) {
3489 /* invalid start byte */
3490 case 0:
3491 err = 1;
3492 break;
3493 case 2:
3494 /* Code points between 0x00FF and 0x07FF inclusive.
3495 Approximate the upper bound of the code point,
3496 if this flips over 255 we can be sure it will be more
3497 than 255 and the string will need 2 bytes per code coint,
3498 if it stays under or equal to 255, we can be sure 1 byte
3499 is enough.
3500 ((*p & 0b00011111) << 6) | 0b00111111 */
3501 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3502 if (max_char < upper_bound)
3503 new_max = upper_bound;
3504 /* Ensure we track at least that we left ASCII space. */
3505 if (new_max < 128)
3506 new_max = 128;
3507 break;
3508 case 3:
3509 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3510 always > 255 and <= 65535 and will always need 2 bytes. */
3511 if (max_char < 65535)
3512 new_max = 65535;
3513 break;
3514 case 4:
3515 /* Code point will be above 0xFFFF for sure in this case. */
3516 new_max = 65537;
3517 break;
3518 /* Internal error, this should be caught by the first if */
3519 case 1:
3520 default:
3521 assert(0 && "Impossible case in utf8_max_char_and_size");
3522 err = 1;
3523 }
3524 /* Instead of number of overall bytes for this code point,
3525 n containts the number of following bytes: */
3526 --n;
3527 /* Check if the follow up chars are all valid continuation bytes */
3528 if (n >= 1) {
3529 const unsigned char *cont;
3530 if ((p + n) >= end) {
3531 if (consumed == 0)
3532 /* incomplete data, non-incremental decoding */
3533 err = 1;
3534 break;
3535 }
3536 for (cont = p + 1; cont < (p + n); ++cont) {
3537 if ((*cont & 0xc0) != 0x80) {
3538 err = 1;
3539 break;
3540 }
3541 }
3542 p += n;
3543 }
3544 else
3545 err = 1;
3546 max_char = new_max;
3547 }
3548 }
3549
3550 if (unicode_size)
3551 *unicode_size = char_count;
3552 if (has_errors)
3553 *has_errors = err;
3554 return max_char;
3555}
3556
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, interpreting `buf` as an
   array of Py_UNICODE, unsigned char, Py_UCS2 or Py_UCS4 depending on
   `kind`; any kind other than the first three is treated as 4-byte.
   `kind` is evaluated exactly once, and `buf`, `index` and `value` are
   evaluated only in the branch that is taken, so an argument such as
   i++ is safe. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
3571
Alexander Belopolsky40018472011-02-26 01:02:56 +00003572PyObject *
3573PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 Py_ssize_t size,
3575 const char *errors,
3576 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003580 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003581 Py_ssize_t startinpos;
3582 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003583 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003585 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 PyObject *errorHandler = NULL;
3587 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 Py_UCS4 maxchar = 0;
3589 Py_ssize_t unicode_size;
3590 Py_ssize_t i;
3591 int kind;
3592 void *data;
3593 int has_errors;
3594 Py_UNICODE *error_outptr;
3595#if SIZEOF_WCHAR_T == 2
3596 Py_ssize_t wchar_offset = 0;
3597#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598
Walter Dörwald69652032004-09-07 20:24:22 +00003599 if (size == 0) {
3600 if (consumed)
3601 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3605 consumed, &has_errors);
3606 if (has_errors) {
3607 unicode = _PyUnicode_New(size);
3608 if (!unicode)
3609 return NULL;
3610 kind = PyUnicode_WCHAR_KIND;
3611 data = PyUnicode_AS_UNICODE(unicode);
3612 assert(data != NULL);
3613 }
3614 else {
3615 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3616 if (!unicode)
3617 return NULL;
3618 /* When the string is ASCII only, just use memcpy and return.
3619 unicode_size may be != size if there is an incomplete UTF-8
3620 sequence at the end of the ASCII block. */
3621 if (maxchar < 128 && size == unicode_size) {
3622 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3623 return (PyObject *)unicode;
3624 }
3625 kind = PyUnicode_KIND(unicode);
3626 data = PyUnicode_DATA(unicode);
3627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003629 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003631 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
3633 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003634 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635
3636 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003637 /* Fast path for runs of ASCII characters. Given that common UTF-8
3638 input will consist of an overwhelming majority of ASCII
3639 characters, we try to optimize for this case by checking
3640 as many characters as a C 'long' can contain.
3641 First, check if we can do an aligned read, as most CPUs have
3642 a penalty for unaligned reads.
3643 */
3644 if (!((size_t) s & LONG_PTR_MASK)) {
3645 /* Help register allocation */
3646 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003648 while (_s < aligned_end) {
3649 /* Read a whole long at a time (either 4 or 8 bytes),
3650 and do a fast unrolled copy if it only contains ASCII
3651 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003652 unsigned long value = *(unsigned long *) _s;
3653 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003654 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003655 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3656 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3657 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3658 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003659#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003660 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3661 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3662 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3663 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003664#endif
3665 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003666 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003667 }
3668 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003670 if (s == e)
3671 break;
3672 ch = (unsigned char)*s;
3673 }
3674 }
3675
3676 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 s++;
3679 continue;
3680 }
3681
3682 n = utf8_code_length[ch];
3683
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003684 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 if (consumed)
3686 break;
3687 else {
3688 errmsg = "unexpected end of data";
3689 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003690 endinpos = startinpos+1;
3691 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3692 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 goto utf8Error;
3694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
3697 switch (n) {
3698
3699 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003700 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 startinpos = s-starts;
3702 endinpos = startinpos+1;
3703 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704
3705 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003706 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 startinpos = s-starts;
3708 endinpos = startinpos+1;
3709 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710
3711 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003712 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003713 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003715 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 goto utf8Error;
3717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003719 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 break;
3722
3723 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003724 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3725 will result in surrogates in range d800-dfff. Surrogates are
3726 not valid UTF-8 so they are rejected.
3727 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3728 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003729 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003730 (s[2] & 0xc0) != 0x80 ||
3731 ((unsigned char)s[0] == 0xE0 &&
3732 (unsigned char)s[1] < 0xA0) ||
3733 ((unsigned char)s[0] == 0xED &&
3734 (unsigned char)s[1] > 0x9F)) {
3735 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003737 endinpos = startinpos + 1;
3738
3739 /* if s[1] first two bits are 1 and 0, then the invalid
3740 continuation byte is s[2], so increment endinpos by 1,
3741 if not, s[1] is invalid and endinpos doesn't need to
3742 be incremented. */
3743 if ((s[1] & 0xC0) == 0x80)
3744 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 goto utf8Error;
3746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003748 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003750 break;
3751
3752 case 4:
3753 if ((s[1] & 0xc0) != 0x80 ||
3754 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003755 (s[3] & 0xc0) != 0x80 ||
3756 ((unsigned char)s[0] == 0xF0 &&
3757 (unsigned char)s[1] < 0x90) ||
3758 ((unsigned char)s[0] == 0xF4 &&
3759 (unsigned char)s[1] > 0x8F)) {
3760 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003762 endinpos = startinpos + 1;
3763 if ((s[1] & 0xC0) == 0x80) {
3764 endinpos++;
3765 if ((s[2] & 0xC0) == 0x80)
3766 endinpos++;
3767 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 goto utf8Error;
3769 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003770 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003771 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3772 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 /* If the string is flexible or we have native UCS-4, write
3775 directly.. */
3776 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3777 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 else {
3780 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* translate from 10000..10FFFF to 0..FFFF */
3783 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 /* high surrogate = top 10 bits added to D800 */
3786 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3787 (Py_UNICODE)(0xD800 + (ch >> 10)));
3788
3789 /* low surrogate = bottom 10 bits added to DC00 */
3790 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3791 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3792 }
3793#if SIZEOF_WCHAR_T == 2
3794 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003795#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 }
3798 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003800
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802 /* If this is not yet a resizable string, make it one.. */
3803 if (kind != PyUnicode_WCHAR_KIND) {
3804 const Py_UNICODE *u;
3805 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3806 if (!new_unicode)
3807 goto onError;
3808 u = PyUnicode_AsUnicode((PyObject *)unicode);
3809 if (!u)
3810 goto onError;
3811#if SIZEOF_WCHAR_T == 2
3812 i += wchar_offset;
3813#endif
3814 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3815 Py_DECREF(unicode);
3816 unicode = new_unicode;
3817 kind = 0;
3818 data = PyUnicode_AS_UNICODE(new_unicode);
3819 assert(data != NULL);
3820 }
3821 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 if (unicode_decode_call_errorhandler(
3823 errors, &errorHandler,
3824 "utf8", errmsg,
3825 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 /* Update data because unicode_decode_call_errorhandler might have
3829 re-created or resized the unicode object. */
3830 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833 /* Ensure the unicode_size calculation above was correct: */
3834 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3835
Walter Dörwald69652032004-09-07 20:24:22 +00003836 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 /* Adjust length and ready string when it contained errors and
3840 is of the old resizable kind. */
3841 if (kind == PyUnicode_WCHAR_KIND) {
3842 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3843 PyUnicode_READY(unicode) == -1)
3844 goto onError;
3845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003847 Py_XDECREF(errorHandler);
3848 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 if (PyUnicode_READY(unicode) == -1) {
3850 Py_DECREF(unicode);
3851 return NULL;
3852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 return (PyObject *)unicode;
3854
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 Py_XDECREF(errorHandler);
3857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 Py_DECREF(unicode);
3859 return NULL;
3860}
3861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003863
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003864#ifdef __APPLE__
3865
3866/* Simplified UTF-8 decoder using surrogateescape error handler,
3867 used to decode the command line arguments on Mac OS X. */
3868
3869wchar_t*
3870_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3871{
3872 int n;
3873 const char *e;
3874 wchar_t *unicode, *p;
3875
3876 /* Note: size will always be longer than the resulting Unicode
3877 character count */
3878 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3879 PyErr_NoMemory();
3880 return NULL;
3881 }
3882 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3883 if (!unicode)
3884 return NULL;
3885
3886 /* Unpack UTF-8 encoded data */
3887 p = unicode;
3888 e = s + size;
3889 while (s < e) {
3890 Py_UCS4 ch = (unsigned char)*s;
3891
3892 if (ch < 0x80) {
3893 *p++ = (wchar_t)ch;
3894 s++;
3895 continue;
3896 }
3897
3898 n = utf8_code_length[ch];
3899 if (s + n > e) {
3900 goto surrogateescape;
3901 }
3902
3903 switch (n) {
3904 case 0:
3905 case 1:
3906 goto surrogateescape;
3907
3908 case 2:
3909 if ((s[1] & 0xc0) != 0x80)
3910 goto surrogateescape;
3911 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3912 assert ((ch > 0x007F) && (ch <= 0x07FF));
3913 *p++ = (wchar_t)ch;
3914 break;
3915
3916 case 3:
3917 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3918 will result in surrogates in range d800-dfff. Surrogates are
3919 not valid UTF-8 so they are rejected.
3920 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3921 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3922 if ((s[1] & 0xc0) != 0x80 ||
3923 (s[2] & 0xc0) != 0x80 ||
3924 ((unsigned char)s[0] == 0xE0 &&
3925 (unsigned char)s[1] < 0xA0) ||
3926 ((unsigned char)s[0] == 0xED &&
3927 (unsigned char)s[1] > 0x9F)) {
3928
3929 goto surrogateescape;
3930 }
3931 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3932 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003934 break;
3935
3936 case 4:
3937 if ((s[1] & 0xc0) != 0x80 ||
3938 (s[2] & 0xc0) != 0x80 ||
3939 (s[3] & 0xc0) != 0x80 ||
3940 ((unsigned char)s[0] == 0xF0 &&
3941 (unsigned char)s[1] < 0x90) ||
3942 ((unsigned char)s[0] == 0xF4 &&
3943 (unsigned char)s[1] > 0x8F)) {
3944 goto surrogateescape;
3945 }
3946 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3947 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3948 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3949
3950#if SIZEOF_WCHAR_T == 4
3951 *p++ = (wchar_t)ch;
3952#else
3953 /* compute and append the two surrogates: */
3954
3955 /* translate from 10000..10FFFF to 0..FFFF */
3956 ch -= 0x10000;
3957
3958 /* high surrogate = top 10 bits added to D800 */
3959 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3960
3961 /* low surrogate = bottom 10 bits added to DC00 */
3962 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3963#endif
3964 break;
3965 }
3966 s += n;
3967 continue;
3968
3969 surrogateescape:
3970 *p++ = 0xDC00 + ch;
3971 s++;
3972 }
3973 *p = L'\0';
3974 return unicode;
3975}
3976
3977#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979/* Primary internal function which creates utf8 encoded bytes objects.
3980
3981 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003982 and allocate exactly as much space needed at the end. Else allocate the
3983 maximum possible needed (4 result bytes per Unicode character), and return
3984 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003985*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003986PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
Tim Peters602f7402002-04-27 18:03:26 +00003989#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003990
Guido van Rossum98297ee2007-11-06 21:34:58 +00003991 Py_ssize_t i; /* index into s of next input byte */
3992 PyObject *result; /* result string object */
3993 char *p; /* next free byte in output buffer */
3994 Py_ssize_t nallocated; /* number of result bytes allocated */
3995 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003996 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003997 PyObject *errorHandler = NULL;
3998 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 int kind;
4000 void *data;
4001 Py_ssize_t size;
4002 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4003#if SIZEOF_WCHAR_T == 2
4004 Py_ssize_t wchar_offset = 0;
4005#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 if (!PyUnicode_Check(unicode)) {
4008 PyErr_BadArgument();
4009 return NULL;
4010 }
4011
4012 if (PyUnicode_READY(unicode) == -1)
4013 return NULL;
4014
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004015 if (PyUnicode_UTF8(unicode))
4016 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4017 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018
4019 kind = PyUnicode_KIND(unicode);
4020 data = PyUnicode_DATA(unicode);
4021 size = PyUnicode_GET_LENGTH(unicode);
4022
Tim Peters602f7402002-04-27 18:03:26 +00004023 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024
Tim Peters602f7402002-04-27 18:03:26 +00004025 if (size <= MAX_SHORT_UNICHARS) {
4026 /* Write into the stack buffer; nallocated can't overflow.
4027 * At the end, we'll allocate exactly as much heap space as it
4028 * turns out we need.
4029 */
4030 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004031 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004032 p = stackbuf;
4033 }
4034 else {
4035 /* Overallocate on the heap, and give the excess back at the end. */
4036 nallocated = size * 4;
4037 if (nallocated / 4 != size) /* overflow! */
4038 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004039 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004040 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004041 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004042 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004043 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004044
Tim Peters602f7402002-04-27 18:03:26 +00004045 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004047
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004048 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004049 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004053 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004054 *p++ = (char)(0xc0 | (ch >> 6));
4055 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004056 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 Py_ssize_t newpos;
4058 PyObject *rep;
4059 Py_ssize_t repsize, k, startpos;
4060 startpos = i-1;
4061#if SIZEOF_WCHAR_T == 2
4062 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004063#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 rep = unicode_encode_call_errorhandler(
4065 errors, &errorHandler, "utf-8", "surrogates not allowed",
4066 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4067 &exc, startpos, startpos+1, &newpos);
4068 if (!rep)
4069 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 if (PyBytes_Check(rep))
4072 repsize = PyBytes_GET_SIZE(rep);
4073 else
4074 repsize = PyUnicode_GET_SIZE(rep);
4075
4076 if (repsize > 4) {
4077 Py_ssize_t offset;
4078
4079 if (result == NULL)
4080 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004081 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4085 /* integer overflow */
4086 PyErr_NoMemory();
4087 goto error;
4088 }
4089 nallocated += repsize - 4;
4090 if (result != NULL) {
4091 if (_PyBytes_Resize(&result, nallocated) < 0)
4092 goto error;
4093 } else {
4094 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004095 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 goto error;
4097 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4098 }
4099 p = PyBytes_AS_STRING(result) + offset;
4100 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 if (PyBytes_Check(rep)) {
4103 char *prep = PyBytes_AS_STRING(rep);
4104 for(k = repsize; k > 0; k--)
4105 *p++ = *prep++;
4106 } else /* rep is unicode */ {
4107 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4108 Py_UNICODE c;
4109
4110 for(k=0; k<repsize; k++) {
4111 c = prep[k];
4112 if (0x80 <= c) {
4113 raise_encode_exception(&exc, "utf-8",
4114 PyUnicode_AS_UNICODE(unicode),
4115 size, i-1, i,
4116 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004117 goto error;
4118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004120 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004123 } else if (ch < 0x10000) {
4124 *p++ = (char)(0xe0 | (ch >> 12));
4125 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4126 *p++ = (char)(0x80 | (ch & 0x3f));
4127 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004128 /* Encode UCS4 Unicode ordinals */
4129 *p++ = (char)(0xf0 | (ch >> 18));
4130 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4131 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4132 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133#if SIZEOF_WCHAR_T == 2
4134 wchar_offset++;
4135#endif
Tim Peters602f7402002-04-27 18:03:26 +00004136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004138
Guido van Rossum98297ee2007-11-06 21:34:58 +00004139 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004140 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004141 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004142 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004143 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004144 }
4145 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004146 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004147 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004148 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004149 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004152 Py_XDECREF(errorHandler);
4153 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004154 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004155 error:
4156 Py_XDECREF(errorHandler);
4157 Py_XDECREF(exc);
4158 Py_XDECREF(result);
4159 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004160
Tim Peters602f7402002-04-27 18:03:26 +00004161#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162}
4163
Alexander Belopolsky40018472011-02-26 01:02:56 +00004164PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4166 Py_ssize_t size,
4167 const char *errors)
4168{
4169 PyObject *v, *unicode;
4170
4171 unicode = PyUnicode_FromUnicode(s, size);
4172 if (unicode == NULL)
4173 return NULL;
4174 v = _PyUnicode_AsUTF8String(unicode, errors);
4175 Py_DECREF(unicode);
4176 return v;
4177}
4178
4179PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004180PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183}
4184
Walter Dörwald41980ca2007-08-16 21:55:45 +00004185/* --- UTF-32 Codec ------------------------------------------------------- */
4186
4187PyObject *
4188PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 Py_ssize_t size,
4190 const char *errors,
4191 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004192{
4193 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4194}
4195
4196PyObject *
4197PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 Py_ssize_t size,
4199 const char *errors,
4200 int *byteorder,
4201 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004202{
4203 const char *starts = s;
4204 Py_ssize_t startinpos;
4205 Py_ssize_t endinpos;
4206 Py_ssize_t outpos;
4207 PyUnicodeObject *unicode;
4208 Py_UNICODE *p;
4209#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004210 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004211 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004212#else
4213 const int pairs = 0;
4214#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004215 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004216 int bo = 0; /* assume native ordering by default */
4217 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004218 /* Offsets from q for retrieving bytes in the right order. */
4219#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4220 int iorder[] = {0, 1, 2, 3};
4221#else
4222 int iorder[] = {3, 2, 1, 0};
4223#endif
4224 PyObject *errorHandler = NULL;
4225 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004226
Walter Dörwald41980ca2007-08-16 21:55:45 +00004227 q = (unsigned char *)s;
4228 e = q + size;
4229
4230 if (byteorder)
4231 bo = *byteorder;
4232
4233 /* Check for BOM marks (U+FEFF) in the input and adjust current
4234 byte order setting accordingly. In native mode, the leading BOM
4235 mark is skipped, in all other modes, it is copied to the output
4236 stream as-is (giving a ZWNBSP character). */
4237 if (bo == 0) {
4238 if (size >= 4) {
4239 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 if (bom == 0x0000FEFF) {
4243 q += 4;
4244 bo = -1;
4245 }
4246 else if (bom == 0xFFFE0000) {
4247 q += 4;
4248 bo = 1;
4249 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004250#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 if (bom == 0x0000FEFF) {
4252 q += 4;
4253 bo = 1;
4254 }
4255 else if (bom == 0xFFFE0000) {
4256 q += 4;
4257 bo = -1;
4258 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004259#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004261 }
4262
4263 if (bo == -1) {
4264 /* force LE */
4265 iorder[0] = 0;
4266 iorder[1] = 1;
4267 iorder[2] = 2;
4268 iorder[3] = 3;
4269 }
4270 else if (bo == 1) {
4271 /* force BE */
4272 iorder[0] = 3;
4273 iorder[1] = 2;
4274 iorder[2] = 1;
4275 iorder[3] = 0;
4276 }
4277
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004278 /* On narrow builds we split characters outside the BMP into two
4279 codepoints => count how much extra space we need. */
4280#ifndef Py_UNICODE_WIDE
4281 for (qq = q; qq < e; qq += 4)
4282 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4283 pairs++;
4284#endif
4285
4286 /* This might be one to much, because of a BOM */
4287 unicode = _PyUnicode_New((size+3)/4+pairs);
4288 if (!unicode)
4289 return NULL;
4290 if (size == 0)
4291 return (PyObject *)unicode;
4292
4293 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004295
Walter Dörwald41980ca2007-08-16 21:55:45 +00004296 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 Py_UCS4 ch;
4298 /* remaining bytes at the end? (size should be divisible by 4) */
4299 if (e-q<4) {
4300 if (consumed)
4301 break;
4302 errmsg = "truncated data";
4303 startinpos = ((const char *)q)-starts;
4304 endinpos = ((const char *)e)-starts;
4305 goto utf32Error;
4306 /* The remaining input chars are ignored if the callback
4307 chooses to skip the input */
4308 }
4309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004311
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 if (ch >= 0x110000)
4313 {
4314 errmsg = "codepoint not in range(0x110000)";
4315 startinpos = ((const char *)q)-starts;
4316 endinpos = startinpos+4;
4317 goto utf32Error;
4318 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004319#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 if (ch >= 0x10000)
4321 {
4322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4324 }
4325 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004326#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 *p++ = ch;
4328 q += 4;
4329 continue;
4330 utf32Error:
4331 outpos = p-PyUnicode_AS_UNICODE(unicode);
4332 if (unicode_decode_call_errorhandler(
4333 errors, &errorHandler,
4334 "utf32", errmsg,
4335 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4336 &unicode, &outpos, &p))
4337 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004338 }
4339
4340 if (byteorder)
4341 *byteorder = bo;
4342
4343 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004345
4346 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348 goto onError;
4349
4350 Py_XDECREF(errorHandler);
4351 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 if (PyUnicode_READY(unicode) == -1) {
4353 Py_DECREF(unicode);
4354 return NULL;
4355 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004356 return (PyObject *)unicode;
4357
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359 Py_DECREF(unicode);
4360 Py_XDECREF(errorHandler);
4361 Py_XDECREF(exc);
4362 return NULL;
4363}
4364
4365PyObject *
4366PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_ssize_t size,
4368 const char *errors,
4369 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004371 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004372 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004373 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004374#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004375 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004376#else
4377 const int pairs = 0;
4378#endif
4379 /* Offsets from p for storing byte pairs in the right order. */
4380#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4381 int iorder[] = {0, 1, 2, 3};
4382#else
4383 int iorder[] = {3, 2, 1, 0};
4384#endif
4385
Benjamin Peterson29060642009-01-31 22:14:21 +00004386#define STORECHAR(CH) \
4387 do { \
4388 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4389 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4390 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4391 p[iorder[0]] = (CH) & 0xff; \
4392 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004393 } while(0)
4394
4395 /* In narrow builds we can output surrogate pairs as one codepoint,
4396 so we need less space. */
4397#ifndef Py_UNICODE_WIDE
4398 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4400 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4401 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004402#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004403 nsize = (size - pairs + (byteorder == 0));
4404 bytesize = nsize * 4;
4405 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004407 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004408 if (v == NULL)
4409 return NULL;
4410
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004411 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004412 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004414 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004415 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004416
4417 if (byteorder == -1) {
4418 /* force LE */
4419 iorder[0] = 0;
4420 iorder[1] = 1;
4421 iorder[2] = 2;
4422 iorder[3] = 3;
4423 }
4424 else if (byteorder == 1) {
4425 /* force BE */
4426 iorder[0] = 3;
4427 iorder[1] = 2;
4428 iorder[2] = 1;
4429 iorder[3] = 0;
4430 }
4431
4432 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004434#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4436 Py_UCS4 ch2 = *s;
4437 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4438 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4439 s++;
4440 size--;
4441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004442 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004443#endif
4444 STORECHAR(ch);
4445 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004446
4447 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004448 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004449#undef STORECHAR
4450}
4451
Alexander Belopolsky40018472011-02-26 01:02:56 +00004452PyObject *
4453PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004454{
4455 if (!PyUnicode_Check(unicode)) {
4456 PyErr_BadArgument();
4457 return NULL;
4458 }
4459 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 PyUnicode_GET_SIZE(unicode),
4461 NULL,
4462 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004463}
4464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465/* --- UTF-16 Codec ------------------------------------------------------- */
4466
Tim Peters772747b2001-08-09 22:21:55 +00004467PyObject *
4468PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 Py_ssize_t size,
4470 const char *errors,
4471 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472{
Walter Dörwald69652032004-09-07 20:24:22 +00004473 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4474}
4475
Antoine Pitrouab868312009-01-10 15:40:25 +00004476/* Two masks for fast checking of whether a C 'long' may contain
4477 UTF16-encoded surrogate characters. This is an efficient heuristic,
4478 assuming that non-surrogate characters with a code point >= 0x8000 are
4479 rare in most input.
4480 FAST_CHAR_MASK is used when the input is in native byte ordering,
4481 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004482*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004483#if (SIZEOF_LONG == 8)
4484# define FAST_CHAR_MASK 0x8000800080008000L
4485# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4486#elif (SIZEOF_LONG == 4)
4487# define FAST_CHAR_MASK 0x80008000L
4488# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4489#else
4490# error C 'long' size should be either 4 or 8!
4491#endif
4492
Walter Dörwald69652032004-09-07 20:24:22 +00004493PyObject *
4494PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 Py_ssize_t size,
4496 const char *errors,
4497 int *byteorder,
4498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004501 Py_ssize_t startinpos;
4502 Py_ssize_t endinpos;
4503 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 PyUnicodeObject *unicode;
4505 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004506 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004507 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004508 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004509 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004510 /* Offsets from q for retrieving byte pairs in the right order. */
4511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4512 int ihi = 1, ilo = 0;
4513#else
4514 int ihi = 0, ilo = 1;
4515#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 PyObject *errorHandler = NULL;
4517 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
4519 /* Note: size will always be longer than the resulting Unicode
4520 character count */
4521 unicode = _PyUnicode_New(size);
4522 if (!unicode)
4523 return NULL;
4524 if (size == 0)
4525 return (PyObject *)unicode;
4526
4527 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004529 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004530 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
4532 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004533 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004535 /* Check for BOM marks (U+FEFF) in the input and adjust current
4536 byte order setting accordingly. In native mode, the leading BOM
4537 mark is skipped, in all other modes, it is copied to the output
4538 stream as-is (giving a ZWNBSP character). */
4539 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004540 if (size >= 2) {
4541 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004542#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 if (bom == 0xFEFF) {
4544 q += 2;
4545 bo = -1;
4546 }
4547 else if (bom == 0xFFFE) {
4548 q += 2;
4549 bo = 1;
4550 }
Tim Petersced69f82003-09-16 20:30:58 +00004551#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 if (bom == 0xFEFF) {
4553 q += 2;
4554 bo = 1;
4555 }
4556 else if (bom == 0xFFFE) {
4557 q += 2;
4558 bo = -1;
4559 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004560#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563
Tim Peters772747b2001-08-09 22:21:55 +00004564 if (bo == -1) {
4565 /* force LE */
4566 ihi = 1;
4567 ilo = 0;
4568 }
4569 else if (bo == 1) {
4570 /* force BE */
4571 ihi = 0;
4572 ilo = 1;
4573 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004574#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4575 native_ordering = ilo < ihi;
4576#else
4577 native_ordering = ilo > ihi;
4578#endif
Tim Peters772747b2001-08-09 22:21:55 +00004579
Antoine Pitrouab868312009-01-10 15:40:25 +00004580 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004581 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004583 /* First check for possible aligned read of a C 'long'. Unaligned
4584 reads are more expensive, better to defer to another iteration. */
4585 if (!((size_t) q & LONG_PTR_MASK)) {
4586 /* Fast path for runs of non-surrogate chars. */
4587 register const unsigned char *_q = q;
4588 Py_UNICODE *_p = p;
4589 if (native_ordering) {
4590 /* Native ordering is simple: as long as the input cannot
4591 possibly contain a surrogate char, do an unrolled copy
4592 of several 16-bit code points to the target object.
4593 The non-surrogate check is done on several input bytes
4594 at a time (as many as a C 'long' can contain). */
4595 while (_q < aligned_end) {
4596 unsigned long data = * (unsigned long *) _q;
4597 if (data & FAST_CHAR_MASK)
4598 break;
4599 _p[0] = ((unsigned short *) _q)[0];
4600 _p[1] = ((unsigned short *) _q)[1];
4601#if (SIZEOF_LONG == 8)
4602 _p[2] = ((unsigned short *) _q)[2];
4603 _p[3] = ((unsigned short *) _q)[3];
4604#endif
4605 _q += SIZEOF_LONG;
4606 _p += SIZEOF_LONG / 2;
4607 }
4608 }
4609 else {
4610 /* Byteswapped ordering is similar, but we must decompose
4611 the copy bytewise, and take care of zero'ing out the
4612 upper bytes if the target object is in 32-bit units
4613 (that is, in UCS-4 builds). */
4614 while (_q < aligned_end) {
4615 unsigned long data = * (unsigned long *) _q;
4616 if (data & SWAPPED_FAST_CHAR_MASK)
4617 break;
4618 /* Zero upper bytes in UCS-4 builds */
4619#if (Py_UNICODE_SIZE > 2)
4620 _p[0] = 0;
4621 _p[1] = 0;
4622#if (SIZEOF_LONG == 8)
4623 _p[2] = 0;
4624 _p[3] = 0;
4625#endif
4626#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004627 /* Issue #4916; UCS-4 builds on big endian machines must
4628 fill the two last bytes of each 4-byte unit. */
4629#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4630# define OFF 2
4631#else
4632# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004633#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004634 ((unsigned char *) _p)[OFF + 1] = _q[0];
4635 ((unsigned char *) _p)[OFF + 0] = _q[1];
4636 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4637 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4638#if (SIZEOF_LONG == 8)
4639 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4640 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4641 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4642 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4643#endif
4644#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004645 _q += SIZEOF_LONG;
4646 _p += SIZEOF_LONG / 2;
4647 }
4648 }
4649 p = _p;
4650 q = _q;
4651 if (q >= e)
4652 break;
4653 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655
Benjamin Peterson14339b62009-01-31 16:36:08 +00004656 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004657
4658 if (ch < 0xD800 || ch > 0xDFFF) {
4659 *p++ = ch;
4660 continue;
4661 }
4662
4663 /* UTF-16 code pair: */
4664 if (q > e) {
4665 errmsg = "unexpected end of data";
4666 startinpos = (((const char *)q) - 2) - starts;
4667 endinpos = ((const char *)e) + 1 - starts;
4668 goto utf16Error;
4669 }
4670 if (0xD800 <= ch && ch <= 0xDBFF) {
4671 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4672 q += 2;
4673 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004674#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 *p++ = ch;
4676 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004677#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004679#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 continue;
4681 }
4682 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004683 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 startinpos = (((const char *)q)-4)-starts;
4685 endinpos = startinpos+2;
4686 goto utf16Error;
4687 }
4688
Benjamin Peterson14339b62009-01-31 16:36:08 +00004689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 errmsg = "illegal encoding";
4691 startinpos = (((const char *)q)-2)-starts;
4692 endinpos = startinpos+2;
4693 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004694
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 utf16Error:
4696 outpos = p - PyUnicode_AS_UNICODE(unicode);
4697 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004698 errors,
4699 &errorHandler,
4700 "utf16", errmsg,
4701 &starts,
4702 (const char **)&e,
4703 &startinpos,
4704 &endinpos,
4705 &exc,
4706 (const char **)&q,
4707 &unicode,
4708 &outpos,
4709 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004712 /* remaining byte at the end? (size should be even) */
4713 if (e == q) {
4714 if (!consumed) {
4715 errmsg = "truncated data";
4716 startinpos = ((const char *)q) - starts;
4717 endinpos = ((const char *)e) + 1 - starts;
4718 outpos = p - PyUnicode_AS_UNICODE(unicode);
4719 if (unicode_decode_call_errorhandler(
4720 errors,
4721 &errorHandler,
4722 "utf16", errmsg,
4723 &starts,
4724 (const char **)&e,
4725 &startinpos,
4726 &endinpos,
4727 &exc,
4728 (const char **)&q,
4729 &unicode,
4730 &outpos,
4731 &p))
4732 goto onError;
4733 /* The remaining input chars are ignored if the callback
4734 chooses to skip the input */
4735 }
4736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737
4738 if (byteorder)
4739 *byteorder = bo;
4740
Walter Dörwald69652032004-09-07 20:24:22 +00004741 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004743
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004745 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 goto onError;
4747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 Py_XDECREF(errorHandler);
4749 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750 if (PyUnicode_READY(unicode) == -1) {
4751 Py_DECREF(unicode);
4752 return NULL;
4753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 return (PyObject *)unicode;
4755
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 return NULL;
4761}
4762
Antoine Pitrouab868312009-01-10 15:40:25 +00004763#undef FAST_CHAR_MASK
4764#undef SWAPPED_FAST_CHAR_MASK
4765
Tim Peters772747b2001-08-09 22:21:55 +00004766PyObject *
4767PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 Py_ssize_t size,
4769 const char *errors,
4770 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004772 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004773 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004774 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004775#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004776 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004777#else
4778 const int pairs = 0;
4779#endif
Tim Peters772747b2001-08-09 22:21:55 +00004780 /* Offsets from p for storing byte pairs in the right order. */
4781#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4782 int ihi = 1, ilo = 0;
4783#else
4784 int ihi = 0, ilo = 1;
4785#endif
4786
Benjamin Peterson29060642009-01-31 22:14:21 +00004787#define STORECHAR(CH) \
4788 do { \
4789 p[ihi] = ((CH) >> 8) & 0xff; \
4790 p[ilo] = (CH) & 0xff; \
4791 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004792 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004794#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004795 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 if (s[i] >= 0x10000)
4797 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004798#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004799 /* 2 * (size + pairs + (byteorder == 0)) */
4800 if (size > PY_SSIZE_T_MAX ||
4801 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004803 nsize = size + pairs + (byteorder == 0);
4804 bytesize = nsize * 2;
4805 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 if (v == NULL)
4809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004811 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004814 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004815 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004816
4817 if (byteorder == -1) {
4818 /* force LE */
4819 ihi = 1;
4820 ilo = 0;
4821 }
4822 else if (byteorder == 1) {
4823 /* force BE */
4824 ihi = 0;
4825 ilo = 1;
4826 }
4827
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004828 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 Py_UNICODE ch = *s++;
4830 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004831#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 if (ch >= 0x10000) {
4833 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4834 ch = 0xD800 | ((ch-0x10000) >> 10);
4835 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004836#endif
Tim Peters772747b2001-08-09 22:21:55 +00004837 STORECHAR(ch);
4838 if (ch2)
4839 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004840 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004841
4842 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004843 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004844#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
Alexander Belopolsky40018472011-02-26 01:02:56 +00004847PyObject *
4848PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
4850 if (!PyUnicode_Check(unicode)) {
4851 PyErr_BadArgument();
4852 return NULL;
4853 }
4854 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 PyUnicode_GET_SIZE(unicode),
4856 NULL,
4857 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858}
4859
4860/* --- Unicode Escape Codec ----------------------------------------------- */
4861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004862/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4863 if all the escapes in the string make it still a valid ASCII string.
4864 Returns -1 if any escapes were found which cause the string to
4865 pop out of ASCII range. Otherwise returns the length of the
4866 required buffer to hold the string.
4867 */
4868Py_ssize_t
4869length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4870{
4871 const unsigned char *p = (const unsigned char *)s;
4872 const unsigned char *end = p + size;
4873 Py_ssize_t length = 0;
4874
4875 if (size < 0)
4876 return -1;
4877
4878 for (; p < end; ++p) {
4879 if (*p > 127) {
4880 /* Non-ASCII */
4881 return -1;
4882 }
4883 else if (*p != '\\') {
4884 /* Normal character */
4885 ++length;
4886 }
4887 else {
4888 /* Backslash-escape, check next char */
4889 ++p;
4890 /* Escape sequence reaches till end of string or
4891 non-ASCII follow-up. */
4892 if (p >= end || *p > 127)
4893 return -1;
4894 switch (*p) {
4895 case '\n':
4896 /* backslash + \n result in zero characters */
4897 break;
4898 case '\\': case '\'': case '\"':
4899 case 'b': case 'f': case 't':
4900 case 'n': case 'r': case 'v': case 'a':
4901 ++length;
4902 break;
4903 case '0': case '1': case '2': case '3':
4904 case '4': case '5': case '6': case '7':
4905 case 'x': case 'u': case 'U': case 'N':
4906 /* these do not guarantee ASCII characters */
4907 return -1;
4908 default:
4909 /* count the backslash + the other character */
4910 length += 2;
4911 }
4912 }
4913 }
4914 return length;
4915}
4916
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when `kind` is PyUnicode_WCHAR_KIND, `buf` is
   indexed as a Py_UNICODE array, otherwise as an array of bytes.
   `index` and `value` are each evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` into the Py_UNICODE array `buf` at `index`.  The macro
   takes no kind argument: it asserts on a variable named `kind` that must
   be in scope at the point of use.  It is wrapped in do { } while (0) so
   that it always behaves as one statement instead of expanding to a bare
   comma expression. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
4930
4931
Fredrik Lundh06d12682001-01-24 07:59:11 +00004932static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004933
Alexander Belopolsky40018472011-02-26 01:02:56 +00004934PyObject *
4935PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004936 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004937 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004940 Py_ssize_t startinpos;
4941 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004942 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004946 char* message;
4947 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 PyObject *errorHandler = NULL;
4949 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004950 Py_ssize_t ascii_length;
4951 Py_ssize_t i;
4952 int kind;
4953 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004955 ascii_length = length_of_escaped_ascii_string(s, size);
4956
4957 /* After length_of_escaped_ascii_string() there are two alternatives,
4958 either the string is pure ASCII with named escapes like \n, etc.
4959 and we determined it's exact size (common case)
4960 or it contains \x, \u, ... escape sequences. then we create a
4961 legacy wchar string and resize it at the end of this function. */
4962 if (ascii_length >= 0) {
4963 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4964 if (!v)
4965 goto onError;
4966 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4967 kind = PyUnicode_1BYTE_KIND;
4968 data = PyUnicode_DATA(v);
4969 }
4970 else {
4971 /* Escaped strings will always be longer than the resulting
4972 Unicode string, so we start with size here and then reduce the
4973 length after conversion to the true value.
4974 (but if the error callback returns a long replacement string
4975 we'll have to allocate more space) */
4976 v = _PyUnicode_New(size);
4977 if (!v)
4978 goto onError;
4979 kind = PyUnicode_WCHAR_KIND;
4980 data = PyUnicode_AS_UNICODE(v);
4981 }
4982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 if (size == 0)
4984 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004985 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004987
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 while (s < end) {
4989 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004990 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004993 if (kind == PyUnicode_WCHAR_KIND) {
4994 assert(i < _PyUnicode_WSTR_LENGTH(v));
4995 }
4996 else {
4997 /* The only case in which i == ascii_length is a backslash
4998 followed by a newline. */
4999 assert(i <= ascii_length);
5000 }
5001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 /* Non-escape characters are interpreted as Unicode ordinals */
5003 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005004 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 continue;
5006 }
5007
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 /* \ - Escapes */
5010 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005011 c = *s++;
5012 if (s > end)
5013 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014
5015 if (kind == PyUnicode_WCHAR_KIND) {
5016 assert(i < _PyUnicode_WSTR_LENGTH(v));
5017 }
5018 else {
5019 /* The only case in which i == ascii_length is a backslash
5020 followed by a newline. */
5021 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5022 }
5023
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005024 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5029 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5030 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5031 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5032 /* FF */
5033 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5034 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5035 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5036 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5037 /* VT */
5038 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5039 /* BEL, not classic C */
5040 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 case '0': case '1': case '2': case '3':
5044 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005045 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005046 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005047 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005048 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005049 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 break;
5053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 /* hex escapes */
5055 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005057 digits = 2;
5058 message = "truncated \\xXX escape";
5059 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005063 digits = 4;
5064 message = "truncated \\uXXXX escape";
5065 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005068 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005069 digits = 8;
5070 message = "truncated \\UXXXXXXXX escape";
5071 hexescape:
5072 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005074 if (s+digits>end) {
5075 endinpos = size;
5076 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 errors, &errorHandler,
5078 "unicodeescape", "end of string in escape sequence",
5079 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005082 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 goto nextByte;
5084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005085 for (j = 0; j < digits; ++j) {
5086 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005087 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 endinpos = (s+j+1)-starts;
5089 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 errors, &errorHandler,
5092 "unicodeescape", message,
5093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005094 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005095 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005097 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005098 }
5099 chr = (chr<<4) & ~0xF;
5100 if (c >= '0' && c <= '9')
5101 chr += c - '0';
5102 else if (c >= 'a' && c <= 'f')
5103 chr += 10 + c - 'a';
5104 else
5105 chr += 10 + c - 'A';
5106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005107 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005108 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 /* _decoding_error will have already written into the
5110 target buffer. */
5111 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005112 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005113 /* when we get here, chr is a 32-bit unicode character */
5114 if (chr <= 0xffff)
5115 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005116 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005117 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005118 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005119 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005120#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005122#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005123 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005124 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5125 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005126#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005127 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 errors, &errorHandler,
5132 "unicodeescape", "illegal Unicode character",
5133 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005135 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005137 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005138 break;
5139
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005141 case 'N':
5142 message = "malformed \\N character escape";
5143 if (ucnhash_CAPI == NULL) {
5144 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005145 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5146 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005147 if (ucnhash_CAPI == NULL)
5148 goto ucnhashError;
5149 }
5150 if (*s == '{') {
5151 const char *start = s+1;
5152 /* look for the closing brace */
5153 while (*s != '}' && s < end)
5154 s++;
5155 if (s > start && s < end && *s == '}') {
5156 /* found a name. look it up in the unicode database */
5157 message = "unknown Unicode character name";
5158 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005159 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5160 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005161 goto store;
5162 }
5163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 errors, &errorHandler,
5168 "unicodeescape", message,
5169 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005170 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005171 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005173 break;
5174
5175 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005176 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 message = "\\ at end of string";
5179 s--;
5180 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005181 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 errors, &errorHandler,
5184 "unicodeescape", message,
5185 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005187 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005189 }
5190 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5192 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005193 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005194 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199 /* Ensure the length prediction worked in case of ASCII strings */
5200 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5201
5202 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5203 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005205 Py_XDECREF(errorHandler);
5206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005208
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005210 PyErr_SetString(
5211 PyExc_UnicodeError,
5212 "\\N escapes not supported (can't load unicodedata module)"
5213 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005214 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 Py_XDECREF(errorHandler);
5216 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005217 return NULL;
5218
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 Py_XDECREF(errorHandler);
5222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 return NULL;
5224}
5225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005226#undef WRITE_ASCII_OR_WSTR
5227#undef WRITE_WSTR
5228
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229/* Return a Unicode-Escape string version of the Unicode object.
5230
5231 If quotes is true, the string is enclosed in u"" or u'' quotes as
5232 appropriate.
5233
5234*/
5235
/* Lowercase hexadecimal digits, indexed by nibble value (0..15).  Both the
   characters and the pointer itself are const, so the table cannot be
   re-pointed by accident. */
static const char * const hexdigits = "0123456789abcdef";
5237
Alexander Belopolsky40018472011-02-26 01:02:56 +00005238PyObject *
5239PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005240 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005242 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005245#ifdef Py_UNICODE_WIDE
5246 const Py_ssize_t expandsize = 10;
5247#else
5248 const Py_ssize_t expandsize = 6;
5249#endif
5250
Thomas Wouters89f507f2006-12-13 04:49:30 +00005251 /* XXX(nnorwitz): rather than over-allocating, it would be
5252 better to choose a different scheme. Perhaps scan the
5253 first N-chars of the string and allocate based on that size.
5254 */
5255 /* Initial allocation is based on the longest-possible unichr
5256 escape.
5257
5258 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5259 unichr, so in this case it's the longest unichr escape. In
5260 narrow (UTF-16) builds this is five chars per source unichr
5261 since there are two unichrs in the surrogate pair, so in narrow
5262 (UTF-16) builds it's not the longest unichr escape.
5263
5264 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5265 so in the narrow (UTF-16) build case it's the longest unichr
5266 escape.
5267 */
5268
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 if (size == 0)
5270 return PyBytes_FromStringAndSize(NULL, 0);
5271
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005272 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005274
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005275 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 2
5277 + expandsize*size
5278 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 if (repr == NULL)
5280 return NULL;
5281
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005282 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 while (size-- > 0) {
5285 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005286
Walter Dörwald79e913e2007-05-12 11:08:06 +00005287 /* Escape backslashes */
5288 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 *p++ = '\\';
5290 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005291 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005292 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005293
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005294#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005295 /* Map 21-bit characters to '\U00xxxxxx' */
5296 else if (ch >= 0x10000) {
5297 *p++ = '\\';
5298 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005299 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5300 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5301 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5302 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5303 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5304 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5305 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5306 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005308 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005309#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5311 else if (ch >= 0xD800 && ch < 0xDC00) {
5312 Py_UNICODE ch2;
5313 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005314
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 ch2 = *s++;
5316 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5319 *p++ = '\\';
5320 *p++ = 'U';
5321 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5322 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5323 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5324 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5325 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5326 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5327 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5328 *p++ = hexdigits[ucs & 0x0000000F];
5329 continue;
5330 }
5331 /* Fall through: isolated surrogates are copied as-is */
5332 s--;
5333 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005334 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005335#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005336
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005338 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 *p++ = '\\';
5340 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005341 *p++ = hexdigits[(ch >> 12) & 0x000F];
5342 *p++ = hexdigits[(ch >> 8) & 0x000F];
5343 *p++ = hexdigits[(ch >> 4) & 0x000F];
5344 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005346
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005347 /* Map special whitespace to '\t', \n', '\r' */
5348 else if (ch == '\t') {
5349 *p++ = '\\';
5350 *p++ = 't';
5351 }
5352 else if (ch == '\n') {
5353 *p++ = '\\';
5354 *p++ = 'n';
5355 }
5356 else if (ch == '\r') {
5357 *p++ = '\\';
5358 *p++ = 'r';
5359 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005360
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005361 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005362 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005364 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005365 *p++ = hexdigits[(ch >> 4) & 0x000F];
5366 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005367 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 /* Copy everything else as-is */
5370 else
5371 *p++ = (char) ch;
5372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005374 assert(p - PyBytes_AS_STRING(repr) > 0);
5375 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5376 return NULL;
5377 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378}
5379
Alexander Belopolsky40018472011-02-26 01:02:56 +00005380PyObject *
5381PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005383 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 if (!PyUnicode_Check(unicode)) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005388 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5389 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005390 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391}
5392
5393/* --- Raw Unicode Escape Codec ------------------------------------------- */
5394
Alexander Belopolsky40018472011-02-26 01:02:56 +00005395PyObject *
5396PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005397 Py_ssize_t size,
5398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 Py_ssize_t startinpos;
5402 Py_ssize_t endinpos;
5403 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 const char *end;
5407 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 PyObject *errorHandler = NULL;
5409 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005410
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 /* Escaped strings will always be longer than the resulting
5412 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413 length after conversion to the true value. (But decoding error
5414 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 v = _PyUnicode_New(size);
5416 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005420 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 end = s + size;
5422 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 unsigned char c;
5424 Py_UCS4 x;
5425 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005426 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 /* Non-escape characters are interpreted as Unicode ordinals */
5429 if (*s != '\\') {
5430 *p++ = (unsigned char)*s++;
5431 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 startinpos = s-starts;
5434
5435 /* \u-escapes are only interpreted iff the number of leading
5436 backslashes if odd */
5437 bs = s;
5438 for (;s < end;) {
5439 if (*s != '\\')
5440 break;
5441 *p++ = (unsigned char)*s++;
5442 }
5443 if (((s - bs) & 1) == 0 ||
5444 s >= end ||
5445 (*s != 'u' && *s != 'U')) {
5446 continue;
5447 }
5448 p--;
5449 count = *s=='u' ? 4 : 8;
5450 s++;
5451
5452 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5453 outpos = p-PyUnicode_AS_UNICODE(v);
5454 for (x = 0, i = 0; i < count; ++i, ++s) {
5455 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005456 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 endinpos = s-starts;
5458 if (unicode_decode_call_errorhandler(
5459 errors, &errorHandler,
5460 "rawunicodeescape", "truncated \\uXXXX",
5461 &starts, &end, &startinpos, &endinpos, &exc, &s,
5462 &v, &outpos, &p))
5463 goto onError;
5464 goto nextByte;
5465 }
5466 x = (x<<4) & ~0xF;
5467 if (c >= '0' && c <= '9')
5468 x += c - '0';
5469 else if (c >= 'a' && c <= 'f')
5470 x += 10 + c - 'a';
5471 else
5472 x += 10 + c - 'A';
5473 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005474 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 /* UCS-2 character */
5476 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005477 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 /* UCS-4 character. Either store directly, or as
5479 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005480#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005482#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 x -= 0x10000L;
5484 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5485 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005486#endif
5487 } else {
5488 endinpos = s-starts;
5489 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005490 if (unicode_decode_call_errorhandler(
5491 errors, &errorHandler,
5492 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 &starts, &end, &startinpos, &endinpos, &exc, &s,
5494 &v, &outpos, &p))
5495 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 nextByte:
5498 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005500 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 Py_XDECREF(errorHandler);
5503 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 if (PyUnicode_READY(v) == -1) {
5505 Py_DECREF(v);
5506 return NULL;
5507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005509
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 Py_XDECREF(errorHandler);
5513 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 return NULL;
5515}
5516
Alexander Belopolsky40018472011-02-26 01:02:56 +00005517PyObject *
5518PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005519 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 char *p;
5523 char *q;
5524
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005525#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005526 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005527#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005528 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005529#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005530
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005531 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005533
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 if (repr == NULL)
5536 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005537 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005538 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005540 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 while (size-- > 0) {
5542 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005543#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 /* Map 32-bit characters to '\Uxxxxxxxx' */
5545 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005546 *p++ = '\\';
5547 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005548 *p++ = hexdigits[(ch >> 28) & 0xf];
5549 *p++ = hexdigits[(ch >> 24) & 0xf];
5550 *p++ = hexdigits[(ch >> 20) & 0xf];
5551 *p++ = hexdigits[(ch >> 16) & 0xf];
5552 *p++ = hexdigits[(ch >> 12) & 0xf];
5553 *p++ = hexdigits[(ch >> 8) & 0xf];
5554 *p++ = hexdigits[(ch >> 4) & 0xf];
5555 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005556 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005557 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005558#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5560 if (ch >= 0xD800 && ch < 0xDC00) {
5561 Py_UNICODE ch2;
5562 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005563
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 ch2 = *s++;
5565 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005566 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5568 *p++ = '\\';
5569 *p++ = 'U';
5570 *p++ = hexdigits[(ucs >> 28) & 0xf];
5571 *p++ = hexdigits[(ucs >> 24) & 0xf];
5572 *p++ = hexdigits[(ucs >> 20) & 0xf];
5573 *p++ = hexdigits[(ucs >> 16) & 0xf];
5574 *p++ = hexdigits[(ucs >> 12) & 0xf];
5575 *p++ = hexdigits[(ucs >> 8) & 0xf];
5576 *p++ = hexdigits[(ucs >> 4) & 0xf];
5577 *p++ = hexdigits[ucs & 0xf];
5578 continue;
5579 }
5580 /* Fall through: isolated surrogates are copied as-is */
5581 s--;
5582 size++;
5583 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005584#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 /* Map 16-bit characters to '\uxxxx' */
5586 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 *p++ = '\\';
5588 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005589 *p++ = hexdigits[(ch >> 12) & 0xf];
5590 *p++ = hexdigits[(ch >> 8) & 0xf];
5591 *p++ = hexdigits[(ch >> 4) & 0xf];
5592 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 /* Copy everything else as-is */
5595 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 *p++ = (char) ch;
5597 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005598 size = p - q;
5599
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005600 assert(size > 0);
5601 if (_PyBytes_Resize(&repr, size) < 0)
5602 return NULL;
5603 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
Alexander Belopolsky40018472011-02-26 01:02:56 +00005606PyObject *
5607PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005609 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005611 PyErr_BadArgument();
5612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005614 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5615 PyUnicode_GET_SIZE(unicode));
5616
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005617 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618}
5619
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005620/* --- Unicode Internal Codec ------------------------------------------- */
5621
Alexander Belopolsky40018472011-02-26 01:02:56 +00005622PyObject *
5623_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005624 Py_ssize_t size,
5625 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005626{
5627 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005628 Py_ssize_t startinpos;
5629 Py_ssize_t endinpos;
5630 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005631 PyUnicodeObject *v;
5632 Py_UNICODE *p;
5633 const char *end;
5634 const char *reason;
5635 PyObject *errorHandler = NULL;
5636 PyObject *exc = NULL;
5637
Neal Norwitzd43069c2006-01-08 01:12:10 +00005638#ifdef Py_UNICODE_WIDE
5639 Py_UNICODE unimax = PyUnicode_GetMax();
5640#endif
5641
Thomas Wouters89f507f2006-12-13 04:49:30 +00005642 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005643 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5644 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5647 as string was created with the old API. */
5648 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005650 p = PyUnicode_AS_UNICODE(v);
5651 end = s + size;
5652
5653 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005654 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005655 /* We have to sanity check the raw data, otherwise doom looms for
5656 some malformed UCS-4 data. */
5657 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005658#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005659 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005660#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005661 end-s < Py_UNICODE_SIZE
5662 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005664 startinpos = s - starts;
5665 if (end-s < Py_UNICODE_SIZE) {
5666 endinpos = end-starts;
5667 reason = "truncated input";
5668 }
5669 else {
5670 endinpos = s - starts + Py_UNICODE_SIZE;
5671 reason = "illegal code point (> 0x10FFFF)";
5672 }
5673 outpos = p - PyUnicode_AS_UNICODE(v);
5674 if (unicode_decode_call_errorhandler(
5675 errors, &errorHandler,
5676 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005677 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005678 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005679 goto onError;
5680 }
5681 }
5682 else {
5683 p++;
5684 s += Py_UNICODE_SIZE;
5685 }
5686 }
5687
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005688 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005689 goto onError;
5690 Py_XDECREF(errorHandler);
5691 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 if (PyUnicode_READY(v) == -1) {
5693 Py_DECREF(v);
5694 return NULL;
5695 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005696 return (PyObject *)v;
5697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005699 Py_XDECREF(v);
5700 Py_XDECREF(errorHandler);
5701 Py_XDECREF(exc);
5702 return NULL;
5703}
5704
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705/* --- Latin-1 Codec ------------------------------------------------------ */
5706
Alexander Belopolsky40018472011-02-26 01:02:56 +00005707PyObject *
5708PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005709 Py_ssize_t size,
5710 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005713 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714}
5715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005717static void
5718make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005719 const char *encoding,
5720 const Py_UNICODE *unicode, Py_ssize_t size,
5721 Py_ssize_t startpos, Py_ssize_t endpos,
5722 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 *exceptionObject = PyUnicodeEncodeError_Create(
5726 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
5728 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5730 goto onError;
5731 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5732 goto onError;
5733 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5734 goto onError;
5735 return;
5736 onError:
5737 Py_DECREF(*exceptionObject);
5738 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 }
5740}
5741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005743static void
5744raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005745 const char *encoding,
5746 const Py_UNICODE *unicode, Py_ssize_t size,
5747 Py_ssize_t startpos, Py_ssize_t endpos,
5748 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749{
5750 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754}
5755
5756/* error handling callback helper:
5757 build arguments, call the callback and check the arguments,
5758 put the result into newpos and return the replacement string, which
5759 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005760static PyObject *
5761unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005762 PyObject **errorHandler,
5763 const char *encoding, const char *reason,
5764 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5765 Py_ssize_t startpos, Py_ssize_t endpos,
5766 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005768 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769
5770 PyObject *restuple;
5771 PyObject *resunicode;
5772
5773 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 }
5778
5779 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783
5784 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005789 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 Py_DECREF(restuple);
5791 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005793 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 &resunicode, newpos)) {
5795 Py_DECREF(restuple);
5796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005798 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5799 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5800 Py_DECREF(restuple);
5801 return NULL;
5802 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005805 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5807 Py_DECREF(restuple);
5808 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 Py_INCREF(resunicode);
5811 Py_DECREF(restuple);
5812 return resunicode;
5813}
5814
Alexander Belopolsky40018472011-02-26 01:02:56 +00005815static PyObject *
5816unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005817 Py_ssize_t size,
5818 const char *errors,
5819 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820{
5821 /* output object */
5822 PyObject *res;
5823 /* pointers to the beginning and end+1 of input */
5824 const Py_UNICODE *startp = p;
5825 const Py_UNICODE *endp = p + size;
5826 /* pointer to the beginning of the unencodable characters */
5827 /* const Py_UNICODE *badp = NULL; */
5828 /* pointer into the output */
5829 char *str;
5830 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005832 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5833 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 PyObject *errorHandler = NULL;
5835 PyObject *exc = NULL;
5836 /* the following variable is used for caching string comparisons
5837 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5838 int known_errorHandler = -1;
5839
5840 /* allocate enough for a simple encoding without
5841 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005842 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005843 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005844 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005846 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005847 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 ressize = size;
5849
5850 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* can we encode this? */
5854 if (c<limit) {
5855 /* no overflow check, because we know that the space is enough */
5856 *str++ = (char)c;
5857 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005858 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 else {
5860 Py_ssize_t unicodepos = p-startp;
5861 Py_ssize_t requiredsize;
5862 PyObject *repunicode;
5863 Py_ssize_t repsize;
5864 Py_ssize_t newpos;
5865 Py_ssize_t respos;
5866 Py_UNICODE *uni2;
5867 /* startpos for collecting unencodable chars */
5868 const Py_UNICODE *collstart = p;
5869 const Py_UNICODE *collend = p;
5870 /* find all unecodable characters */
5871 while ((collend < endp) && ((*collend)>=limit))
5872 ++collend;
5873 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5874 if (known_errorHandler==-1) {
5875 if ((errors==NULL) || (!strcmp(errors, "strict")))
5876 known_errorHandler = 1;
5877 else if (!strcmp(errors, "replace"))
5878 known_errorHandler = 2;
5879 else if (!strcmp(errors, "ignore"))
5880 known_errorHandler = 3;
5881 else if (!strcmp(errors, "xmlcharrefreplace"))
5882 known_errorHandler = 4;
5883 else
5884 known_errorHandler = 0;
5885 }
5886 switch (known_errorHandler) {
5887 case 1: /* strict */
5888 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5889 goto onError;
5890 case 2: /* replace */
5891 while (collstart++<collend)
5892 *str++ = '?'; /* fall through */
5893 case 3: /* ignore */
5894 p = collend;
5895 break;
5896 case 4: /* xmlcharrefreplace */
5897 respos = str - PyBytes_AS_STRING(res);
5898 /* determine replacement size (temporarily (mis)uses p) */
5899 for (p = collstart, repsize = 0; p < collend; ++p) {
5900 if (*p<10)
5901 repsize += 2+1+1;
5902 else if (*p<100)
5903 repsize += 2+2+1;
5904 else if (*p<1000)
5905 repsize += 2+3+1;
5906 else if (*p<10000)
5907 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005908#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 else
5910 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005911#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 else if (*p<100000)
5913 repsize += 2+5+1;
5914 else if (*p<1000000)
5915 repsize += 2+6+1;
5916 else
5917 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005918#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 }
5920 requiredsize = respos+repsize+(endp-collend);
5921 if (requiredsize > ressize) {
5922 if (requiredsize<2*ressize)
5923 requiredsize = 2*ressize;
5924 if (_PyBytes_Resize(&res, requiredsize))
5925 goto onError;
5926 str = PyBytes_AS_STRING(res) + respos;
5927 ressize = requiredsize;
5928 }
5929 /* generate replacement (temporarily (mis)uses p) */
5930 for (p = collstart; p < collend; ++p) {
5931 str += sprintf(str, "&#%d;", (int)*p);
5932 }
5933 p = collend;
5934 break;
5935 default:
5936 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5937 encoding, reason, startp, size, &exc,
5938 collstart-startp, collend-startp, &newpos);
5939 if (repunicode == NULL)
5940 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005941 if (PyBytes_Check(repunicode)) {
5942 /* Directly copy bytes result to output. */
5943 repsize = PyBytes_Size(repunicode);
5944 if (repsize > 1) {
5945 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005946 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005947 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5948 Py_DECREF(repunicode);
5949 goto onError;
5950 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005951 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005952 ressize += repsize-1;
5953 }
5954 memcpy(str, PyBytes_AsString(repunicode), repsize);
5955 str += repsize;
5956 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005957 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005958 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 /* need more space? (at least enough for what we
5961 have+the replacement+the rest of the string, so
5962 we won't have to check space for encodable characters) */
5963 respos = str - PyBytes_AS_STRING(res);
5964 repsize = PyUnicode_GET_SIZE(repunicode);
5965 requiredsize = respos+repsize+(endp-collend);
5966 if (requiredsize > ressize) {
5967 if (requiredsize<2*ressize)
5968 requiredsize = 2*ressize;
5969 if (_PyBytes_Resize(&res, requiredsize)) {
5970 Py_DECREF(repunicode);
5971 goto onError;
5972 }
5973 str = PyBytes_AS_STRING(res) + respos;
5974 ressize = requiredsize;
5975 }
5976 /* check if there is anything unencodable in the replacement
5977 and copy it to the output */
5978 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5979 c = *uni2;
5980 if (c >= limit) {
5981 raise_encode_exception(&exc, encoding, startp, size,
5982 unicodepos, unicodepos+1, reason);
5983 Py_DECREF(repunicode);
5984 goto onError;
5985 }
5986 *str = (char)c;
5987 }
5988 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005989 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005990 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005991 }
5992 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005993 /* Resize if we allocated to much */
5994 size = str - PyBytes_AS_STRING(res);
5995 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005996 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005997 if (_PyBytes_Resize(&res, size) < 0)
5998 goto onError;
5999 }
6000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 Py_XDECREF(errorHandler);
6002 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006003 return res;
6004
6005 onError:
6006 Py_XDECREF(res);
6007 Py_XDECREF(errorHandler);
6008 Py_XDECREF(exc);
6009 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010}
6011
Alexander Belopolsky40018472011-02-26 01:02:56 +00006012PyObject *
6013PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006014 Py_ssize_t size,
6015 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018}
6019
Alexander Belopolsky40018472011-02-26 01:02:56 +00006020PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006021_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022{
6023 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 PyErr_BadArgument();
6025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006027 if (PyUnicode_READY(unicode) == -1)
6028 return NULL;
6029 /* Fast path: if it is a one-byte string, construct
6030 bytes object directly. */
6031 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6032 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6033 PyUnicode_GET_LENGTH(unicode));
6034 /* Non-Latin-1 characters present. Defer to above function to
6035 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006038 errors);
6039}
6040
6041PyObject*
6042PyUnicode_AsLatin1String(PyObject *unicode)
6043{
6044 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
6047/* --- 7-bit ASCII Codec -------------------------------------------------- */
6048
Alexander Belopolsky40018472011-02-26 01:02:56 +00006049PyObject *
6050PyUnicode_DecodeASCII(const char *s,
6051 Py_ssize_t size,
6052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 PyUnicodeObject *v;
6056 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t startinpos;
6058 Py_ssize_t endinpos;
6059 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006061 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062 PyObject *errorHandler = NULL;
6063 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006064 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006067 if (size == 1 && *(unsigned char*)s < 128)
6068 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6069
6070 /* Fast path. Assume the input actually *is* ASCII, and allocate
6071 a single-block Unicode object with that assumption. If there is
6072 an error, drop the object and start over. */
6073 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6074 if (v == NULL)
6075 goto onError;
6076 d = PyUnicode_1BYTE_DATA(v);
6077 for (i = 0; i < size; i++) {
6078 unsigned char ch = ((unsigned char*)s)[i];
6079 if (ch < 128)
6080 d[i] = ch;
6081 else
6082 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006084 if (i == size)
6085 return (PyObject*)v;
6086 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 v = _PyUnicode_New(size);
6089 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 e = s + size;
6095 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 register unsigned char c = (unsigned char)*s;
6097 if (c < 128) {
6098 *p++ = c;
6099 ++s;
6100 }
6101 else {
6102 startinpos = s-starts;
6103 endinpos = startinpos + 1;
6104 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6105 if (unicode_decode_call_errorhandler(
6106 errors, &errorHandler,
6107 "ascii", "ordinal not in range(128)",
6108 &starts, &e, &startinpos, &endinpos, &exc, &s,
6109 &v, &outpos, &p))
6110 goto onError;
6111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006113 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6115 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 Py_XDECREF(errorHandler);
6117 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006118 if (PyUnicode_READY(v) == -1) {
6119 Py_DECREF(v);
6120 return NULL;
6121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
6132PyUnicode_EncodeASCII(const Py_UNICODE *p,
6133 Py_ssize_t size,
6134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137}
6138
Alexander Belopolsky40018472011-02-26 01:02:56 +00006139PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006140_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
6142 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 PyErr_BadArgument();
6144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006146 if (PyUnicode_READY(unicode) == -1)
6147 return NULL;
6148 /* Fast path: if it is an ASCII-only string, construct bytes object
6149 directly. Else defer to above function to raise the exception. */
6150 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6151 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6152 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006155 errors);
6156}
6157
6158PyObject *
6159PyUnicode_AsASCIIString(PyObject *unicode)
6160{
6161 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162}
6163
Victor Stinner99b95382011-07-04 14:23:54 +02006164#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006165
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006166/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006167
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006168#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006169#define NEED_RETRY
6170#endif
6171
6172/* XXX This code is limited to "true" double-byte encodings, as
6173 a) it assumes an incomplete character consists of a single byte, and
6174 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006176
/* Return 1 if the byte at s[offset] is reported as a lead byte by
   IsDBCSLeadByte() and one of the following holds for the position
   `prev` returned by CharPrev(s, s + offset):
     - prev is the byte itself,
     - the byte at prev is not a lead byte,
     - prev lies exactly two bytes before the byte.
   Return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6188
6189/*
6190 * Decode MBCS string into unicode object. If 'final' is set, converts
6191 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6192 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006193static int
6194decode_mbcs(PyUnicodeObject **v,
6195 const char *s, /* MBCS string */
6196 int size, /* sizeof MBCS string */
6197 int final,
6198 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006199{
6200 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006201 Py_ssize_t n;
6202 DWORD usize;
6203 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006204
6205 assert(size >= 0);
6206
Victor Stinner554f3f02010-06-16 23:33:54 +00006207 /* check and handle 'errors' arg */
6208 if (errors==NULL || strcmp(errors, "strict")==0)
6209 flags = MB_ERR_INVALID_CHARS;
6210 else if (strcmp(errors, "ignore")==0)
6211 flags = 0;
6212 else {
6213 PyErr_Format(PyExc_ValueError,
6214 "mbcs encoding does not support errors='%s'",
6215 errors);
6216 return -1;
6217 }
6218
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006219 /* Skip trailing lead-byte unless 'final' is set */
6220 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006222
6223 /* First get the size of the result */
6224 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006225 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6226 if (usize==0)
6227 goto mbcs_decode_error;
6228 } else
6229 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006230
6231 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 /* Create unicode object */
6233 *v = _PyUnicode_New(usize);
6234 if (*v == NULL)
6235 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006236 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006237 }
6238 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 /* Extend unicode object */
6240 n = PyUnicode_GET_SIZE(*v);
6241 if (_PyUnicode_Resize(v, n + usize) < 0)
6242 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006243 }
6244
6245 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006246 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006248 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6249 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006252 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006253
6254mbcs_decode_error:
6255 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6256 we raise a UnicodeDecodeError - else it is a 'generic'
6257 windows error
6258 */
6259 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6260 /* Ideally, we should get reason from FormatMessage - this
6261 is the Windows 2000 English version of the message
6262 */
6263 PyObject *exc = NULL;
6264 const char *reason = "No mapping for the Unicode character exists "
6265 "in the target multi-byte code page.";
6266 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6267 if (exc != NULL) {
6268 PyCodec_StrictErrors(exc);
6269 Py_DECREF(exc);
6270 }
6271 } else {
6272 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6273 }
6274 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006275}
6276
Alexander Belopolsky40018472011-02-26 01:02:56 +00006277PyObject *
6278PyUnicode_DecodeMBCSStateful(const char *s,
6279 Py_ssize_t size,
6280 const char *errors,
6281 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006282{
6283 PyUnicodeObject *v = NULL;
6284 int done;
6285
6286 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006288
6289#ifdef NEED_RETRY
6290 retry:
6291 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006292 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006293 else
6294#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006295 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006296
6297 if (done < 0) {
6298 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006300 }
6301
6302 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006304
6305#ifdef NEED_RETRY
6306 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 s += done;
6308 size -= done;
6309 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006310 }
6311#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006312 if (PyUnicode_READY(v) == -1) {
6313 Py_DECREF(v);
6314 return NULL;
6315 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006316 return (PyObject *)v;
6317}
6318
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319PyObject *
6320PyUnicode_DecodeMBCS(const char *s,
6321 Py_ssize_t size,
6322 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006323{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006324 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6325}
6326
6327/*
6328 * Convert unicode into string object (MBCS).
6329 * Returns 0 if succeed, -1 otherwise.
6330 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331static int
6332encode_mbcs(PyObject **repr,
6333 const Py_UNICODE *p, /* unicode */
6334 int size, /* size of unicode */
6335 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006336{
Victor Stinner554f3f02010-06-16 23:33:54 +00006337 BOOL usedDefaultChar = FALSE;
6338 BOOL *pusedDefaultChar;
6339 int mbcssize;
6340 Py_ssize_t n;
6341 PyObject *exc = NULL;
6342 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006343
6344 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006345
Victor Stinner554f3f02010-06-16 23:33:54 +00006346 /* check and handle 'errors' arg */
6347 if (errors==NULL || strcmp(errors, "strict")==0) {
6348 flags = WC_NO_BEST_FIT_CHARS;
6349 pusedDefaultChar = &usedDefaultChar;
6350 } else if (strcmp(errors, "replace")==0) {
6351 flags = 0;
6352 pusedDefaultChar = NULL;
6353 } else {
6354 PyErr_Format(PyExc_ValueError,
6355 "mbcs encoding does not support errors='%s'",
6356 errors);
6357 return -1;
6358 }
6359
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006360 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006362 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6363 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 if (mbcssize == 0) {
6365 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6366 return -1;
6367 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006368 /* If we used a default char, then we failed! */
6369 if (pusedDefaultChar && *pusedDefaultChar)
6370 goto mbcs_encode_error;
6371 } else {
6372 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006373 }
6374
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 /* Create string object */
6377 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6378 if (*repr == NULL)
6379 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006380 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006381 }
6382 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 /* Extend string object */
6384 n = PyBytes_Size(*repr);
6385 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6386 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006387 }
6388
6389 /* Do the conversion */
6390 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006392 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6393 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6395 return -1;
6396 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006397 if (pusedDefaultChar && *pusedDefaultChar)
6398 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006399 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006400 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006401
6402mbcs_encode_error:
6403 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6404 Py_XDECREF(exc);
6405 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006406}
6407
Alexander Belopolsky40018472011-02-26 01:02:56 +00006408PyObject *
6409PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6410 Py_ssize_t size,
6411 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006412{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006413 PyObject *repr = NULL;
6414 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006415
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006416#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006418 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006419 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006420 else
6421#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006422 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006423
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006424 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 Py_XDECREF(repr);
6426 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006427 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006428
6429#ifdef NEED_RETRY
6430 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 p += INT_MAX;
6432 size -= INT_MAX;
6433 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006434 }
6435#endif
6436
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006437 return repr;
6438}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006439
Alexander Belopolsky40018472011-02-26 01:02:56 +00006440PyObject *
6441PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006442{
6443 if (!PyUnicode_Check(unicode)) {
6444 PyErr_BadArgument();
6445 return NULL;
6446 }
6447 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 PyUnicode_GET_SIZE(unicode),
6449 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006450}
6451
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006452#undef NEED_RETRY
6453
Victor Stinner99b95382011-07-04 14:23:54 +02006454#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456/* --- Character Mapping Codec -------------------------------------------- */
6457
Alexander Belopolsky40018472011-02-26 01:02:56 +00006458PyObject *
6459PyUnicode_DecodeCharmap(const char *s,
6460 Py_ssize_t size,
6461 PyObject *mapping,
6462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 Py_ssize_t startinpos;
6466 Py_ssize_t endinpos;
6467 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 PyUnicodeObject *v;
6470 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472 PyObject *errorHandler = NULL;
6473 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006474 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006475 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006476
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 /* Default to Latin-1 */
6478 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
6481 v = _PyUnicode_New(size);
6482 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006488 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 mapstring = PyUnicode_AS_UNICODE(mapping);
6490 maplen = PyUnicode_GET_SIZE(mapping);
6491 while (s < e) {
6492 unsigned char ch = *s;
6493 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 if (ch < maplen)
6496 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 if (x == 0xfffe) {
6499 /* undefined mapping */
6500 outpos = p-PyUnicode_AS_UNICODE(v);
6501 startinpos = s-starts;
6502 endinpos = startinpos+1;
6503 if (unicode_decode_call_errorhandler(
6504 errors, &errorHandler,
6505 "charmap", "character maps to <undefined>",
6506 &starts, &e, &startinpos, &endinpos, &exc, &s,
6507 &v, &outpos, &p)) {
6508 goto onError;
6509 }
6510 continue;
6511 }
6512 *p++ = x;
6513 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006514 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006515 }
6516 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 while (s < e) {
6518 unsigned char ch = *s;
6519 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006520
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6522 w = PyLong_FromLong((long)ch);
6523 if (w == NULL)
6524 goto onError;
6525 x = PyObject_GetItem(mapping, w);
6526 Py_DECREF(w);
6527 if (x == NULL) {
6528 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6529 /* No mapping found means: mapping is undefined. */
6530 PyErr_Clear();
6531 x = Py_None;
6532 Py_INCREF(x);
6533 } else
6534 goto onError;
6535 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006536
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 /* Apply mapping */
6538 if (PyLong_Check(x)) {
6539 long value = PyLong_AS_LONG(x);
6540 if (value < 0 || value > 65535) {
6541 PyErr_SetString(PyExc_TypeError,
6542 "character mapping must be in range(65536)");
6543 Py_DECREF(x);
6544 goto onError;
6545 }
6546 *p++ = (Py_UNICODE)value;
6547 }
6548 else if (x == Py_None) {
6549 /* undefined mapping */
6550 outpos = p-PyUnicode_AS_UNICODE(v);
6551 startinpos = s-starts;
6552 endinpos = startinpos+1;
6553 if (unicode_decode_call_errorhandler(
6554 errors, &errorHandler,
6555 "charmap", "character maps to <undefined>",
6556 &starts, &e, &startinpos, &endinpos, &exc, &s,
6557 &v, &outpos, &p)) {
6558 Py_DECREF(x);
6559 goto onError;
6560 }
6561 Py_DECREF(x);
6562 continue;
6563 }
6564 else if (PyUnicode_Check(x)) {
6565 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (targetsize == 1)
6568 /* 1-1 mapping */
6569 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006570
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 else if (targetsize > 1) {
6572 /* 1-n mapping */
6573 if (targetsize > extrachars) {
6574 /* resize first */
6575 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6576 Py_ssize_t needed = (targetsize - extrachars) + \
6577 (targetsize << 2);
6578 extrachars += needed;
6579 /* XXX overflow detection missing */
6580 if (_PyUnicode_Resize(&v,
6581 PyUnicode_GET_SIZE(v) + needed) < 0) {
6582 Py_DECREF(x);
6583 goto onError;
6584 }
6585 p = PyUnicode_AS_UNICODE(v) + oldpos;
6586 }
6587 Py_UNICODE_COPY(p,
6588 PyUnicode_AS_UNICODE(x),
6589 targetsize);
6590 p += targetsize;
6591 extrachars -= targetsize;
6592 }
6593 /* 1-0 mapping: skip the character */
6594 }
6595 else {
6596 /* wrong return value */
6597 PyErr_SetString(PyExc_TypeError,
6598 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 Py_DECREF(x);
6600 goto onError;
6601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 Py_DECREF(x);
6603 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 }
6606 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6608 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006611 if (PyUnicode_READY(v) == -1) {
6612 Py_DECREF(v);
6613 return NULL;
6614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006616
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 Py_XDECREF(errorHandler);
6619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 Py_XDECREF(v);
6621 return NULL;
6622}
6623
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006624/* Charmap encoding: the lookup table */
6625
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 PyObject_HEAD
6628 unsigned char level1[32];
6629 int count2, count3;
6630 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006631};
6632
6633static PyObject*
6634encoding_map_size(PyObject *obj, PyObject* args)
6635{
6636 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006637 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006639}
6640
6641static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006642 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 PyDoc_STR("Return the size (in bytes) of this object") },
6644 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645};
6646
6647static void
6648encoding_map_dealloc(PyObject* o)
6649{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006650 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006651}
6652
6653static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006654 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 "EncodingMap", /*tp_name*/
6656 sizeof(struct encoding_map), /*tp_basicsize*/
6657 0, /*tp_itemsize*/
6658 /* methods */
6659 encoding_map_dealloc, /*tp_dealloc*/
6660 0, /*tp_print*/
6661 0, /*tp_getattr*/
6662 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006663 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 0, /*tp_repr*/
6665 0, /*tp_as_number*/
6666 0, /*tp_as_sequence*/
6667 0, /*tp_as_mapping*/
6668 0, /*tp_hash*/
6669 0, /*tp_call*/
6670 0, /*tp_str*/
6671 0, /*tp_getattro*/
6672 0, /*tp_setattro*/
6673 0, /*tp_as_buffer*/
6674 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6675 0, /*tp_doc*/
6676 0, /*tp_traverse*/
6677 0, /*tp_clear*/
6678 0, /*tp_richcompare*/
6679 0, /*tp_weaklistoffset*/
6680 0, /*tp_iter*/
6681 0, /*tp_iternext*/
6682 encoding_map_methods, /*tp_methods*/
6683 0, /*tp_members*/
6684 0, /*tp_getset*/
6685 0, /*tp_base*/
6686 0, /*tp_dict*/
6687 0, /*tp_descr_get*/
6688 0, /*tp_descr_set*/
6689 0, /*tp_dictoffset*/
6690 0, /*tp_init*/
6691 0, /*tp_alloc*/
6692 0, /*tp_new*/
6693 0, /*tp_free*/
6694 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006695};
6696
6697PyObject*
6698PyUnicode_BuildEncodingMap(PyObject* string)
6699{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006700 PyObject *result;
6701 struct encoding_map *mresult;
6702 int i;
6703 int need_dict = 0;
6704 unsigned char level1[32];
6705 unsigned char level2[512];
6706 unsigned char *mlevel1, *mlevel2, *mlevel3;
6707 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006708 int kind;
6709 void *data;
6710 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006712 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006713 PyErr_BadArgument();
6714 return NULL;
6715 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716 kind = PyUnicode_KIND(string);
6717 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006718 memset(level1, 0xFF, sizeof level1);
6719 memset(level2, 0xFF, sizeof level2);
6720
6721 /* If there isn't a one-to-one mapping of NULL to \0,
6722 or if there are non-BMP characters, we need to use
6723 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006724 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725 need_dict = 1;
6726 for (i = 1; i < 256; i++) {
6727 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006728 ch = PyUnicode_READ(kind, data, i);
6729 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006730 need_dict = 1;
6731 break;
6732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006733 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006734 /* unmapped character */
6735 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006736 l1 = ch >> 11;
6737 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006738 if (level1[l1] == 0xFF)
6739 level1[l1] = count2++;
6740 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006742 }
6743
6744 if (count2 >= 0xFF || count3 >= 0xFF)
6745 need_dict = 1;
6746
6747 if (need_dict) {
6748 PyObject *result = PyDict_New();
6749 PyObject *key, *value;
6750 if (!result)
6751 return NULL;
6752 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006753 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006754 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006755 if (!key || !value)
6756 goto failed1;
6757 if (PyDict_SetItem(result, key, value) == -1)
6758 goto failed1;
6759 Py_DECREF(key);
6760 Py_DECREF(value);
6761 }
6762 return result;
6763 failed1:
6764 Py_XDECREF(key);
6765 Py_XDECREF(value);
6766 Py_DECREF(result);
6767 return NULL;
6768 }
6769
6770 /* Create a three-level trie */
6771 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6772 16*count2 + 128*count3 - 1);
6773 if (!result)
6774 return PyErr_NoMemory();
6775 PyObject_Init(result, &EncodingMapType);
6776 mresult = (struct encoding_map*)result;
6777 mresult->count2 = count2;
6778 mresult->count3 = count3;
6779 mlevel1 = mresult->level1;
6780 mlevel2 = mresult->level23;
6781 mlevel3 = mresult->level23 + 16*count2;
6782 memcpy(mlevel1, level1, 32);
6783 memset(mlevel2, 0xFF, 16*count2);
6784 memset(mlevel3, 0, 128*count3);
6785 count3 = 0;
6786 for (i = 1; i < 256; i++) {
6787 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006788 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006789 /* unmapped character */
6790 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006791 o1 = PyUnicode_READ(kind, data, i)>>11;
6792 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006793 i2 = 16*mlevel1[o1] + o2;
6794 if (mlevel2[i2] == 0xFF)
6795 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006796 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006797 i3 = 128*mlevel2[i2] + o3;
6798 mlevel3[i3] = i;
6799 }
6800 return result;
6801}
6802
6803static int
6804encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6805{
6806 struct encoding_map *map = (struct encoding_map*)mapping;
6807 int l1 = c>>11;
6808 int l2 = (c>>7) & 0xF;
6809 int l3 = c & 0x7F;
6810 int i;
6811
6812#ifdef Py_UNICODE_WIDE
6813 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006815 }
6816#endif
6817 if (c == 0)
6818 return 0;
6819 /* level 1*/
6820 i = map->level1[l1];
6821 if (i == 0xFF) {
6822 return -1;
6823 }
6824 /* level 2*/
6825 i = map->level23[16*i+l2];
6826 if (i == 0xFF) {
6827 return -1;
6828 }
6829 /* level 3 */
6830 i = map->level23[16*map->count2 + 128*i + l3];
6831 if (i == 0) {
6832 return -1;
6833 }
6834 return i;
6835}
6836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837/* Lookup the character ch in the mapping. If the character
6838 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006839 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840static PyObject *
6841charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842{
Christian Heimes217cfd12007-12-02 14:31:20 +00006843 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 PyObject *x;
6845
6846 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 x = PyObject_GetItem(mapping, w);
6849 Py_DECREF(w);
6850 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6852 /* No mapping found means: mapping is undefined. */
6853 PyErr_Clear();
6854 x = Py_None;
6855 Py_INCREF(x);
6856 return x;
6857 } else
6858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006860 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006862 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 long value = PyLong_AS_LONG(x);
6864 if (value < 0 || value > 255) {
6865 PyErr_SetString(PyExc_TypeError,
6866 "character mapping must be in range(256)");
6867 Py_DECREF(x);
6868 return NULL;
6869 }
6870 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006872 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 /* wrong return value */
6876 PyErr_Format(PyExc_TypeError,
6877 "character mapping must return integer, bytes or None, not %.400s",
6878 x->ob_type->tp_name);
6879 Py_DECREF(x);
6880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 }
6882}
6883
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006884static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006885charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006886{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006887 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6888 /* exponentially overallocate to minimize reallocations */
6889 if (requiredsize < 2*outsize)
6890 requiredsize = 2*outsize;
6891 if (_PyBytes_Resize(outobj, requiredsize))
6892 return -1;
6893 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006894}
6895
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006900 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 space is available. Return a new reference to the object that
6902 was put in the output buffer, or Py_None, if the mapping was undefined
6903 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006904 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006905static charmapencode_result
6906charmapencode_output(Py_UNICODE c, PyObject *mapping,
6907 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006909 PyObject *rep;
6910 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006911 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912
Christian Heimes90aa7642007-12-19 02:45:37 +00006913 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006914 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 if (res == -1)
6917 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 if (outsize<requiredsize)
6919 if (charmapencode_resize(outobj, outpos, requiredsize))
6920 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006921 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 outstart[(*outpos)++] = (char)res;
6923 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006924 }
6925
6926 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006929 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 Py_DECREF(rep);
6931 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006932 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 if (PyLong_Check(rep)) {
6934 Py_ssize_t requiredsize = *outpos+1;
6935 if (outsize<requiredsize)
6936 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6937 Py_DECREF(rep);
6938 return enc_EXCEPTION;
6939 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006940 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 else {
6944 const char *repchars = PyBytes_AS_STRING(rep);
6945 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6946 Py_ssize_t requiredsize = *outpos+repsize;
6947 if (outsize<requiredsize)
6948 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6949 Py_DECREF(rep);
6950 return enc_EXCEPTION;
6951 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006952 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 memcpy(outstart + *outpos, repchars, repsize);
6954 *outpos += repsize;
6955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006956 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006957 Py_DECREF(rep);
6958 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959}
6960
6961/* handle an error in PyUnicode_EncodeCharmap
6962 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006963static int
6964charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006965 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006967 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006968 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969{
6970 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006971 Py_ssize_t repsize;
6972 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006973 Py_UNICODE *uni2;
6974 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006975 Py_ssize_t collstartpos = *inpos;
6976 Py_ssize_t collendpos = *inpos+1;
6977 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 char *encoding = "charmap";
6979 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006980 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 /* find all unencodable characters */
6983 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006984 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006985 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 int res = encoding_map_lookup(p[collendpos], mapping);
6987 if (res != -1)
6988 break;
6989 ++collendpos;
6990 continue;
6991 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 rep = charmapencode_lookup(p[collendpos], mapping);
6994 if (rep==NULL)
6995 return -1;
6996 else if (rep!=Py_None) {
6997 Py_DECREF(rep);
6998 break;
6999 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 }
7003 /* cache callback name lookup
7004 * (if not done yet, i.e. it's the first error) */
7005 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 if ((errors==NULL) || (!strcmp(errors, "strict")))
7007 *known_errorHandler = 1;
7008 else if (!strcmp(errors, "replace"))
7009 *known_errorHandler = 2;
7010 else if (!strcmp(errors, "ignore"))
7011 *known_errorHandler = 3;
7012 else if (!strcmp(errors, "xmlcharrefreplace"))
7013 *known_errorHandler = 4;
7014 else
7015 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 }
7017 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007018 case 1: /* strict */
7019 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7020 return -1;
7021 case 2: /* replace */
7022 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 x = charmapencode_output('?', mapping, res, respos);
7024 if (x==enc_EXCEPTION) {
7025 return -1;
7026 }
7027 else if (x==enc_FAILED) {
7028 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7029 return -1;
7030 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007031 }
7032 /* fall through */
7033 case 3: /* ignore */
7034 *inpos = collendpos;
7035 break;
7036 case 4: /* xmlcharrefreplace */
7037 /* generate replacement (temporarily (mis)uses p) */
7038 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 char buffer[2+29+1+1];
7040 char *cp;
7041 sprintf(buffer, "&#%d;", (int)p[collpos]);
7042 for (cp = buffer; *cp; ++cp) {
7043 x = charmapencode_output(*cp, mapping, res, respos);
7044 if (x==enc_EXCEPTION)
7045 return -1;
7046 else if (x==enc_FAILED) {
7047 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7048 return -1;
7049 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007050 }
7051 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007052 *inpos = collendpos;
7053 break;
7054 default:
7055 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 encoding, reason, p, size, exceptionObject,
7057 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007060 if (PyBytes_Check(repunicode)) {
7061 /* Directly copy bytes result to output. */
7062 Py_ssize_t outsize = PyBytes_Size(*res);
7063 Py_ssize_t requiredsize;
7064 repsize = PyBytes_Size(repunicode);
7065 requiredsize = *respos + repsize;
7066 if (requiredsize > outsize)
7067 /* Make room for all additional bytes. */
7068 if (charmapencode_resize(res, respos, requiredsize)) {
7069 Py_DECREF(repunicode);
7070 return -1;
7071 }
7072 memcpy(PyBytes_AsString(*res) + *respos,
7073 PyBytes_AsString(repunicode), repsize);
7074 *respos += repsize;
7075 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007076 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007077 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007078 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007079 /* generate replacement */
7080 repsize = PyUnicode_GET_SIZE(repunicode);
7081 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 x = charmapencode_output(*uni2, mapping, res, respos);
7083 if (x==enc_EXCEPTION) {
7084 return -1;
7085 }
7086 else if (x==enc_FAILED) {
7087 Py_DECREF(repunicode);
7088 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7089 return -1;
7090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007091 }
7092 *inpos = newpos;
7093 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 }
7095 return 0;
7096}
7097
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
7099PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7100 Py_ssize_t size,
7101 PyObject *mapping,
7102 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104 /* output object */
7105 PyObject *res = NULL;
7106 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007107 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007109 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007110 PyObject *errorHandler = NULL;
7111 PyObject *exc = NULL;
7112 /* the following variable is used for caching string comparisons
7113 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7114 * 3=ignore, 4=xmlcharrefreplace */
7115 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116
7117 /* Default to Latin-1 */
7118 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121 /* allocate enough for a simple encoding without
7122 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007123 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124 if (res == NULL)
7125 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007126 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007129 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 /* try to encode it */
7131 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7132 if (x==enc_EXCEPTION) /* error */
7133 goto onError;
7134 if (x==enc_FAILED) { /* unencodable character */
7135 if (charmap_encoding_error(p, size, &inpos, mapping,
7136 &exc,
7137 &known_errorHandler, &errorHandler, errors,
7138 &res, &respos)) {
7139 goto onError;
7140 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007141 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 else
7143 /* done with this character => adjust input position */
7144 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007148 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007149 if (_PyBytes_Resize(&res, respos) < 0)
7150 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007152 Py_XDECREF(exc);
7153 Py_XDECREF(errorHandler);
7154 return res;
7155
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007157 Py_XDECREF(res);
7158 Py_XDECREF(exc);
7159 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 return NULL;
7161}
7162
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163PyObject *
7164PyUnicode_AsCharmapString(PyObject *unicode,
7165 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166{
7167 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 PyErr_BadArgument();
7169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 }
7171 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 PyUnicode_GET_SIZE(unicode),
7173 mapping,
7174 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175}
7176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007178static void
7179make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007180 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007181 Py_ssize_t startpos, Py_ssize_t endpos,
7182 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007185 *exceptionObject = _PyUnicodeTranslateError_Create(
7186 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 }
7188 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7190 goto onError;
7191 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7192 goto onError;
7193 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7194 goto onError;
7195 return;
7196 onError:
7197 Py_DECREF(*exceptionObject);
7198 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 }
7200}
7201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007203static void
7204raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007205 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007206 Py_ssize_t startpos, Py_ssize_t endpos,
7207 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208{
7209 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007210 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213}
7214
7215/* error handling callback helper:
7216 build arguments, call the callback and check the arguments,
7217 put the result into newpos and return the replacement string, which
7218 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007219static PyObject *
7220unicode_translate_call_errorhandler(const char *errors,
7221 PyObject **errorHandler,
7222 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007224 Py_ssize_t startpos, Py_ssize_t endpos,
7225 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007227 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007229 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230 PyObject *restuple;
7231 PyObject *resunicode;
7232
7233 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007237 }
7238
7239 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007240 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243
7244 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007246 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007248 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007249 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 Py_DECREF(restuple);
7251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 }
7253 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 &resunicode, &i_newpos)) {
7255 Py_DECREF(restuple);
7256 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007260 else
7261 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7264 Py_DECREF(restuple);
7265 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007266 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 Py_INCREF(resunicode);
7268 Py_DECREF(restuple);
7269 return resunicode;
7270}
7271
7272/* Lookup the character ch in the mapping and put the result in result,
7273 which must be decrefed by the caller.
7274 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277{
Christian Heimes217cfd12007-12-02 14:31:20 +00007278 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007279 PyObject *x;
7280
7281 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283 x = PyObject_GetItem(mapping, w);
7284 Py_DECREF(w);
7285 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7287 /* No mapping found means: use 1:1 mapping. */
7288 PyErr_Clear();
7289 *result = NULL;
7290 return 0;
7291 } else
7292 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 }
7294 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 *result = x;
7296 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007298 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 long value = PyLong_AS_LONG(x);
7300 long max = PyUnicode_GetMax();
7301 if (value < 0 || value > max) {
7302 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007303 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 Py_DECREF(x);
7305 return -1;
7306 }
7307 *result = x;
7308 return 0;
7309 }
7310 else if (PyUnicode_Check(x)) {
7311 *result = x;
7312 return 0;
7313 }
7314 else {
7315 /* wrong return value */
7316 PyErr_SetString(PyExc_TypeError,
7317 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007318 Py_DECREF(x);
7319 return -1;
7320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321}
7322/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 if not reallocate and adjust various state variables.
7324 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007325static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007326charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007330 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 /* exponentially overallocate to minimize reallocations */
7332 if (requiredsize < 2 * oldsize)
7333 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007334 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7335 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007337 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338 }
7339 return 0;
7340}
7341/* lookup the character, put the result in the output string and adjust
7342 various state variables. Return a new reference to the object that
7343 was put in the output buffer in *result, or Py_None, if the mapping was
7344 undefined (in which case no character was written).
7345 The called must decref result.
7346 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007347static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007348charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7349 PyObject *mapping, Py_UCS4 **output,
7350 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007351 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007353 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7354 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007356 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007358 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007359 }
7360 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007362 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007365 }
7366 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 Py_ssize_t repsize;
7368 if (PyUnicode_READY(*res) == -1)
7369 return -1;
7370 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 if (repsize==1) {
7372 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007373 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 }
7375 else if (repsize!=0) {
7376 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 Py_ssize_t requiredsize = *opos +
7378 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380 Py_ssize_t i;
7381 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007383 for(i = 0; i < repsize; i++)
7384 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007386 }
7387 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 return 0;
7390}
7391
Alexander Belopolsky40018472011-02-26 01:02:56 +00007392PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393_PyUnicode_TranslateCharmap(PyObject *input,
7394 PyObject *mapping,
7395 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007397 /* input object */
7398 char *idata;
7399 Py_ssize_t size, i;
7400 int kind;
7401 /* output buffer */
7402 Py_UCS4 *output = NULL;
7403 Py_ssize_t osize;
7404 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007405 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007406 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 char *reason = "character maps to <undefined>";
7408 PyObject *errorHandler = NULL;
7409 PyObject *exc = NULL;
7410 /* the following variable is used for caching string comparisons
7411 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7412 * 3=ignore, 4=xmlcharrefreplace */
7413 int known_errorHandler = -1;
7414
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 PyErr_BadArgument();
7417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 if (PyUnicode_READY(input) == -1)
7421 return NULL;
7422 idata = (char*)PyUnicode_DATA(input);
7423 kind = PyUnicode_KIND(input);
7424 size = PyUnicode_GET_LENGTH(input);
7425 i = 0;
7426
7427 if (size == 0) {
7428 Py_INCREF(input);
7429 return input;
7430 }
7431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007432 /* allocate enough for a simple 1:1 translation without
7433 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007434 osize = size;
7435 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7436 opos = 0;
7437 if (output == NULL) {
7438 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007442 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* try to encode it */
7444 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 if (charmaptranslate_output(input, i, mapping,
7446 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 Py_XDECREF(x);
7448 goto onError;
7449 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007450 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 else { /* untranslatable character */
7454 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7455 Py_ssize_t repsize;
7456 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459 Py_ssize_t collstart = i;
7460 Py_ssize_t collend = i+1;
7461 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007464 while (collend < size) {
7465 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
7467 Py_XDECREF(x);
7468 if (x!=Py_None)
7469 break;
7470 ++collend;
7471 }
7472 /* cache callback name lookup
7473 * (if not done yet, i.e. it's the first error) */
7474 if (known_errorHandler==-1) {
7475 if ((errors==NULL) || (!strcmp(errors, "strict")))
7476 known_errorHandler = 1;
7477 else if (!strcmp(errors, "replace"))
7478 known_errorHandler = 2;
7479 else if (!strcmp(errors, "ignore"))
7480 known_errorHandler = 3;
7481 else if (!strcmp(errors, "xmlcharrefreplace"))
7482 known_errorHandler = 4;
7483 else
7484 known_errorHandler = 0;
7485 }
7486 switch (known_errorHandler) {
7487 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 raise_translate_exception(&exc, input, collstart,
7489 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007490 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 case 2: /* replace */
7492 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007493 for (coll = collstart; coll<collend; coll++)
7494 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* fall through */
7496 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007497 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 break;
7499 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007500 /* generate replacement (temporarily (mis)uses i) */
7501 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 char buffer[2+29+1+1];
7503 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7505 if (charmaptranslate_makespace(&output, &osize,
7506 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 goto onError;
7508 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007509 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007511 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 break;
7513 default:
7514 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007515 reason, input, &exc,
7516 collstart, collend, &newpos);
7517 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 goto onError;
7519 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 repsize = PyUnicode_GET_LENGTH(repunicode);
7521 if (charmaptranslate_makespace(&output, &osize,
7522 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 Py_DECREF(repunicode);
7524 goto onError;
7525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007526 for (uni2 = 0; repsize-->0; ++uni2)
7527 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7528 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007530 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007531 }
7532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007533 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7534 if (!res)
7535 goto onError;
7536 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 Py_XDECREF(exc);
7538 Py_XDECREF(errorHandler);
7539 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543 Py_XDECREF(exc);
7544 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 return NULL;
7546}
7547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007548/* Deprecated. Use PyUnicode_Translate instead. */
7549PyObject *
7550PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7551 Py_ssize_t size,
7552 PyObject *mapping,
7553 const char *errors)
7554{
7555 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7556 if (!unicode)
7557 return NULL;
7558 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7559}
7560
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561PyObject *
7562PyUnicode_Translate(PyObject *str,
7563 PyObject *mapping,
7564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565{
7566 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007567
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 str = PyUnicode_FromObject(str);
7569 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007571 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 Py_DECREF(str);
7573 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007574
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 Py_XDECREF(str);
7577 return NULL;
7578}
Tim Petersced69f82003-09-16 20:30:58 +00007579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007580static Py_UCS4
7581fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7582{
7583 /* No need to call PyUnicode_READY(self) because this function is only
7584 called as a callback from fixup() which does it already. */
7585 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7586 const int kind = PyUnicode_KIND(self);
7587 void *data = PyUnicode_DATA(self);
7588 Py_UCS4 maxchar = 0, ch, fixed;
7589 Py_ssize_t i;
7590
7591 for (i = 0; i < len; ++i) {
7592 ch = PyUnicode_READ(kind, data, i);
7593 fixed = 0;
7594 if (ch > 127) {
7595 if (Py_UNICODE_ISSPACE(ch))
7596 fixed = ' ';
7597 else {
7598 const int decimal = Py_UNICODE_TODECIMAL(ch);
7599 if (decimal >= 0)
7600 fixed = '0' + decimal;
7601 }
7602 if (fixed != 0) {
7603 if (fixed > maxchar)
7604 maxchar = fixed;
7605 PyUnicode_WRITE(kind, data, i, fixed);
7606 }
7607 else if (ch > maxchar)
7608 maxchar = ch;
7609 }
7610 else if (ch > maxchar)
7611 maxchar = ch;
7612 }
7613
7614 return maxchar;
7615}
7616
7617PyObject *
7618_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7619{
7620 if (!PyUnicode_Check(unicode)) {
7621 PyErr_BadInternalCall();
7622 return NULL;
7623 }
7624 if (PyUnicode_READY(unicode) == -1)
7625 return NULL;
7626 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7627 /* If the string is already ASCII, just return the same string */
7628 Py_INCREF(unicode);
7629 return unicode;
7630 }
7631 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7632}
7633
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007634PyObject *
7635PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7636 Py_ssize_t length)
7637{
7638 PyObject *result;
7639 Py_UNICODE *p; /* write pointer into result */
7640 Py_ssize_t i;
7641 /* Copy to a new string */
7642 result = (PyObject *)_PyUnicode_New(length);
7643 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7644 if (result == NULL)
7645 return result;
7646 p = PyUnicode_AS_UNICODE(result);
7647 /* Iterate over code points */
7648 for (i = 0; i < length; i++) {
7649 Py_UNICODE ch =s[i];
7650 if (ch > 127) {
7651 int decimal = Py_UNICODE_TODECIMAL(ch);
7652 if (decimal >= 0)
7653 p[i] = '0' + decimal;
7654 }
7655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007656 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7657 Py_DECREF(result);
7658 return NULL;
7659 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007660 return result;
7661}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007662/* --- Decimal Encoder ---------------------------------------------------- */
7663
Alexander Belopolsky40018472011-02-26 01:02:56 +00007664int
7665PyUnicode_EncodeDecimal(Py_UNICODE *s,
7666 Py_ssize_t length,
7667 char *output,
7668 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007669{
7670 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671 PyObject *errorHandler = NULL;
7672 PyObject *exc = NULL;
7673 const char *encoding = "decimal";
7674 const char *reason = "invalid decimal Unicode string";
7675 /* the following variable is used for caching string comparisons
7676 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7677 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007678
7679 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 PyErr_BadArgument();
7681 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007682 }
7683
7684 p = s;
7685 end = s + length;
7686 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 register Py_UNICODE ch = *p;
7688 int decimal;
7689 PyObject *repunicode;
7690 Py_ssize_t repsize;
7691 Py_ssize_t newpos;
7692 Py_UNICODE *uni2;
7693 Py_UNICODE *collstart;
7694 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007695
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 ++p;
7699 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 decimal = Py_UNICODE_TODECIMAL(ch);
7702 if (decimal >= 0) {
7703 *output++ = '0' + decimal;
7704 ++p;
7705 continue;
7706 }
7707 if (0 < ch && ch < 256) {
7708 *output++ = (char)ch;
7709 ++p;
7710 continue;
7711 }
7712 /* All other characters are considered unencodable */
7713 collstart = p;
7714 collend = p+1;
7715 while (collend < end) {
7716 if ((0 < *collend && *collend < 256) ||
7717 !Py_UNICODE_ISSPACE(*collend) ||
7718 Py_UNICODE_TODECIMAL(*collend))
7719 break;
7720 }
7721 /* cache callback name lookup
7722 * (if not done yet, i.e. it's the first error) */
7723 if (known_errorHandler==-1) {
7724 if ((errors==NULL) || (!strcmp(errors, "strict")))
7725 known_errorHandler = 1;
7726 else if (!strcmp(errors, "replace"))
7727 known_errorHandler = 2;
7728 else if (!strcmp(errors, "ignore"))
7729 known_errorHandler = 3;
7730 else if (!strcmp(errors, "xmlcharrefreplace"))
7731 known_errorHandler = 4;
7732 else
7733 known_errorHandler = 0;
7734 }
7735 switch (known_errorHandler) {
7736 case 1: /* strict */
7737 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7738 goto onError;
7739 case 2: /* replace */
7740 for (p = collstart; p < collend; ++p)
7741 *output++ = '?';
7742 /* fall through */
7743 case 3: /* ignore */
7744 p = collend;
7745 break;
7746 case 4: /* xmlcharrefreplace */
7747 /* generate replacement (temporarily (mis)uses p) */
7748 for (p = collstart; p < collend; ++p)
7749 output += sprintf(output, "&#%d;", (int)*p);
7750 p = collend;
7751 break;
7752 default:
7753 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7754 encoding, reason, s, length, &exc,
7755 collstart-s, collend-s, &newpos);
7756 if (repunicode == NULL)
7757 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007758 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007759 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007760 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7761 Py_DECREF(repunicode);
7762 goto onError;
7763 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 /* generate replacement */
7765 repsize = PyUnicode_GET_SIZE(repunicode);
7766 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7767 Py_UNICODE ch = *uni2;
7768 if (Py_UNICODE_ISSPACE(ch))
7769 *output++ = ' ';
7770 else {
7771 decimal = Py_UNICODE_TODECIMAL(ch);
7772 if (decimal >= 0)
7773 *output++ = '0' + decimal;
7774 else if (0 < ch && ch < 256)
7775 *output++ = (char)ch;
7776 else {
7777 Py_DECREF(repunicode);
7778 raise_encode_exception(&exc, encoding,
7779 s, length, collstart-s, collend-s, reason);
7780 goto onError;
7781 }
7782 }
7783 }
7784 p = s + newpos;
7785 Py_DECREF(repunicode);
7786 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007787 }
7788 /* 0-terminate the output string */
7789 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790 Py_XDECREF(exc);
7791 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007792 return 0;
7793
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 Py_XDECREF(exc);
7796 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007797 return -1;
7798}
7799
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800/* --- Helpers ------------------------------------------------------------ */
7801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802#include "stringlib/ucs1lib.h"
7803#include "stringlib/fastsearch.h"
7804#include "stringlib/partition.h"
7805#include "stringlib/split.h"
7806#include "stringlib/count.h"
7807#include "stringlib/find.h"
7808#include "stringlib/localeutil.h"
7809#include "stringlib/undef.h"
7810
7811#include "stringlib/ucs2lib.h"
7812#include "stringlib/fastsearch.h"
7813#include "stringlib/partition.h"
7814#include "stringlib/split.h"
7815#include "stringlib/count.h"
7816#include "stringlib/find.h"
7817#include "stringlib/localeutil.h"
7818#include "stringlib/undef.h"
7819
7820#include "stringlib/ucs4lib.h"
7821#include "stringlib/fastsearch.h"
7822#include "stringlib/partition.h"
7823#include "stringlib/split.h"
7824#include "stringlib/count.h"
7825#include "stringlib/find.h"
7826#include "stringlib/localeutil.h"
7827#include "stringlib/undef.h"
7828
7829static Py_ssize_t
7830any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7831 const Py_UCS1*, Py_ssize_t,
7832 Py_ssize_t, Py_ssize_t),
7833 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7834 const Py_UCS2*, Py_ssize_t,
7835 Py_ssize_t, Py_ssize_t),
7836 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7837 const Py_UCS4*, Py_ssize_t,
7838 Py_ssize_t, Py_ssize_t),
7839 PyObject* s1, PyObject* s2,
7840 Py_ssize_t start,
7841 Py_ssize_t end)
7842{
7843 int kind1, kind2, kind;
7844 void *buf1, *buf2;
7845 Py_ssize_t len1, len2, result;
7846
7847 kind1 = PyUnicode_KIND(s1);
7848 kind2 = PyUnicode_KIND(s2);
7849 kind = kind1 > kind2 ? kind1 : kind2;
7850 buf1 = PyUnicode_DATA(s1);
7851 buf2 = PyUnicode_DATA(s2);
7852 if (kind1 != kind)
7853 buf1 = _PyUnicode_AsKind(s1, kind);
7854 if (!buf1)
7855 return -2;
7856 if (kind2 != kind)
7857 buf2 = _PyUnicode_AsKind(s2, kind);
7858 if (!buf2) {
7859 if (kind1 != kind) PyMem_Free(buf1);
7860 return -2;
7861 }
7862 len1 = PyUnicode_GET_LENGTH(s1);
7863 len2 = PyUnicode_GET_LENGTH(s2);
7864
7865 switch(kind) {
7866 case PyUnicode_1BYTE_KIND:
7867 result = ucs1(buf1, len1, buf2, len2, start, end);
7868 break;
7869 case PyUnicode_2BYTE_KIND:
7870 result = ucs2(buf1, len1, buf2, len2, start, end);
7871 break;
7872 case PyUnicode_4BYTE_KIND:
7873 result = ucs4(buf1, len1, buf2, len2, start, end);
7874 break;
7875 default:
7876 assert(0); result = -2;
7877 }
7878
7879 if (kind1 != kind)
7880 PyMem_Free(buf1);
7881 if (kind2 != kind)
7882 PyMem_Free(buf2);
7883
7884 return result;
7885}
7886
7887Py_ssize_t
7888_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7889 Py_ssize_t n_buffer,
7890 void *digits, Py_ssize_t n_digits,
7891 Py_ssize_t min_width,
7892 const char *grouping,
7893 const char *thousands_sep)
7894{
7895 switch(kind) {
7896 case PyUnicode_1BYTE_KIND:
7897 return _PyUnicode_ucs1_InsertThousandsGrouping(
7898 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7899 min_width, grouping, thousands_sep);
7900 case PyUnicode_2BYTE_KIND:
7901 return _PyUnicode_ucs2_InsertThousandsGrouping(
7902 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7903 min_width, grouping, thousands_sep);
7904 case PyUnicode_4BYTE_KIND:
7905 return _PyUnicode_ucs4_InsertThousandsGrouping(
7906 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7907 min_width, grouping, thousands_sep);
7908 }
7909 assert(0);
7910 return -1;
7911}
7912
7913
Eric Smith8c663262007-08-25 02:26:07 +00007914#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007915#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007916
Thomas Wouters477c8d52006-05-27 19:21:47 +00007917#include "stringlib/count.h"
7918#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007919
/* helper macro to fixup start/end slice values.

   end is clamped to len; a negative end or start has len added to it and
   is then floored at 0.  start and end are assigned to, so they must be
   plain lvalues.

   NOTE: the expansion is a sequence of bare if-statements (not wrapped in
   do { } while (0)) and each argument is evaluated more than once, so use
   it only as a complete statement of its own and pass side-effect-free
   expressions. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007934
Alexander Belopolsky40018472011-02-26 01:02:56 +00007935Py_ssize_t
7936PyUnicode_Count(PyObject *str,
7937 PyObject *substr,
7938 Py_ssize_t start,
7939 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007941 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007942 PyUnicodeObject* str_obj;
7943 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 int kind1, kind2, kind;
7945 void *buf1 = NULL, *buf2 = NULL;
7946 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007947
Thomas Wouters477c8d52006-05-27 19:21:47 +00007948 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007951 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007952 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 Py_DECREF(str_obj);
7954 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 }
Tim Petersced69f82003-09-16 20:30:58 +00007956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007957 kind1 = PyUnicode_KIND(str_obj);
7958 kind2 = PyUnicode_KIND(sub_obj);
7959 kind = kind1 > kind2 ? kind1 : kind2;
7960 buf1 = PyUnicode_DATA(str_obj);
7961 if (kind1 != kind)
7962 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7963 if (!buf1)
7964 goto onError;
7965 buf2 = PyUnicode_DATA(sub_obj);
7966 if (kind2 != kind)
7967 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7968 if (!buf2)
7969 goto onError;
7970 len1 = PyUnicode_GET_LENGTH(str_obj);
7971 len2 = PyUnicode_GET_LENGTH(sub_obj);
7972
7973 ADJUST_INDICES(start, end, len1);
7974 switch(kind) {
7975 case PyUnicode_1BYTE_KIND:
7976 result = ucs1lib_count(
7977 ((Py_UCS1*)buf1) + start, end - start,
7978 buf2, len2, PY_SSIZE_T_MAX
7979 );
7980 break;
7981 case PyUnicode_2BYTE_KIND:
7982 result = ucs2lib_count(
7983 ((Py_UCS2*)buf1) + start, end - start,
7984 buf2, len2, PY_SSIZE_T_MAX
7985 );
7986 break;
7987 case PyUnicode_4BYTE_KIND:
7988 result = ucs4lib_count(
7989 ((Py_UCS4*)buf1) + start, end - start,
7990 buf2, len2, PY_SSIZE_T_MAX
7991 );
7992 break;
7993 default:
7994 assert(0); result = 0;
7995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007996
7997 Py_DECREF(sub_obj);
7998 Py_DECREF(str_obj);
7999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008000 if (kind1 != kind)
8001 PyMem_Free(buf1);
8002 if (kind2 != kind)
8003 PyMem_Free(buf2);
8004
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 onError:
8007 Py_DECREF(sub_obj);
8008 Py_DECREF(str_obj);
8009 if (kind1 != kind && buf1)
8010 PyMem_Free(buf1);
8011 if (kind2 != kind && buf2)
8012 PyMem_Free(buf2);
8013 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014}
8015
Alexander Belopolsky40018472011-02-26 01:02:56 +00008016Py_ssize_t
8017PyUnicode_Find(PyObject *str,
8018 PyObject *sub,
8019 Py_ssize_t start,
8020 Py_ssize_t end,
8021 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008023 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008024
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008028 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 Py_DECREF(str);
8031 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
Tim Petersced69f82003-09-16 20:30:58 +00008033
Thomas Wouters477c8d52006-05-27 19:21:47 +00008034 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 result = any_find_slice(
8036 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8037 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008038 );
8039 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 result = any_find_slice(
8041 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8042 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008043 );
8044
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008046 Py_DECREF(sub);
8047
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return result;
8049}
8050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008051Py_ssize_t
8052PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8053 Py_ssize_t start, Py_ssize_t end,
8054 int direction)
8055{
8056 char *result;
8057 int kind;
8058 if (PyUnicode_READY(str) == -1)
8059 return -2;
8060 if (end > PyUnicode_GET_LENGTH(str))
8061 end = PyUnicode_GET_LENGTH(str);
8062 kind = PyUnicode_KIND(str);
8063 result = findchar(PyUnicode_1BYTE_DATA(str)
8064 + PyUnicode_KIND_SIZE(kind, start),
8065 kind,
8066 end-start, ch, direction);
8067 if (!result)
8068 return -1;
8069 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8070}
8071
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static int
8073tailmatch(PyUnicodeObject *self,
8074 PyUnicodeObject *substring,
8075 Py_ssize_t start,
8076 Py_ssize_t end,
8077 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 int kind_self;
8080 int kind_sub;
8081 void *data_self;
8082 void *data_sub;
8083 Py_ssize_t offset;
8084 Py_ssize_t i;
8085 Py_ssize_t end_sub;
8086
8087 if (PyUnicode_READY(self) == -1 ||
8088 PyUnicode_READY(substring) == -1)
8089 return 0;
8090
8091 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 return 1;
8093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8095 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 kind_self = PyUnicode_KIND(self);
8100 data_self = PyUnicode_DATA(self);
8101 kind_sub = PyUnicode_KIND(substring);
8102 data_sub = PyUnicode_DATA(substring);
8103 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8104
8105 if (direction > 0)
8106 offset = end;
8107 else
8108 offset = start;
8109
8110 if (PyUnicode_READ(kind_self, data_self, offset) ==
8111 PyUnicode_READ(kind_sub, data_sub, 0) &&
8112 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8113 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8114 /* If both are of the same kind, memcmp is sufficient */
8115 if (kind_self == kind_sub) {
8116 return ! memcmp((char *)data_self +
8117 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8118 data_sub,
8119 PyUnicode_GET_LENGTH(substring) *
8120 PyUnicode_CHARACTER_SIZE(substring));
8121 }
8122 /* otherwise we have to compare each character by first accesing it */
8123 else {
8124 /* We do not need to compare 0 and len(substring)-1 because
8125 the if statement above ensured already that they are equal
8126 when we end up here. */
8127 // TODO: honor direction and do a forward or backwards search
8128 for (i = 1; i < end_sub; ++i) {
8129 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8130 PyUnicode_READ(kind_sub, data_sub, i))
8131 return 0;
8132 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 }
8136
8137 return 0;
8138}
8139
Alexander Belopolsky40018472011-02-26 01:02:56 +00008140Py_ssize_t
8141PyUnicode_Tailmatch(PyObject *str,
8142 PyObject *substr,
8143 Py_ssize_t start,
8144 Py_ssize_t end,
8145 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008147 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008148
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 str = PyUnicode_FromObject(str);
8150 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 substr = PyUnicode_FromObject(substr);
8153 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 Py_DECREF(str);
8155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 }
Tim Petersced69f82003-09-16 20:30:58 +00008157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 (PyUnicodeObject *)substr,
8160 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 Py_DECREF(str);
8162 Py_DECREF(substr);
8163 return result;
8164}
8165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166/* Apply fixfct filter to the Unicode object self and return a
8167 reference to the modified object */
8168
Alexander Belopolsky40018472011-02-26 01:02:56 +00008169static PyObject *
8170fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 PyObject *u;
8174 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 if (PyUnicode_READY(self) == -1)
8177 return NULL;
8178 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8179 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8180 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8185 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 /* fix functions return the new maximum character in a string,
8188 if the kind of the resulting unicode object does not change,
8189 everything is fine. Otherwise we need to change the string kind
8190 and re-run the fix function. */
8191 maxchar_new = fixfct((PyUnicodeObject*)u);
8192 if (maxchar_new == 0)
8193 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8194 else if (maxchar_new <= 127)
8195 maxchar_new = 127;
8196 else if (maxchar_new <= 255)
8197 maxchar_new = 255;
8198 else if (maxchar_new <= 65535)
8199 maxchar_new = 65535;
8200 else
8201 maxchar_new = 1114111; /* 0x10ffff */
8202
8203 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 /* fixfct should return TRUE if it modified the buffer. If
8205 FALSE, return a reference to the original buffer instead
8206 (to save space, not time) */
8207 Py_INCREF(self);
8208 Py_DECREF(u);
8209 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 else if (maxchar_new == maxchar_old) {
8212 return u;
8213 }
8214 else {
8215 /* In case the maximum character changed, we need to
8216 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008217 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 if (v == NULL) {
8219 Py_DECREF(u);
8220 return NULL;
8221 }
8222 if (maxchar_new > maxchar_old) {
8223 /* If the maxchar increased so that the kind changed, not all
8224 characters are representable anymore and we need to fix the
8225 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008226 if (PyUnicode_CopyCharacters(v, 0,
8227 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008228 PyUnicode_GET_LENGTH(self)) < 0)
8229 {
8230 Py_DECREF(u);
8231 return NULL;
8232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 maxchar_old = fixfct((PyUnicodeObject*)v);
8234 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8235 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008236 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008237 if (PyUnicode_CopyCharacters(v, 0,
8238 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008239 PyUnicode_GET_LENGTH(self)) < 0)
8240 {
8241 Py_DECREF(u);
8242 return NULL;
8243 }
8244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245
8246 Py_DECREF(u);
8247 return v;
8248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249}
8250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008252fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 /* No need to call PyUnicode_READY(self) because this function is only
8255 called as a callback from fixup() which does it already. */
8256 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8257 const int kind = PyUnicode_KIND(self);
8258 void *data = PyUnicode_DATA(self);
8259 int touched = 0;
8260 Py_UCS4 maxchar = 0;
8261 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 for (i = 0; i < len; ++i) {
8264 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8265 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8266 if (up != ch) {
8267 if (up > maxchar)
8268 maxchar = up;
8269 PyUnicode_WRITE(kind, data, i, up);
8270 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 else if (ch > maxchar)
8273 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
8275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 if (touched)
8277 return maxchar;
8278 else
8279 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280}
8281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008283fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8286 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8287 const int kind = PyUnicode_KIND(self);
8288 void *data = PyUnicode_DATA(self);
8289 int touched = 0;
8290 Py_UCS4 maxchar = 0;
8291 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 for(i = 0; i < len; ++i) {
8294 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8295 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8296 if (lo != ch) {
8297 if (lo > maxchar)
8298 maxchar = lo;
8299 PyUnicode_WRITE(kind, data, i, lo);
8300 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 else if (ch > maxchar)
8303 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 if (touched)
8307 return maxchar;
8308 else
8309 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310}
8311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8316 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8317 const int kind = PyUnicode_KIND(self);
8318 void *data = PyUnicode_DATA(self);
8319 int touched = 0;
8320 Py_UCS4 maxchar = 0;
8321 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 for(i = 0; i < len; ++i) {
8324 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8325 Py_UCS4 nu = 0;
8326
8327 if (Py_UNICODE_ISUPPER(ch))
8328 nu = Py_UNICODE_TOLOWER(ch);
8329 else if (Py_UNICODE_ISLOWER(ch))
8330 nu = Py_UNICODE_TOUPPER(ch);
8331
8332 if (nu != 0) {
8333 if (nu > maxchar)
8334 maxchar = nu;
8335 PyUnicode_WRITE(kind, data, i, nu);
8336 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 else if (ch > maxchar)
8339 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
8341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 if (touched)
8343 return maxchar;
8344 else
8345 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346}
8347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008349fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8352 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8353 const int kind = PyUnicode_KIND(self);
8354 void *data = PyUnicode_DATA(self);
8355 int touched = 0;
8356 Py_UCS4 maxchar = 0;
8357 Py_ssize_t i = 0;
8358 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008359
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008360 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362
8363 ch = PyUnicode_READ(kind, data, i);
8364 if (!Py_UNICODE_ISUPPER(ch)) {
8365 maxchar = Py_UNICODE_TOUPPER(ch);
8366 PyUnicode_WRITE(kind, data, i, maxchar);
8367 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 ++i;
8370 for(; i < len; ++i) {
8371 ch = PyUnicode_READ(kind, data, i);
8372 if (!Py_UNICODE_ISLOWER(ch)) {
8373 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8374 if (lo > maxchar)
8375 maxchar = lo;
8376 PyUnicode_WRITE(kind, data, i, lo);
8377 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 else if (ch > maxchar)
8380 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382
8383 if (touched)
8384 return maxchar;
8385 else
8386 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387}
8388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008390fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8393 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8394 const int kind = PyUnicode_KIND(self);
8395 void *data = PyUnicode_DATA(self);
8396 Py_UCS4 maxchar = 0;
8397 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 int previous_is_cased;
8399
8400 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 if (len == 1) {
8402 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8403 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8404 if (ti != ch) {
8405 PyUnicode_WRITE(kind, data, i, ti);
8406 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 }
8408 else
8409 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 for(; i < len; ++i) {
8413 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8414 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008415
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 nu = Py_UNICODE_TOTITLE(ch);
8420
8421 if (nu > maxchar)
8422 maxchar = nu;
8423 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008424
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 if (Py_UNICODE_ISLOWER(ch) ||
8426 Py_UNICODE_ISUPPER(ch) ||
8427 Py_UNICODE_ISTITLE(ch))
8428 previous_is_cased = 1;
8429 else
8430 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433}
8434
Tim Peters8ce9f162004-08-27 01:49:32 +00008435PyObject *
8436PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008439 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008441 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008442 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8443 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008444 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 Py_ssize_t sz, i, res_offset;
8446 Py_UCS4 maxchar = 0;
8447 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448
Tim Peters05eba1f2004-08-27 21:32:02 +00008449 fseq = PySequence_Fast(seq, "");
8450 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008452 }
8453
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008454 /* NOTE: the following code can't call back into Python code,
8455 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008456 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008457
Tim Peters05eba1f2004-08-27 21:32:02 +00008458 seqlen = PySequence_Fast_GET_SIZE(fseq);
8459 /* If empty sequence, return u"". */
8460 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008463 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008464 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008465 /* If singleton sequence with an exact Unicode, return that. */
8466 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 item = items[0];
8468 if (PyUnicode_CheckExact(item)) {
8469 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 goto Done;
8472 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008473 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008474 else {
8475 /* Set up sep and seplen */
8476 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 /* fall back to a blank space separator */
8478 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008479 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008481 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008482 else {
8483 if (!PyUnicode_Check(separator)) {
8484 PyErr_Format(PyExc_TypeError,
8485 "separator: expected str instance,"
8486 " %.80s found",
8487 Py_TYPE(separator)->tp_name);
8488 goto onError;
8489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 if (PyUnicode_READY(separator) == -1)
8491 goto onError;
8492 sep = separator;
8493 seplen = PyUnicode_GET_LENGTH(separator);
8494 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8495 /* inc refcount to keep this code path symetric with the
8496 above case of a blank separator */
8497 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008498 }
8499 }
8500
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008501 /* There are at least two things to join, or else we have a subclass
8502 * of str in the sequence.
8503 * Do a pre-pass to figure out the total amount of space we'll
8504 * need (sz), and see whether all argument are strings.
8505 */
8506 sz = 0;
8507 for (i = 0; i < seqlen; i++) {
8508 const Py_ssize_t old_sz = sz;
8509 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 if (!PyUnicode_Check(item)) {
8511 PyErr_Format(PyExc_TypeError,
8512 "sequence item %zd: expected str instance,"
8513 " %.80s found",
8514 i, Py_TYPE(item)->tp_name);
8515 goto onError;
8516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 if (PyUnicode_READY(item) == -1)
8518 goto onError;
8519 sz += PyUnicode_GET_LENGTH(item);
8520 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8521 if (item_maxchar > maxchar)
8522 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008523 if (i != 0)
8524 sz += seplen;
8525 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8526 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008528 goto onError;
8529 }
8530 }
Tim Petersced69f82003-09-16 20:30:58 +00008531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008533 if (res == NULL)
8534 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008535
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008536 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008538 Py_ssize_t itemlen;
8539 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 /* Copy item, and maybe the separator. */
8542 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008543 if (PyUnicode_CopyCharacters(res, res_offset,
8544 sep, 0, seplen) < 0)
8545 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008548 if (PyUnicode_CopyCharacters(res, res_offset,
8549 item, 0, itemlen) < 0)
8550 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008554
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008556 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 Py_XDECREF(sep);
8558 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008561 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008563 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 return NULL;
8565}
8566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567#define FILL(kind, data, value, start, length) \
8568 do { \
8569 Py_ssize_t i_ = 0; \
8570 assert(kind != PyUnicode_WCHAR_KIND); \
8571 switch ((kind)) { \
8572 case PyUnicode_1BYTE_KIND: { \
8573 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8574 memset(to_, (unsigned char)value, length); \
8575 break; \
8576 } \
8577 case PyUnicode_2BYTE_KIND: { \
8578 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8579 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8580 break; \
8581 } \
8582 default: { \
8583 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8584 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8585 break; \
8586 } \
8587 } \
8588 } while (0)
8589
Alexander Belopolsky40018472011-02-26 01:02:56 +00008590static PyUnicodeObject *
8591pad(PyUnicodeObject *self,
8592 Py_ssize_t left,
8593 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 PyObject *u;
8597 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008598 int kind;
8599 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
8601 if (left < 0)
8602 left = 0;
8603 if (right < 0)
8604 right = 0;
8605
Tim Peters7a29bd52001-09-12 03:03:31 +00008606 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 Py_INCREF(self);
8608 return self;
8609 }
8610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8612 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008613 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8614 return NULL;
8615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8617 if (fill > maxchar)
8618 maxchar = fill;
8619 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008620 if (!u)
8621 return NULL;
8622
8623 kind = PyUnicode_KIND(u);
8624 data = PyUnicode_DATA(u);
8625 if (left)
8626 FILL(kind, data, fill, 0, left);
8627 if (right)
8628 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008629 if (PyUnicode_CopyCharacters(u, left,
8630 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008631 _PyUnicode_LENGTH(self)) < 0)
8632 {
8633 Py_DECREF(u);
8634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641PyObject *
8642PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645
8646 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 switch(PyUnicode_KIND(string)) {
8651 case PyUnicode_1BYTE_KIND:
8652 list = ucs1lib_splitlines(
8653 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8654 PyUnicode_GET_LENGTH(string), keepends);
8655 break;
8656 case PyUnicode_2BYTE_KIND:
8657 list = ucs2lib_splitlines(
8658 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8659 PyUnicode_GET_LENGTH(string), keepends);
8660 break;
8661 case PyUnicode_4BYTE_KIND:
8662 list = ucs4lib_splitlines(
8663 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8664 PyUnicode_GET_LENGTH(string), keepends);
8665 break;
8666 default:
8667 assert(0);
8668 list = 0;
8669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 Py_DECREF(string);
8671 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672}
8673
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674static PyObject *
8675split(PyUnicodeObject *self,
8676 PyUnicodeObject *substring,
8677 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 int kind1, kind2, kind;
8680 void *buf1, *buf2;
8681 Py_ssize_t len1, len2;
8682 PyObject* out;
8683
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008685 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 if (PyUnicode_READY(self) == -1)
8688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 if (substring == NULL)
8691 switch(PyUnicode_KIND(self)) {
8692 case PyUnicode_1BYTE_KIND:
8693 return ucs1lib_split_whitespace(
8694 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8695 PyUnicode_GET_LENGTH(self), maxcount
8696 );
8697 case PyUnicode_2BYTE_KIND:
8698 return ucs2lib_split_whitespace(
8699 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8700 PyUnicode_GET_LENGTH(self), maxcount
8701 );
8702 case PyUnicode_4BYTE_KIND:
8703 return ucs4lib_split_whitespace(
8704 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8705 PyUnicode_GET_LENGTH(self), maxcount
8706 );
8707 default:
8708 assert(0);
8709 return NULL;
8710 }
8711
8712 if (PyUnicode_READY(substring) == -1)
8713 return NULL;
8714
8715 kind1 = PyUnicode_KIND(self);
8716 kind2 = PyUnicode_KIND(substring);
8717 kind = kind1 > kind2 ? kind1 : kind2;
8718 buf1 = PyUnicode_DATA(self);
8719 buf2 = PyUnicode_DATA(substring);
8720 if (kind1 != kind)
8721 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8722 if (!buf1)
8723 return NULL;
8724 if (kind2 != kind)
8725 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8726 if (!buf2) {
8727 if (kind1 != kind) PyMem_Free(buf1);
8728 return NULL;
8729 }
8730 len1 = PyUnicode_GET_LENGTH(self);
8731 len2 = PyUnicode_GET_LENGTH(substring);
8732
8733 switch(kind) {
8734 case PyUnicode_1BYTE_KIND:
8735 out = ucs1lib_split(
8736 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8737 break;
8738 case PyUnicode_2BYTE_KIND:
8739 out = ucs2lib_split(
8740 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8741 break;
8742 case PyUnicode_4BYTE_KIND:
8743 out = ucs4lib_split(
8744 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8745 break;
8746 default:
8747 out = NULL;
8748 }
8749 if (kind1 != kind)
8750 PyMem_Free(buf1);
8751 if (kind2 != kind)
8752 PyMem_Free(buf2);
8753 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754}
8755
Alexander Belopolsky40018472011-02-26 01:02:56 +00008756static PyObject *
8757rsplit(PyUnicodeObject *self,
8758 PyUnicodeObject *substring,
8759 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 int kind1, kind2, kind;
8762 void *buf1, *buf2;
8763 Py_ssize_t len1, len2;
8764 PyObject* out;
8765
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008766 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008767 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 if (PyUnicode_READY(self) == -1)
8770 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 if (substring == NULL)
8773 switch(PyUnicode_KIND(self)) {
8774 case PyUnicode_1BYTE_KIND:
8775 return ucs1lib_rsplit_whitespace(
8776 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8777 PyUnicode_GET_LENGTH(self), maxcount
8778 );
8779 case PyUnicode_2BYTE_KIND:
8780 return ucs2lib_rsplit_whitespace(
8781 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8782 PyUnicode_GET_LENGTH(self), maxcount
8783 );
8784 case PyUnicode_4BYTE_KIND:
8785 return ucs4lib_rsplit_whitespace(
8786 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8787 PyUnicode_GET_LENGTH(self), maxcount
8788 );
8789 default:
8790 assert(0);
8791 return NULL;
8792 }
8793
8794 if (PyUnicode_READY(substring) == -1)
8795 return NULL;
8796
8797 kind1 = PyUnicode_KIND(self);
8798 kind2 = PyUnicode_KIND(substring);
8799 kind = kind1 > kind2 ? kind1 : kind2;
8800 buf1 = PyUnicode_DATA(self);
8801 buf2 = PyUnicode_DATA(substring);
8802 if (kind1 != kind)
8803 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8804 if (!buf1)
8805 return NULL;
8806 if (kind2 != kind)
8807 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8808 if (!buf2) {
8809 if (kind1 != kind) PyMem_Free(buf1);
8810 return NULL;
8811 }
8812 len1 = PyUnicode_GET_LENGTH(self);
8813 len2 = PyUnicode_GET_LENGTH(substring);
8814
8815 switch(kind) {
8816 case PyUnicode_1BYTE_KIND:
8817 out = ucs1lib_rsplit(
8818 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8819 break;
8820 case PyUnicode_2BYTE_KIND:
8821 out = ucs2lib_rsplit(
8822 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8823 break;
8824 case PyUnicode_4BYTE_KIND:
8825 out = ucs4lib_rsplit(
8826 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8827 break;
8828 default:
8829 out = NULL;
8830 }
8831 if (kind1 != kind)
8832 PyMem_Free(buf1);
8833 if (kind2 != kind)
8834 PyMem_Free(buf2);
8835 return out;
8836}
8837
8838static Py_ssize_t
8839anylib_find(int kind, void *buf1, Py_ssize_t len1,
8840 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8841{
8842 switch(kind) {
8843 case PyUnicode_1BYTE_KIND:
8844 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8845 case PyUnicode_2BYTE_KIND:
8846 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8847 case PyUnicode_4BYTE_KIND:
8848 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8849 }
8850 assert(0);
8851 return -1;
8852}
8853
8854static Py_ssize_t
8855anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8856 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8857{
8858 switch(kind) {
8859 case PyUnicode_1BYTE_KIND:
8860 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8861 case PyUnicode_2BYTE_KIND:
8862 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8863 case PyUnicode_4BYTE_KIND:
8864 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8865 }
8866 assert(0);
8867 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008868}
8869
Alexander Belopolsky40018472011-02-26 01:02:56 +00008870static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871replace(PyObject *self, PyObject *str1,
8872 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 PyObject *u;
8875 char *sbuf = PyUnicode_DATA(self);
8876 char *buf1 = PyUnicode_DATA(str1);
8877 char *buf2 = PyUnicode_DATA(str2);
8878 int srelease = 0, release1 = 0, release2 = 0;
8879 int skind = PyUnicode_KIND(self);
8880 int kind1 = PyUnicode_KIND(str1);
8881 int kind2 = PyUnicode_KIND(str2);
8882 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8883 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8884 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885
8886 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008889 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 if (skind < kind1)
8892 /* substring too wide to be present */
8893 goto nothing;
8894
8895 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008896 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008897 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008899 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008901 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 Py_UCS4 u1, u2, maxchar;
8903 int mayshrink, rkind;
8904 u1 = PyUnicode_READ_CHAR(str1, 0);
8905 if (!findchar(sbuf, PyUnicode_KIND(self),
8906 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008907 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 u2 = PyUnicode_READ_CHAR(str2, 0);
8909 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8910 /* Replacing u1 with u2 may cause a maxchar reduction in the
8911 result string. */
8912 mayshrink = maxchar > 127;
8913 if (u2 > maxchar) {
8914 maxchar = u2;
8915 mayshrink = 0;
8916 }
8917 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008918 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008920 if (PyUnicode_CopyCharacters(u, 0,
8921 (PyObject*)self, 0, slen) < 0)
8922 {
8923 Py_DECREF(u);
8924 return NULL;
8925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 rkind = PyUnicode_KIND(u);
8927 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8928 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929 if (--maxcount < 0)
8930 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 if (mayshrink) {
8934 PyObject *tmp = u;
8935 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8936 PyUnicode_GET_LENGTH(tmp));
8937 Py_DECREF(tmp);
8938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 int rkind = skind;
8941 char *res;
8942 if (kind1 < rkind) {
8943 /* widen substring */
8944 buf1 = _PyUnicode_AsKind(str1, rkind);
8945 if (!buf1) goto error;
8946 release1 = 1;
8947 }
8948 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008949 if (i < 0)
8950 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 if (rkind > kind2) {
8952 /* widen replacement */
8953 buf2 = _PyUnicode_AsKind(str2, rkind);
8954 if (!buf2) goto error;
8955 release2 = 1;
8956 }
8957 else if (rkind < kind2) {
8958 /* widen self and buf1 */
8959 rkind = kind2;
8960 if (release1) PyMem_Free(buf1);
8961 sbuf = _PyUnicode_AsKind(self, rkind);
8962 if (!sbuf) goto error;
8963 srelease = 1;
8964 buf1 = _PyUnicode_AsKind(str1, rkind);
8965 if (!buf1) goto error;
8966 release1 = 1;
8967 }
8968 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8969 if (!res) {
8970 PyErr_NoMemory();
8971 goto error;
8972 }
8973 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008974 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8976 buf2,
8977 PyUnicode_KIND_SIZE(rkind, len2));
8978 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008979
8980 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8982 slen-i,
8983 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008984 if (i == -1)
8985 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8987 buf2,
8988 PyUnicode_KIND_SIZE(rkind, len2));
8989 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991
8992 u = PyUnicode_FromKindAndData(rkind, res, slen);
8993 PyMem_Free(res);
8994 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 Py_ssize_t n, i, j, ires;
8999 Py_ssize_t product, new_size;
9000 int rkind = skind;
9001 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 if (kind1 < rkind) {
9004 buf1 = _PyUnicode_AsKind(str1, rkind);
9005 if (!buf1) goto error;
9006 release1 = 1;
9007 }
9008 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009 if (n == 0)
9010 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (kind2 < rkind) {
9012 buf2 = _PyUnicode_AsKind(str2, rkind);
9013 if (!buf2) goto error;
9014 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 else if (kind2 > rkind) {
9017 rkind = kind2;
9018 sbuf = _PyUnicode_AsKind(self, rkind);
9019 if (!sbuf) goto error;
9020 srelease = 1;
9021 if (release1) PyMem_Free(buf1);
9022 buf1 = _PyUnicode_AsKind(str1, rkind);
9023 if (!buf1) goto error;
9024 release1 = 1;
9025 }
9026 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9027 PyUnicode_GET_LENGTH(str1))); */
9028 product = n * (len2-len1);
9029 if ((product / (len2-len1)) != n) {
9030 PyErr_SetString(PyExc_OverflowError,
9031 "replace string is too long");
9032 goto error;
9033 }
9034 new_size = slen + product;
9035 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9036 PyErr_SetString(PyExc_OverflowError,
9037 "replace string is too long");
9038 goto error;
9039 }
9040 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9041 if (!res)
9042 goto error;
9043 ires = i = 0;
9044 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009045 while (n-- > 0) {
9046 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 j = anylib_find(rkind,
9048 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9049 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009050 if (j == -1)
9051 break;
9052 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9055 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9056 PyUnicode_KIND_SIZE(rkind, j-i));
9057 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058 }
9059 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (len2 > 0) {
9061 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9062 buf2,
9063 PyUnicode_KIND_SIZE(rkind, len2));
9064 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009069 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9071 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9072 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009073 } else {
9074 /* interleave */
9075 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9077 buf2,
9078 PyUnicode_KIND_SIZE(rkind, len2));
9079 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009080 if (--n <= 0)
9081 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9083 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9084 PyUnicode_KIND_SIZE(rkind, 1));
9085 ires++;
9086 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9089 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9090 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009093 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 if (srelease)
9096 PyMem_FREE(sbuf);
9097 if (release1)
9098 PyMem_FREE(buf1);
9099 if (release2)
9100 PyMem_FREE(buf2);
9101 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009102
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009104 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 if (srelease)
9106 PyMem_FREE(sbuf);
9107 if (release1)
9108 PyMem_FREE(buf1);
9109 if (release2)
9110 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009111 if (PyUnicode_CheckExact(self)) {
9112 Py_INCREF(self);
9113 return (PyObject *) self;
9114 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009115 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 error:
9117 if (srelease && sbuf)
9118 PyMem_FREE(sbuf);
9119 if (release1 && buf1)
9120 PyMem_FREE(buf1);
9121 if (release2 && buf2)
9122 PyMem_FREE(buf2);
9123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124}
9125
9126/* --- Unicode Object Methods --------------------------------------------- */
9127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009128PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130\n\
9131Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009132characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
9134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009135unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 return fixup(self, fixtitle);
9138}
9139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009140PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142\n\
9143Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009144have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
9146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009147unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 return fixup(self, fixcapitalize);
9150}
9151
9152#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009153PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155\n\
9156Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009157normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158
9159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009160unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
9162 PyObject *list;
9163 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009164 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 /* Split into words */
9167 list = split(self, NULL, -1);
9168 if (!list)
9169 return NULL;
9170
9171 /* Capitalize each word */
9172 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9173 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 if (item == NULL)
9176 goto onError;
9177 Py_DECREF(PyList_GET_ITEM(list, i));
9178 PyList_SET_ITEM(list, i, item);
9179 }
9180
9181 /* Join the words to form a new string */
9182 item = PyUnicode_Join(NULL, list);
9183
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 Py_DECREF(list);
9186 return (PyObject *)item;
9187}
9188#endif
9189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009190/* Argument converter. Coerces to a single unicode character */
9191
9192static int
9193convert_uc(PyObject *obj, void *addr)
9194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009197
Benjamin Peterson14339b62009-01-31 16:36:08 +00009198 uniobj = PyUnicode_FromObject(obj);
9199 if (uniobj == NULL) {
9200 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009202 return 0;
9203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009205 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009207 Py_DECREF(uniobj);
9208 return 0;
9209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009211 Py_DECREF(uniobj);
9212 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009213}
9214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009215PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009218Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009219done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220
9221static PyObject *
9222unicode_center(PyUnicodeObject *self, PyObject *args)
9223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009224 Py_ssize_t marg, left;
9225 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 Py_UCS4 fillchar = ' ';
9227
Victor Stinnere9a29352011-10-01 02:14:59 +02009228 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230
Victor Stinnere9a29352011-10-01 02:14:59 +02009231 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 return NULL;
9233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 Py_INCREF(self);
9236 return (PyObject*) self;
9237 }
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 left = marg / 2 + (marg & width & 1);
9241
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009242 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243}
9244
Marc-André Lemburge5034372000-08-08 08:04:29 +00009245#if 0
9246
9247/* This code should go into some future Unicode collation support
9248 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009249 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009250
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009251/* speedy UTF-16 code point order comparison */
9252/* gleaned from: */
9253/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9254
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009255static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009256{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009257 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009258 0, 0, 0, 0, 0, 0, 0, 0,
9259 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009260 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009261};
9262
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263static int
9264unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9265{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009266 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009267
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 Py_UNICODE *s1 = str1->str;
9269 Py_UNICODE *s2 = str2->str;
9270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 len1 = str1->_base._base.length;
9272 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009273
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009275 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009276
9277 c1 = *s1++;
9278 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009279
Benjamin Peterson29060642009-01-31 22:14:21 +00009280 if (c1 > (1<<11) * 26)
9281 c1 += utf16Fixup[c1>>11];
9282 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009283 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009284 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009285
9286 if (c1 != c2)
9287 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009288
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009289 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 }
9291
9292 return (len1 < len2) ? -1 : (len1 != len2);
9293}
9294
Marc-André Lemburge5034372000-08-08 08:04:29 +00009295#else
9296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297/* This function assumes that str1 and str2 are readied by the caller. */
9298
Marc-André Lemburge5034372000-08-08 08:04:29 +00009299static int
9300unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 int kind1, kind2;
9303 void *data1, *data2;
9304 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 kind1 = PyUnicode_KIND(str1);
9307 kind2 = PyUnicode_KIND(str2);
9308 data1 = PyUnicode_DATA(str1);
9309 data2 = PyUnicode_DATA(str2);
9310 len1 = PyUnicode_GET_LENGTH(str1);
9311 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 for (i = 0; i < len1 && i < len2; ++i) {
9314 Py_UCS4 c1, c2;
9315 c1 = PyUnicode_READ(kind1, data1, i);
9316 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009317
9318 if (c1 != c2)
9319 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009320 }
9321
9322 return (len1 < len2) ? -1 : (len1 != len2);
9323}
9324
9325#endif
9326
Alexander Belopolsky40018472011-02-26 01:02:56 +00009327int
9328PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9331 if (PyUnicode_READY(left) == -1 ||
9332 PyUnicode_READY(right) == -1)
9333 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009334 return unicode_compare((PyUnicodeObject *)left,
9335 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009337 PyErr_Format(PyExc_TypeError,
9338 "Can't compare %.100s and %.100s",
9339 left->ob_type->tp_name,
9340 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 return -1;
9342}
9343
Martin v. Löwis5b222132007-06-10 09:51:05 +00009344int
9345PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 Py_ssize_t i;
9348 int kind;
9349 void *data;
9350 Py_UCS4 chr;
9351
Martin v. Löwis5b222132007-06-10 09:51:05 +00009352 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 if (PyUnicode_READY(uni) == -1)
9354 return -1;
9355 kind = PyUnicode_KIND(uni);
9356 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009357 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9359 if (chr != str[i])
9360 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009361 /* This check keeps Python strings that end in '\0' from comparing equal
9362 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009365 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009367 return 0;
9368}
9369
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009370
Benjamin Peterson29060642009-01-31 22:14:21 +00009371#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009372 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009373
Alexander Belopolsky40018472011-02-26 01:02:56 +00009374PyObject *
9375PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009376{
9377 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009378
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009379 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9380 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 if (PyUnicode_READY(left) == -1 ||
9382 PyUnicode_READY(right) == -1)
9383 return NULL;
9384 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9385 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009386 if (op == Py_EQ) {
9387 Py_INCREF(Py_False);
9388 return Py_False;
9389 }
9390 if (op == Py_NE) {
9391 Py_INCREF(Py_True);
9392 return Py_True;
9393 }
9394 }
9395 if (left == right)
9396 result = 0;
9397 else
9398 result = unicode_compare((PyUnicodeObject *)left,
9399 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009400
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009401 /* Convert the return value to a Boolean */
9402 switch (op) {
9403 case Py_EQ:
9404 v = TEST_COND(result == 0);
9405 break;
9406 case Py_NE:
9407 v = TEST_COND(result != 0);
9408 break;
9409 case Py_LE:
9410 v = TEST_COND(result <= 0);
9411 break;
9412 case Py_GE:
9413 v = TEST_COND(result >= 0);
9414 break;
9415 case Py_LT:
9416 v = TEST_COND(result == -1);
9417 break;
9418 case Py_GT:
9419 v = TEST_COND(result == 1);
9420 break;
9421 default:
9422 PyErr_BadArgument();
9423 return NULL;
9424 }
9425 Py_INCREF(v);
9426 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009428
Brian Curtindfc80e32011-08-10 20:28:54 -05009429 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009430}
9431
Alexander Belopolsky40018472011-02-26 01:02:56 +00009432int
9433PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009434{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009435 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 int kind1, kind2, kind;
9437 void *buf1, *buf2;
9438 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009439 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009440
9441 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009442 sub = PyUnicode_FromObject(element);
9443 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 PyErr_Format(PyExc_TypeError,
9445 "'in <string>' requires string as left operand, not %s",
9446 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009447 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (PyUnicode_READY(sub) == -1)
9450 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009451
Thomas Wouters477c8d52006-05-27 19:21:47 +00009452 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009453 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009454 Py_DECREF(sub);
9455 return -1;
9456 }
9457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 kind1 = PyUnicode_KIND(str);
9459 kind2 = PyUnicode_KIND(sub);
9460 kind = kind1 > kind2 ? kind1 : kind2;
9461 buf1 = PyUnicode_DATA(str);
9462 buf2 = PyUnicode_DATA(sub);
9463 if (kind1 != kind)
9464 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9465 if (!buf1) {
9466 Py_DECREF(sub);
9467 return -1;
9468 }
9469 if (kind2 != kind)
9470 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9471 if (!buf2) {
9472 Py_DECREF(sub);
9473 if (kind1 != kind) PyMem_Free(buf1);
9474 return -1;
9475 }
9476 len1 = PyUnicode_GET_LENGTH(str);
9477 len2 = PyUnicode_GET_LENGTH(sub);
9478
9479 switch(kind) {
9480 case PyUnicode_1BYTE_KIND:
9481 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9482 break;
9483 case PyUnicode_2BYTE_KIND:
9484 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9485 break;
9486 case PyUnicode_4BYTE_KIND:
9487 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9488 break;
9489 default:
9490 result = -1;
9491 assert(0);
9492 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009493
9494 Py_DECREF(str);
9495 Py_DECREF(sub);
9496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 if (kind1 != kind)
9498 PyMem_Free(buf1);
9499 if (kind2 != kind)
9500 PyMem_Free(buf2);
9501
Guido van Rossum403d68b2000-03-13 15:55:09 +00009502 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009503}
9504
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505/* Concat to string or Unicode object giving a new Unicode object. */
9506
Alexander Belopolsky40018472011-02-26 01:02:56 +00009507PyObject *
9508PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 PyObject *u = NULL, *v = NULL, *w;
9511 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512
9513 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520
9521 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 }
9530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009532 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 w = PyUnicode_New(
9536 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9537 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009540 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9541 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009542 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009543 v, 0,
9544 PyUnicode_GET_LENGTH(v)) < 0)
9545 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 Py_DECREF(u);
9547 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 Py_XDECREF(u);
9552 Py_XDECREF(v);
9553 return NULL;
9554}
9555
Walter Dörwald1ab83302007-05-18 17:15:44 +00009556void
9557PyUnicode_Append(PyObject **pleft, PyObject *right)
9558{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009559 PyObject *new;
9560 if (*pleft == NULL)
9561 return;
9562 if (right == NULL || !PyUnicode_Check(*pleft)) {
9563 Py_DECREF(*pleft);
9564 *pleft = NULL;
9565 return;
9566 }
9567 new = PyUnicode_Concat(*pleft, right);
9568 Py_DECREF(*pleft);
9569 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009570}
9571
9572void
9573PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9574{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 PyUnicode_Append(pleft, right);
9576 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009577}
9578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009579PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009582Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009583string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009584interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585
9586static PyObject *
9587unicode_count(PyUnicodeObject *self, PyObject *args)
9588{
9589 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009590 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009591 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 int kind1, kind2, kind;
9594 void *buf1, *buf2;
9595 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596
Jesus Ceaac451502011-04-20 17:09:23 +02009597 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9598 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind1 = PyUnicode_KIND(self);
9602 kind2 = PyUnicode_KIND(substring);
9603 kind = kind1 > kind2 ? kind1 : kind2;
9604 buf1 = PyUnicode_DATA(self);
9605 buf2 = PyUnicode_DATA(substring);
9606 if (kind1 != kind)
9607 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9608 if (!buf1) {
9609 Py_DECREF(substring);
9610 return NULL;
9611 }
9612 if (kind2 != kind)
9613 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9614 if (!buf2) {
9615 Py_DECREF(substring);
9616 if (kind1 != kind) PyMem_Free(buf1);
9617 return NULL;
9618 }
9619 len1 = PyUnicode_GET_LENGTH(self);
9620 len2 = PyUnicode_GET_LENGTH(substring);
9621
9622 ADJUST_INDICES(start, end, len1);
9623 switch(kind) {
9624 case PyUnicode_1BYTE_KIND:
9625 iresult = ucs1lib_count(
9626 ((Py_UCS1*)buf1) + start, end - start,
9627 buf2, len2, PY_SSIZE_T_MAX
9628 );
9629 break;
9630 case PyUnicode_2BYTE_KIND:
9631 iresult = ucs2lib_count(
9632 ((Py_UCS2*)buf1) + start, end - start,
9633 buf2, len2, PY_SSIZE_T_MAX
9634 );
9635 break;
9636 case PyUnicode_4BYTE_KIND:
9637 iresult = ucs4lib_count(
9638 ((Py_UCS4*)buf1) + start, end - start,
9639 buf2, len2, PY_SSIZE_T_MAX
9640 );
9641 break;
9642 default:
9643 assert(0); iresult = 0;
9644 }
9645
9646 result = PyLong_FromSsize_t(iresult);
9647
9648 if (kind1 != kind)
9649 PyMem_Free(buf1);
9650 if (kind2 != kind)
9651 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
9653 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009654
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 return result;
9656}
9657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009658PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009659 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009661Encode S using the codec registered for encoding. Default encoding\n\
9662is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009663handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009664a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9665'xmlcharrefreplace' as well as any other name registered with\n\
9666codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667
9668static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009669unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009671 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 char *encoding = NULL;
9673 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009674
Benjamin Peterson308d6372009-09-18 21:42:35 +00009675 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9676 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009678 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009679}
9680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009681PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683\n\
9684Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009685If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
9687static PyObject*
9688unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9689{
9690 Py_UNICODE *e;
9691 Py_UNICODE *p;
9692 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009693 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 PyUnicodeObject *u;
9696 int tabsize = 8;
9697
9698 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9702 return NULL;
9703
Thomas Wouters7e474022000-07-16 12:04:32 +00009704 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009705 i = 0; /* chars up to and including most recent \n or \r */
9706 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9708 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009710 if (tabsize > 0) {
9711 incr = tabsize - (j % tabsize); /* cannot overflow */
9712 if (j > PY_SSIZE_T_MAX - incr)
9713 goto overflow1;
9714 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009715 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 if (j > PY_SSIZE_T_MAX - 1)
9719 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720 j++;
9721 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009722 if (i > PY_SSIZE_T_MAX - j)
9723 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009725 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 }
9727 }
9728
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009729 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009731
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732 /* Second pass: create output string and fill it */
9733 u = _PyUnicode_New(i + j);
9734 if (!u)
9735 return NULL;
9736
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009737 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 q = _PyUnicode_WSTR(u); /* next output char */
9739 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 if (tabsize > 0) {
9744 i = tabsize - (j % tabsize);
9745 j += i;
9746 while (i--) {
9747 if (q >= qe)
9748 goto overflow2;
9749 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009750 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009752 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009753 else {
9754 if (q >= qe)
9755 goto overflow2;
9756 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009757 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 if (*p == '\n' || *p == '\r')
9759 j = 0;
9760 }
9761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 if (PyUnicode_READY(u) == -1) {
9763 Py_DECREF(u);
9764 return NULL;
9765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009767
9768 overflow2:
9769 Py_DECREF(u);
9770 overflow1:
9771 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9772 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773}
9774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009775PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009776 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777\n\
9778Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009779such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780arguments start and end are interpreted as in slice notation.\n\
9781\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009782Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783
9784static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786{
Jesus Ceaac451502011-04-20 17:09:23 +02009787 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009788 Py_ssize_t start;
9789 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009790 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791
Jesus Ceaac451502011-04-20 17:09:23 +02009792 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9793 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 if (PyUnicode_READY(self) == -1)
9797 return NULL;
9798 if (PyUnicode_READY(substring) == -1)
9799 return NULL;
9800
9801 result = any_find_slice(
9802 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9803 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009804 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805
9806 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (result == -2)
9809 return NULL;
9810
Christian Heimes217cfd12007-12-02 14:31:20 +00009811 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812}
9813
9814static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009815unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009817 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9818 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821}
9822
Guido van Rossumc2504932007-09-18 19:42:40 +00009823/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009824 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009825static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009826unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827{
Guido van Rossumc2504932007-09-18 19:42:40 +00009828 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009829 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 if (_PyUnicode_HASH(self) != -1)
9832 return _PyUnicode_HASH(self);
9833 if (PyUnicode_READY(self) == -1)
9834 return -1;
9835 len = PyUnicode_GET_LENGTH(self);
9836
9837 /* The hash function as a macro, gets expanded three times below. */
9838#define HASH(P) \
9839 x = (Py_uhash_t)*P << 7; \
9840 while (--len >= 0) \
9841 x = (1000003*x) ^ (Py_uhash_t)*P++;
9842
9843 switch (PyUnicode_KIND(self)) {
9844 case PyUnicode_1BYTE_KIND: {
9845 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9846 HASH(c);
9847 break;
9848 }
9849 case PyUnicode_2BYTE_KIND: {
9850 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9851 HASH(s);
9852 break;
9853 }
9854 default: {
9855 Py_UCS4 *l;
9856 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9857 "Impossible switch case in unicode_hash");
9858 l = PyUnicode_4BYTE_DATA(self);
9859 HASH(l);
9860 break;
9861 }
9862 }
9863 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9864
Guido van Rossumc2504932007-09-18 19:42:40 +00009865 if (x == -1)
9866 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009872PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
9877static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009880 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009881 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009882 Py_ssize_t start;
9883 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884
Jesus Ceaac451502011-04-20 17:09:23 +02009885 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9886 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (PyUnicode_READY(self) == -1)
9890 return NULL;
9891 if (PyUnicode_READY(substring) == -1)
9892 return NULL;
9893
9894 result = any_find_slice(
9895 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9896 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009897 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898
9899 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (result == -2)
9902 return NULL;
9903
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 if (result < 0) {
9905 PyErr_SetString(PyExc_ValueError, "substring not found");
9906 return NULL;
9907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009908
Christian Heimes217cfd12007-12-02 14:31:20 +00009909 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910}
9911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009912PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009915Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009916at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917
9918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009919unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 Py_ssize_t i, length;
9922 int kind;
9923 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 int cased;
9925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 if (PyUnicode_READY(self) == -1)
9927 return NULL;
9928 length = PyUnicode_GET_LENGTH(self);
9929 kind = PyUnicode_KIND(self);
9930 data = PyUnicode_DATA(self);
9931
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 if (length == 1)
9934 return PyBool_FromLong(
9935 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009937 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009940
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 for (i = 0; i < length; i++) {
9943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009944
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9946 return PyBool_FromLong(0);
9947 else if (!cased && Py_UNICODE_ISLOWER(ch))
9948 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009950 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951}
9952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009953PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009954 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009956Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009957at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958
9959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009960unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 Py_ssize_t i, length;
9963 int kind;
9964 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 int cased;
9966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (PyUnicode_READY(self) == -1)
9968 return NULL;
9969 length = PyUnicode_GET_LENGTH(self);
9970 kind = PyUnicode_KIND(self);
9971 data = PyUnicode_DATA(self);
9972
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 if (length == 1)
9975 return PyBool_FromLong(
9976 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009978 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009981
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 for (i = 0; i < length; i++) {
9984 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009985
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9987 return PyBool_FromLong(0);
9988 else if (!cased && Py_UNICODE_ISUPPER(ch))
9989 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009991 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992}
9993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009994PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009997Return True if S is a titlecased string and there is at least one\n\
9998character in S, i.e. upper- and titlecase characters may only\n\
9999follow uncased characters and lowercase characters only cased ones.\n\
10000Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
10002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010003unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 Py_ssize_t i, length;
10006 int kind;
10007 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 int cased, previous_is_cased;
10009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (PyUnicode_READY(self) == -1)
10011 return NULL;
10012 length = PyUnicode_GET_LENGTH(self);
10013 kind = PyUnicode_KIND(self);
10014 data = PyUnicode_DATA(self);
10015
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (length == 1) {
10018 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10019 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10020 (Py_UNICODE_ISUPPER(ch) != 0));
10021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010023 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010026
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 cased = 0;
10028 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 for (i = 0; i < length; i++) {
10030 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010031
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10033 if (previous_is_cased)
10034 return PyBool_FromLong(0);
10035 previous_is_cased = 1;
10036 cased = 1;
10037 }
10038 else if (Py_UNICODE_ISLOWER(ch)) {
10039 if (!previous_is_cased)
10040 return PyBool_FromLong(0);
10041 previous_is_cased = 1;
10042 cased = 1;
10043 }
10044 else
10045 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010047 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048}
10049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010050PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010053Return True if all characters in S are whitespace\n\
10054and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055
10056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010057unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 Py_ssize_t i, length;
10060 int kind;
10061 void *data;
10062
10063 if (PyUnicode_READY(self) == -1)
10064 return NULL;
10065 length = PyUnicode_GET_LENGTH(self);
10066 kind = PyUnicode_KIND(self);
10067 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (length == 1)
10071 return PyBool_FromLong(
10072 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010074 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 for (i = 0; i < length; i++) {
10079 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010080 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010083 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084}
10085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010086PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010089Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010090and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091
10092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010093unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 Py_ssize_t i, length;
10096 int kind;
10097 void *data;
10098
10099 if (PyUnicode_READY(self) == -1)
10100 return NULL;
10101 length = PyUnicode_GET_LENGTH(self);
10102 kind = PyUnicode_KIND(self);
10103 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (length == 1)
10107 return PyBool_FromLong(
10108 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010109
10110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 for (i = 0; i < length; i++) {
10115 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010118 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010119}
10120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010121PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010122 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010124Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010125and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126
10127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010128unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 int kind;
10131 void *data;
10132 Py_ssize_t len, i;
10133
10134 if (PyUnicode_READY(self) == -1)
10135 return NULL;
10136
10137 kind = PyUnicode_KIND(self);
10138 data = PyUnicode_DATA(self);
10139 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010140
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (len == 1) {
10143 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10144 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10145 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010146
10147 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010149 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 for (i = 0; i < len; i++) {
10152 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010153 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010154 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010155 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010156 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010157}
10158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010162Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010163False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010166unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t i, length;
10169 int kind;
10170 void *data;
10171
10172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
10174 length = PyUnicode_GET_LENGTH(self);
10175 kind = PyUnicode_KIND(self);
10176 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (length == 1)
10180 return PyBool_FromLong(
10181 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010183 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 for (i = 0; i < length; i++) {
10188 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010191 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192}
10193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010194PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010197Return True if all characters in S are digits\n\
10198and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
10200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010201unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 Py_ssize_t i, length;
10204 int kind;
10205 void *data;
10206
10207 if (PyUnicode_READY(self) == -1)
10208 return NULL;
10209 length = PyUnicode_GET_LENGTH(self);
10210 kind = PyUnicode_KIND(self);
10211 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (length == 1) {
10215 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10216 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 for (i = 0; i < length; i++) {
10224 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010227 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010233Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010234False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
10236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010237unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 Py_ssize_t i, length;
10240 int kind;
10241 void *data;
10242
10243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
10245 length = PyUnicode_GET_LENGTH(self);
10246 kind = PyUnicode_KIND(self);
10247 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (length == 1)
10251 return PyBool_FromLong(
10252 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010254 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 for (i = 0; i < length; i++) {
10259 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010262 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263}
10264
Martin v. Löwis47383402007-08-15 07:32:56 +000010265int
10266PyUnicode_IsIdentifier(PyObject *self)
10267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 int kind;
10269 void *data;
10270 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010271 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (PyUnicode_READY(self) == -1) {
10274 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010275 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 }
10277
10278 /* Special case for empty strings */
10279 if (PyUnicode_GET_LENGTH(self) == 0)
10280 return 0;
10281 kind = PyUnicode_KIND(self);
10282 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010283
10284 /* PEP 3131 says that the first character must be in
10285 XID_Start and subsequent characters in XID_Continue,
10286 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010287 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010288 letters, digits, underscore). However, given the current
10289 definition of XID_Start and XID_Continue, it is sufficient
10290 to check just for these, except that _ must be allowed
10291 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010293 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010294 return 0;
10295
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010296 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010299 return 1;
10300}
10301
10302PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010304\n\
10305Return True if S is a valid identifier according\n\
10306to the language definition.");
10307
10308static PyObject*
10309unicode_isidentifier(PyObject *self)
10310{
10311 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10312}
10313
Georg Brandl559e5d72008-06-11 18:37:52 +000010314PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010316\n\
10317Return True if all characters in S are considered\n\
10318printable in repr() or S is empty, False otherwise.");
10319
10320static PyObject*
10321unicode_isprintable(PyObject *self)
10322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 Py_ssize_t i, length;
10324 int kind;
10325 void *data;
10326
10327 if (PyUnicode_READY(self) == -1)
10328 return NULL;
10329 length = PyUnicode_GET_LENGTH(self);
10330 kind = PyUnicode_KIND(self);
10331 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010332
10333 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (length == 1)
10335 return PyBool_FromLong(
10336 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 for (i = 0; i < length; i++) {
10339 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010340 Py_RETURN_FALSE;
10341 }
10342 }
10343 Py_RETURN_TRUE;
10344}
10345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010347 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348\n\
10349Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010350iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
10352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010353unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010355 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
Martin v. Löwis18e16552006-02-15 17:27:45 +000010358static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359unicode_length(PyUnicodeObject *self)
10360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (PyUnicode_READY(self) == -1)
10362 return -1;
10363 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364}
10365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010366PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010369Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010370done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
10372static PyObject *
10373unicode_ljust(PyUnicodeObject *self, PyObject *args)
10374{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010375 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 Py_UCS4 fillchar = ' ';
10377
10378 if (PyUnicode_READY(self) == -1)
10379 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010380
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010381 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return NULL;
10383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 Py_INCREF(self);
10386 return (PyObject*) self;
10387 }
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390}
10391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010393 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010398unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400 return fixup(self, fixlower);
10401}
10402
/* Selector values passed as `striptype` to the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
10411
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412/* externally visible for str.strip(unicode) */
10413PyObject *
10414_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 void *data;
10417 int kind;
10418 Py_ssize_t i, j, len;
10419 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10422 return NULL;
10423
10424 kind = PyUnicode_KIND(self);
10425 data = PyUnicode_DATA(self);
10426 len = PyUnicode_GET_LENGTH(self);
10427 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10428 PyUnicode_DATA(sepobj),
10429 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430
Benjamin Peterson14339b62009-01-31 16:36:08 +000010431 i = 0;
10432 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 while (i < len &&
10434 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 i++;
10436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010437 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010438
Benjamin Peterson14339b62009-01-31 16:36:08 +000010439 j = len;
10440 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010441 do {
10442 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 } while (j >= i &&
10444 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010446 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010447
Victor Stinner12bab6d2011-10-01 01:53:49 +020010448 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449}
10450
10451PyObject*
10452PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10453{
10454 unsigned char *data;
10455 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010456 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457
Victor Stinnerde636f32011-10-01 03:55:54 +020010458 if (PyUnicode_READY(self) == -1)
10459 return NULL;
10460
10461 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10462
Victor Stinner12bab6d2011-10-01 01:53:49 +020010463 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010465 if (PyUnicode_CheckExact(self)) {
10466 Py_INCREF(self);
10467 return self;
10468 }
10469 else
10470 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 }
10472
Victor Stinner12bab6d2011-10-01 01:53:49 +020010473 length = end - start;
10474 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010475 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476
Victor Stinnerde636f32011-10-01 03:55:54 +020010477 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010478 PyErr_SetString(PyExc_IndexError, "string index out of range");
10479 return NULL;
10480 }
10481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 kind = PyUnicode_KIND(self);
10483 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010484 return PyUnicode_FromKindAndData(kind,
10485 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010486 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
10489static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010490do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 int kind;
10493 void *data;
10494 Py_ssize_t len, i, j;
10495
10496 if (PyUnicode_READY(self) == -1)
10497 return NULL;
10498
10499 kind = PyUnicode_KIND(self);
10500 data = PyUnicode_DATA(self);
10501 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010502
Benjamin Peterson14339b62009-01-31 16:36:08 +000010503 i = 0;
10504 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010506 i++;
10507 }
10508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010509
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510 j = len;
10511 if (striptype != LEFTSTRIP) {
10512 do {
10513 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010515 j++;
10516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010517
Victor Stinner12bab6d2011-10-01 01:53:49 +020010518 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519}
10520
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010521
10522static PyObject *
10523do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10528 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 if (sep != NULL && sep != Py_None) {
10531 if (PyUnicode_Check(sep))
10532 return _PyUnicode_XStrip(self, striptype, sep);
10533 else {
10534 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 "%s arg must be None or str",
10536 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010537 return NULL;
10538 }
10539 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010542}
10543
10544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010545PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010547\n\
10548Return a copy of the string S with leading and trailing\n\
10549whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010550If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010551
10552static PyObject *
10553unicode_strip(PyUnicodeObject *self, PyObject *args)
10554{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555 if (PyTuple_GET_SIZE(args) == 0)
10556 return do_strip(self, BOTHSTRIP); /* Common case */
10557 else
10558 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010559}
10560
10561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010562PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010563 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564\n\
10565Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010566If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010567
10568static PyObject *
10569unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 if (PyTuple_GET_SIZE(args) == 0)
10572 return do_strip(self, LEFTSTRIP); /* Common case */
10573 else
10574 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010575}
10576
10577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010578PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010580\n\
10581Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010582If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583
10584static PyObject *
10585unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10586{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 if (PyTuple_GET_SIZE(args) == 0)
10588 return do_strip(self, RIGHTSTRIP); /* Common case */
10589 else
10590 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010591}
10592
10593
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010595unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596{
10597 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Georg Brandl222de0f2009-04-12 12:01:50 +000010600 if (len < 1) {
10601 Py_INCREF(unicode_empty);
10602 return (PyObject *)unicode_empty;
10603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Tim Peters7a29bd52001-09-12 03:03:31 +000010605 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 /* no repeat, return original string */
10607 Py_INCREF(str);
10608 return (PyObject*) str;
10609 }
Tim Peters8f422462000-09-09 06:13:41 +000010610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (PyUnicode_READY(str) == -1)
10612 return NULL;
10613
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010614 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010615 PyErr_SetString(PyExc_OverflowError,
10616 "repeated string is too long");
10617 return NULL;
10618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 if (!u)
10623 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010624 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 if (PyUnicode_GET_LENGTH(str) == 1) {
10627 const int kind = PyUnicode_KIND(str);
10628 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10629 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010630 if (kind == PyUnicode_1BYTE_KIND)
10631 memset(to, (unsigned char)fill_char, len);
10632 else {
10633 for (n = 0; n < len; ++n)
10634 PyUnicode_WRITE(kind, to, n, fill_char);
10635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 }
10637 else {
10638 /* number of characters copied this far */
10639 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10640 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10641 char *to = (char *) PyUnicode_DATA(u);
10642 Py_MEMCPY(to, PyUnicode_DATA(str),
10643 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 n = (done <= nchars-done) ? done : nchars-done;
10646 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 }
10650
10651 return (PyObject*) u;
10652}
10653
Alexander Belopolsky40018472011-02-26 01:02:56 +000010654PyObject *
10655PyUnicode_Replace(PyObject *obj,
10656 PyObject *subobj,
10657 PyObject *replobj,
10658 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659{
10660 PyObject *self;
10661 PyObject *str1;
10662 PyObject *str2;
10663 PyObject *result;
10664
10665 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010666 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010669 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 Py_DECREF(self);
10671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 }
10673 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010674 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 Py_DECREF(self);
10676 Py_DECREF(str1);
10677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680 Py_DECREF(self);
10681 Py_DECREF(str1);
10682 Py_DECREF(str2);
10683 return result;
10684}
10685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010686PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010687 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688\n\
10689Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010690old replaced by new. If the optional argument count is\n\
10691given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
10693static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 PyObject *str1;
10697 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010698 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699 PyObject *result;
10700
Martin v. Löwis18e16552006-02-15 17:27:45 +000010701 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 str1 = PyUnicode_FromObject(str1);
10706 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10707 return NULL;
10708 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010709 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 Py_DECREF(str1);
10711 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
10714 result = replace(self, str1, str2, maxcount);
10715
10716 Py_DECREF(str1);
10717 Py_DECREF(str2);
10718 return result;
10719}
10720
Alexander Belopolsky40018472011-02-26 01:02:56 +000010721static PyObject *
10722unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010724 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 Py_ssize_t isize;
10726 Py_ssize_t osize, squote, dquote, i, o;
10727 Py_UCS4 max, quote;
10728 int ikind, okind;
10729 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010732 return NULL;
10733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 isize = PyUnicode_GET_LENGTH(unicode);
10735 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 /* Compute length of output, quote characters, and
10738 maximum character */
10739 osize = 2; /* quotes */
10740 max = 127;
10741 squote = dquote = 0;
10742 ikind = PyUnicode_KIND(unicode);
10743 for (i = 0; i < isize; i++) {
10744 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10745 switch (ch) {
10746 case '\'': squote++; osize++; break;
10747 case '"': dquote++; osize++; break;
10748 case '\\': case '\t': case '\r': case '\n':
10749 osize += 2; break;
10750 default:
10751 /* Fast-path ASCII */
10752 if (ch < ' ' || ch == 0x7f)
10753 osize += 4; /* \xHH */
10754 else if (ch < 0x7f)
10755 osize++;
10756 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10757 osize++;
10758 max = ch > max ? ch : max;
10759 }
10760 else if (ch < 0x100)
10761 osize += 4; /* \xHH */
10762 else if (ch < 0x10000)
10763 osize += 6; /* \uHHHH */
10764 else
10765 osize += 10; /* \uHHHHHHHH */
10766 }
10767 }
10768
10769 quote = '\'';
10770 if (squote) {
10771 if (dquote)
10772 /* Both squote and dquote present. Use squote,
10773 and escape them */
10774 osize += squote;
10775 else
10776 quote = '"';
10777 }
10778
10779 repr = PyUnicode_New(osize, max);
10780 if (repr == NULL)
10781 return NULL;
10782 okind = PyUnicode_KIND(repr);
10783 odata = PyUnicode_DATA(repr);
10784
10785 PyUnicode_WRITE(okind, odata, 0, quote);
10786 PyUnicode_WRITE(okind, odata, osize-1, quote);
10787
10788 for (i = 0, o = 1; i < isize; i++) {
10789 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010790
10791 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 if ((ch == quote) || (ch == '\\')) {
10793 PyUnicode_WRITE(okind, odata, o++, '\\');
10794 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010795 continue;
10796 }
10797
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010799 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 PyUnicode_WRITE(okind, odata, o++, '\\');
10801 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010802 }
10803 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 PyUnicode_WRITE(okind, odata, o++, '\\');
10805 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010806 }
10807 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 PyUnicode_WRITE(okind, odata, o++, '\\');
10809 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010810 }
10811
10812 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010813 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 PyUnicode_WRITE(okind, odata, o++, '\\');
10815 PyUnicode_WRITE(okind, odata, o++, 'x');
10816 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10817 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010818 }
10819
Georg Brandl559e5d72008-06-11 18:37:52 +000010820 /* Copy ASCII characters as-is */
10821 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010823 }
10824
Benjamin Peterson29060642009-01-31 22:14:21 +000010825 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010828 (categories Z* and C* except ASCII space)
10829 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (ch <= 0xff) {
10833 PyUnicode_WRITE(okind, odata, o++, '\\');
10834 PyUnicode_WRITE(okind, odata, o++, 'x');
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010837 }
10838 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 else if (ch >= 0x10000) {
10840 PyUnicode_WRITE(okind, odata, o++, '\\');
10841 PyUnicode_WRITE(okind, odata, o++, 'U');
10842 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10843 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010850 }
10851 /* Map 16-bit characters to '\uxxxx' */
10852 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 PyUnicode_WRITE(okind, odata, o++, '\\');
10854 PyUnicode_WRITE(okind, odata, o++, 'u');
10855 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10856 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010859 }
10860 }
10861 /* Copy characters as-is */
10862 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010864 }
10865 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010868 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869}
10870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010871PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873\n\
10874Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010875such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876arguments start and end are interpreted as in slice notation.\n\
10877\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010878Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
10880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882{
Jesus Ceaac451502011-04-20 17:09:23 +020010883 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010884 Py_ssize_t start;
10885 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010886 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
Jesus Ceaac451502011-04-20 17:09:23 +020010888 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10889 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (PyUnicode_READY(self) == -1)
10893 return NULL;
10894 if (PyUnicode_READY(substring) == -1)
10895 return NULL;
10896
10897 result = any_find_slice(
10898 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10899 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010900 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
10902 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (result == -2)
10905 return NULL;
10906
Christian Heimes217cfd12007-12-02 14:31:20 +000010907 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908}
10909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914
10915static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917{
Jesus Ceaac451502011-04-20 17:09:23 +020010918 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010919 Py_ssize_t start;
10920 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
Jesus Ceaac451502011-04-20 17:09:23 +020010923 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10924 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (PyUnicode_READY(self) == -1)
10928 return NULL;
10929 if (PyUnicode_READY(substring) == -1)
10930 return NULL;
10931
10932 result = any_find_slice(
10933 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10934 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
10937 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (result == -2)
10940 return NULL;
10941
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942 if (result < 0) {
10943 PyErr_SetString(PyExc_ValueError, "substring not found");
10944 return NULL;
10945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946
Christian Heimes217cfd12007-12-02 14:31:20 +000010947 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948}
10949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010953Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010954done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956static PyObject *
10957unicode_rjust(PyUnicodeObject *self, PyObject *args)
10958{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010959 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 Py_UCS4 fillchar = ' ';
10961
Victor Stinnere9a29352011-10-01 02:14:59 +020010962 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010964
Victor Stinnere9a29352011-10-01 02:14:59 +020010965 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 return NULL;
10967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 Py_INCREF(self);
10970 return (PyObject*) self;
10971 }
10972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974}
10975
Alexander Belopolsky40018472011-02-26 01:02:56 +000010976PyObject *
10977PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978{
10979 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010980
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 s = PyUnicode_FromObject(s);
10982 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010983 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 if (sep != NULL) {
10985 sep = PyUnicode_FromObject(sep);
10986 if (sep == NULL) {
10987 Py_DECREF(s);
10988 return NULL;
10989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 }
10991
10992 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10993
10994 Py_DECREF(s);
10995 Py_XDECREF(sep);
10996 return result;
10997}
10998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010999PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001\n\
11002Return a list of the words in S, using sep as the\n\
11003delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011004splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011005whitespace string is a separator and empty strings are\n\
11006removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008static PyObject*
11009unicode_split(PyUnicodeObject *self, PyObject *args)
11010{
11011 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011012 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
Martin v. Löwis18e16552006-02-15 17:27:45 +000011014 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 return NULL;
11016
11017 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023}
11024
Thomas Wouters477c8d52006-05-27 19:21:47 +000011025PyObject *
11026PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11027{
11028 PyObject* str_obj;
11029 PyObject* sep_obj;
11030 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 int kind1, kind2, kind;
11032 void *buf1 = NULL, *buf2 = NULL;
11033 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011034
11035 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011036 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011038 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040 Py_DECREF(str_obj);
11041 return NULL;
11042 }
11043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 kind1 = PyUnicode_KIND(str_in);
11045 kind2 = PyUnicode_KIND(sep_obj);
11046 kind = kind1 > kind2 ? kind1 : kind2;
11047 buf1 = PyUnicode_DATA(str_in);
11048 if (kind1 != kind)
11049 buf1 = _PyUnicode_AsKind(str_in, kind);
11050 if (!buf1)
11051 goto onError;
11052 buf2 = PyUnicode_DATA(sep_obj);
11053 if (kind2 != kind)
11054 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11055 if (!buf2)
11056 goto onError;
11057 len1 = PyUnicode_GET_LENGTH(str_obj);
11058 len2 = PyUnicode_GET_LENGTH(sep_obj);
11059
11060 switch(PyUnicode_KIND(str_in)) {
11061 case PyUnicode_1BYTE_KIND:
11062 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11063 break;
11064 case PyUnicode_2BYTE_KIND:
11065 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11066 break;
11067 case PyUnicode_4BYTE_KIND:
11068 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11069 break;
11070 default:
11071 assert(0);
11072 out = 0;
11073 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011074
11075 Py_DECREF(sep_obj);
11076 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (kind1 != kind)
11078 PyMem_Free(buf1);
11079 if (kind2 != kind)
11080 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081
11082 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 onError:
11084 Py_DECREF(sep_obj);
11085 Py_DECREF(str_obj);
11086 if (kind1 != kind && buf1)
11087 PyMem_Free(buf1);
11088 if (kind2 != kind && buf2)
11089 PyMem_Free(buf2);
11090 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011091}
11092
11093
11094PyObject *
11095PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11096{
11097 PyObject* str_obj;
11098 PyObject* sep_obj;
11099 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 int kind1, kind2, kind;
11101 void *buf1 = NULL, *buf2 = NULL;
11102 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011103
11104 str_obj = PyUnicode_FromObject(str_in);
11105 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011107 sep_obj = PyUnicode_FromObject(sep_in);
11108 if (!sep_obj) {
11109 Py_DECREF(str_obj);
11110 return NULL;
11111 }
11112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 kind1 = PyUnicode_KIND(str_in);
11114 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011115 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 buf1 = PyUnicode_DATA(str_in);
11117 if (kind1 != kind)
11118 buf1 = _PyUnicode_AsKind(str_in, kind);
11119 if (!buf1)
11120 goto onError;
11121 buf2 = PyUnicode_DATA(sep_obj);
11122 if (kind2 != kind)
11123 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11124 if (!buf2)
11125 goto onError;
11126 len1 = PyUnicode_GET_LENGTH(str_obj);
11127 len2 = PyUnicode_GET_LENGTH(sep_obj);
11128
11129 switch(PyUnicode_KIND(str_in)) {
11130 case PyUnicode_1BYTE_KIND:
11131 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11132 break;
11133 case PyUnicode_2BYTE_KIND:
11134 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11135 break;
11136 case PyUnicode_4BYTE_KIND:
11137 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11138 break;
11139 default:
11140 assert(0);
11141 out = 0;
11142 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143
11144 Py_DECREF(sep_obj);
11145 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (kind1 != kind)
11147 PyMem_Free(buf1);
11148 if (kind2 != kind)
11149 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150
11151 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 onError:
11153 Py_DECREF(sep_obj);
11154 Py_DECREF(str_obj);
11155 if (kind1 != kind && buf1)
11156 PyMem_Free(buf1);
11157 if (kind2 != kind && buf2)
11158 PyMem_Free(buf2);
11159 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011160}
11161
11162PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011164\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011165Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011167found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168
11169static PyObject*
11170unicode_partition(PyUnicodeObject *self, PyObject *separator)
11171{
11172 return PyUnicode_Partition((PyObject *)self, separator);
11173}
11174
11175PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011176 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011177\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011178Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011180separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011181
11182static PyObject*
11183unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11184{
11185 return PyUnicode_RPartition((PyObject *)self, separator);
11186}
11187
Alexander Belopolsky40018472011-02-26 01:02:56 +000011188PyObject *
11189PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011190{
11191 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011192
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011193 s = PyUnicode_FromObject(s);
11194 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011195 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011196 if (sep != NULL) {
11197 sep = PyUnicode_FromObject(sep);
11198 if (sep == NULL) {
11199 Py_DECREF(s);
11200 return NULL;
11201 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011202 }
11203
11204 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11205
11206 Py_DECREF(s);
11207 Py_XDECREF(sep);
11208 return result;
11209}
11210
11211PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011213\n\
11214Return a list of the words in S, using sep as the\n\
11215delimiter string, starting at the end of the string and\n\
11216working to the front. If maxsplit is given, at most maxsplit\n\
11217splits are done. If sep is not specified, any whitespace string\n\
11218is a separator.");
11219
11220static PyObject*
11221unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11222{
11223 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011224 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011225
Martin v. Löwis18e16552006-02-15 17:27:45 +000011226 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011227 return NULL;
11228
11229 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011231 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011233 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011235}
11236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239\n\
11240Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011241Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
11244static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011245unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011247 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011248 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011250 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11251 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 return NULL;
11253
Guido van Rossum86662912000-04-11 15:38:46 +000011254 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
11257static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011258PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259{
Walter Dörwald346737f2007-05-31 10:44:43 +000011260 if (PyUnicode_CheckExact(self)) {
11261 Py_INCREF(self);
11262 return self;
11263 } else
11264 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011265 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266}
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
11271Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
11274static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011275unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 return fixup(self, fixswapcase);
11278}
11279
Georg Brandlceee0772007-11-27 23:48:05 +000011280PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011282\n\
11283Return a translation table usable for str.translate().\n\
11284If there is only one argument, it must be a dictionary mapping Unicode\n\
11285ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011286Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011287If there are two arguments, they must be strings of equal length, and\n\
11288in the resulting dictionary, each character in x will be mapped to the\n\
11289character at the same position in y. If there is a third argument, it\n\
11290must be a string, whose characters will be mapped to None in the result.");
11291
11292static PyObject*
11293unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11294{
11295 PyObject *x, *y = NULL, *z = NULL;
11296 PyObject *new = NULL, *key, *value;
11297 Py_ssize_t i = 0;
11298 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011299
Georg Brandlceee0772007-11-27 23:48:05 +000011300 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11301 return NULL;
11302 new = PyDict_New();
11303 if (!new)
11304 return NULL;
11305 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 int x_kind, y_kind, z_kind;
11307 void *x_data, *y_data, *z_data;
11308
Georg Brandlceee0772007-11-27 23:48:05 +000011309 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011310 if (!PyUnicode_Check(x)) {
11311 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11312 "be a string if there is a second argument");
11313 goto err;
11314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011316 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11317 "arguments must have equal length");
11318 goto err;
11319 }
11320 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 x_kind = PyUnicode_KIND(x);
11322 y_kind = PyUnicode_KIND(y);
11323 x_data = PyUnicode_DATA(x);
11324 y_data = PyUnicode_DATA(y);
11325 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11326 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11327 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011328 if (!key || !value)
11329 goto err;
11330 res = PyDict_SetItem(new, key, value);
11331 Py_DECREF(key);
11332 Py_DECREF(value);
11333 if (res < 0)
11334 goto err;
11335 }
11336 /* create entries for deleting chars in z */
11337 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 z_kind = PyUnicode_KIND(z);
11339 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011340 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011342 if (!key)
11343 goto err;
11344 res = PyDict_SetItem(new, key, Py_None);
11345 Py_DECREF(key);
11346 if (res < 0)
11347 goto err;
11348 }
11349 }
11350 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 int kind;
11352 void *data;
11353
Georg Brandlceee0772007-11-27 23:48:05 +000011354 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011355 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011356 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11357 "to maketrans it must be a dict");
11358 goto err;
11359 }
11360 /* copy entries into the new dict, converting string keys to int keys */
11361 while (PyDict_Next(x, &i, &key, &value)) {
11362 if (PyUnicode_Check(key)) {
11363 /* convert string keys to integer keys */
11364 PyObject *newkey;
11365 if (PyUnicode_GET_SIZE(key) != 1) {
11366 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11367 "table must be of length 1");
11368 goto err;
11369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 kind = PyUnicode_KIND(key);
11371 data = PyUnicode_DATA(key);
11372 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011373 if (!newkey)
11374 goto err;
11375 res = PyDict_SetItem(new, newkey, value);
11376 Py_DECREF(newkey);
11377 if (res < 0)
11378 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011379 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011380 /* just keep integer keys */
11381 if (PyDict_SetItem(new, key, value) < 0)
11382 goto err;
11383 } else {
11384 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11385 "be strings or integers");
11386 goto err;
11387 }
11388 }
11389 }
11390 return new;
11391 err:
11392 Py_DECREF(new);
11393 return NULL;
11394}
11395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011396PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398\n\
11399Return a copy of the string S, where all characters have been mapped\n\
11400through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011401Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011402Unmapped characters are left untouched. Characters mapped to None\n\
11403are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
11405static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409}
11410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011417unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419 return fixup(self, fixupper);
11420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011425Pad a numeric string S with zeros on the left, to fill a field\n\
11426of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
11428static PyObject *
11429unicode_zfill(PyUnicodeObject *self, PyObject *args)
11430{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011431 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011433 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 int kind;
11435 void *data;
11436 Py_UCS4 chr;
11437
11438 if (PyUnicode_READY(self) == -1)
11439 return NULL;
11440
Martin v. Löwis18e16552006-02-15 17:27:45 +000011441 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 return NULL;
11443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011445 if (PyUnicode_CheckExact(self)) {
11446 Py_INCREF(self);
11447 return (PyObject*) self;
11448 }
11449 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011450 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
11452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
11455 u = pad(self, fill, 0, '0');
11456
Walter Dörwald068325e2002-04-15 13:36:47 +000011457 if (u == NULL)
11458 return NULL;
11459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 kind = PyUnicode_KIND(u);
11461 data = PyUnicode_DATA(u);
11462 chr = PyUnicode_READ(kind, data, fill);
11463
11464 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 PyUnicode_WRITE(kind, data, 0, chr);
11467 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 }
11469
11470 return (PyObject*) u;
11471}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
#if 0
/* Compiled out by the surrounding "#if 0": thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011484Return True if S starts with the specified prefix, False otherwise.\n\
11485With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011486With optional end, stop comparing S at that position.\n\
11487prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject *
11490unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011493 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011495 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011496 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011497 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Jesus Ceaac451502011-04-20 17:09:23 +020011499 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011501 if (PyTuple_Check(subobj)) {
11502 Py_ssize_t i;
11503 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11504 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011506 if (substring == NULL)
11507 return NULL;
11508 result = tailmatch(self, substring, start, end, -1);
11509 Py_DECREF(substring);
11510 if (result) {
11511 Py_RETURN_TRUE;
11512 }
11513 }
11514 /* nothing matched */
11515 Py_RETURN_FALSE;
11516 }
11517 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011518 if (substring == NULL) {
11519 if (PyErr_ExceptionMatches(PyExc_TypeError))
11520 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11521 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011523 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011524 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011526 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527}
11528
11529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011530PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011533Return True if S ends with the specified suffix, False otherwise.\n\
11534With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011535With optional end, stop comparing S at that position.\n\
11536suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject *
11539unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011544 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011545 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011546 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
Jesus Ceaac451502011-04-20 17:09:23 +020011548 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011550 if (PyTuple_Check(subobj)) {
11551 Py_ssize_t i;
11552 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11553 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011555 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011557 result = tailmatch(self, substring, start, end, +1);
11558 Py_DECREF(substring);
11559 if (result) {
11560 Py_RETURN_TRUE;
11561 }
11562 }
11563 Py_RETURN_FALSE;
11564 }
11565 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011566 if (substring == NULL) {
11567 if (PyErr_ExceptionMatches(PyExc_TypeError))
11568 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11569 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011571 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011572 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011574 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575}
11576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011578
11579PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011581\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011582Return a formatted version of S, using substitutions from args and kwargs.\n\
11583The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011584
Eric Smith27bbca62010-11-04 17:06:58 +000011585PyDoc_STRVAR(format_map__doc__,
11586 "S.format_map(mapping) -> str\n\
11587\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011588Return a formatted version of S, using substitutions from mapping.\n\
11589The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011590
Eric Smith4a7d76d2008-05-30 18:10:19 +000011591static PyObject *
11592unicode__format__(PyObject* self, PyObject* args)
11593{
11594 PyObject *format_spec;
11595
11596 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11597 return NULL;
11598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11600 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011601}
11602
Eric Smith8c663262007-08-25 02:26:07 +000011603PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011605\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011606Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011607
11608static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011609unicode__sizeof__(PyUnicodeObject *v)
11610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 Py_ssize_t size;
11612
11613 /* If it's a compact object, account for base structure +
11614 character data. */
11615 if (PyUnicode_IS_COMPACT_ASCII(v))
11616 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11617 else if (PyUnicode_IS_COMPACT(v))
11618 size = sizeof(PyCompactUnicodeObject) +
11619 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11620 else {
11621 /* If it is a two-block object, account for base object, and
11622 for character block if present. */
11623 size = sizeof(PyUnicodeObject);
11624 if (v->data.any)
11625 size += (PyUnicode_GET_LENGTH(v) + 1) *
11626 PyUnicode_CHARACTER_SIZE(v);
11627 }
11628 /* If the wstr pointer is present, account for it unless it is shared
11629 with the data pointer. Since PyUnicode_DATA will crash if the object
11630 is not ready, check whether it's either not ready (in which case the
11631 data is entirely in wstr) or if the data is not shared. */
11632 if (_PyUnicode_WSTR(v) &&
11633 (!PyUnicode_IS_READY(v) ||
11634 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11635 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011636 if (!PyUnicode_IS_COMPACT_ASCII(v)
11637 && _PyUnicode_UTF8(v)
11638 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11639 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640
11641 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011642}
11643
11644PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011646
11647static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011648unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011649{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011650 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (!copy)
11652 return NULL;
11653 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011654}
11655
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656static PyMethodDef unicode_methods[] = {
11657
11658 /* Order is according to common usage: often used methods should
11659 appear first, since lookup is done sequentially. */
11660
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011661 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011662 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11663 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011664 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011665 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11666 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11667 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11668 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11669 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11670 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11671 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011672 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011673 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11674 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11675 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011677 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11678 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11679 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011681 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011682 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011684 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11685 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11686 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11687 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11688 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11689 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11690 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11691 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11692 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11693 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11694 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11695 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11696 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11697 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011698 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011699 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011700 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011701 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011702 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011703 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011704 {"maketrans", (PyCFunction) unicode_maketrans,
11705 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011706 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011707#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011708 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709#endif
11710
11711#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011712 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011713 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714#endif
11715
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 {NULL, NULL}
11718};
11719
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011720static PyObject *
11721unicode_mod(PyObject *v, PyObject *w)
11722{
Brian Curtindfc80e32011-08-10 20:28:54 -050011723 if (!PyUnicode_Check(v))
11724 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011726}
11727
11728static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011729 0, /*nb_add*/
11730 0, /*nb_subtract*/
11731 0, /*nb_multiply*/
11732 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011733};
11734
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 (lenfunc) unicode_length, /* sq_length */
11737 PyUnicode_Concat, /* sq_concat */
11738 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11739 (ssizeargfunc) unicode_getitem, /* sq_item */
11740 0, /* sq_slice */
11741 0, /* sq_ass_item */
11742 0, /* sq_ass_slice */
11743 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744};
11745
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011746static PyObject*
11747unicode_subscript(PyUnicodeObject* self, PyObject* item)
11748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (PyUnicode_READY(self) == -1)
11750 return NULL;
11751
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011752 if (PyIndex_Check(item)) {
11753 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011754 if (i == -1 && PyErr_Occurred())
11755 return NULL;
11756 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011758 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011759 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011760 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011762 Py_UNICODE* result_buf;
11763 PyObject* result;
11764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011767 return NULL;
11768 }
11769
11770 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 return PyUnicode_New(0, 0);
11772 } else if (start == 0 && step == 1 &&
11773 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011774 PyUnicode_CheckExact(self)) {
11775 Py_INCREF(self);
11776 return (PyObject *)self;
11777 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011778 return PyUnicode_Substring((PyObject*)self,
11779 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011780 } else {
11781 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011782 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11783 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011784
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 if (result_buf == NULL)
11786 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011787
11788 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11789 result_buf[i] = source_buf[cur];
11790 }
Tim Petersced69f82003-09-16 20:30:58 +000011791
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011792 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011793 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011794 return result;
11795 }
11796 } else {
11797 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11798 return NULL;
11799 }
11800}
11801
11802static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 (lenfunc)unicode_length, /* mp_length */
11804 (binaryfunc)unicode_subscript, /* mp_subscript */
11805 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011806};
11807
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809/* Helpers for PyUnicode_Format() */
11810
11811static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011812getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011814 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 (*p_argidx)++;
11817 if (arglen < 0)
11818 return args;
11819 else
11820 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 }
11822 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 return NULL;
11825}
11826
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011827/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011829static PyObject *
11830formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011832 char *p;
11833 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 x = PyFloat_AsDouble(v);
11837 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011838 return NULL;
11839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011842
Eric Smith0923d1d2009-04-16 20:16:10 +000011843 p = PyOS_double_to_string(x, type, prec,
11844 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011845 if (p == NULL)
11846 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011848 PyMem_Free(p);
11849 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850}
11851
Tim Peters38fd5b62000-09-21 05:43:11 +000011852static PyObject*
11853formatlong(PyObject *val, int flags, int prec, int type)
11854{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011855 char *buf;
11856 int len;
11857 PyObject *str; /* temporary string object. */
11858 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011859
Benjamin Peterson14339b62009-01-31 16:36:08 +000011860 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11861 if (!str)
11862 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011864 Py_DECREF(str);
11865 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011866}
11867
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011870 size_t buflen,
11871 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011873 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011874 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (PyUnicode_GET_LENGTH(v) == 1) {
11876 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 buf[1] = '\0';
11878 return 1;
11879 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 goto onError;
11881 }
11882 else {
11883 /* Integer input truncated to a character */
11884 long x;
11885 x = PyLong_AsLong(v);
11886 if (x == -1 && PyErr_Occurred())
11887 goto onError;
11888
11889 if (x < 0 || x > 0x10ffff) {
11890 PyErr_SetString(PyExc_OverflowError,
11891 "%c arg not in range(0x110000)");
11892 return -1;
11893 }
11894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011896 buf[1] = '\0';
11897 return 1;
11898 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011899
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011901 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011903 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011906/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011907 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011908*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011909#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011910
Alexander Belopolsky40018472011-02-26 01:02:56 +000011911PyObject *
11912PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 void *fmt;
11915 int fmtkind;
11916 PyObject *result;
11917 Py_UCS4 *res, *res0;
11918 Py_UCS4 max;
11919 int kind;
11920 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011924
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 PyErr_BadInternalCall();
11927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11930 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 fmt = PyUnicode_DATA(uformat);
11933 fmtkind = PyUnicode_KIND(uformat);
11934 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11935 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936
11937 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11939 if (res0 == NULL) {
11940 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943
11944 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 arglen = PyTuple_Size(args);
11946 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 }
11948 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 arglen = -1;
11950 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011952 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011953 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
11956 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 if (--rescnt < 0) {
11959 rescnt = fmtcnt + 100;
11960 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11962 if (res0 == NULL){
11963 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 }
11966 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011970 }
11971 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 /* Got a format specifier */
11973 int flags = 0;
11974 Py_ssize_t width = -1;
11975 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 Py_UCS4 c = '\0';
11977 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 int isnumok;
11979 PyObject *v = NULL;
11980 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 void *pbuf;
11982 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 Py_ssize_t len, len1;
11985 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 fmtpos++;
11988 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11989 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 Py_ssize_t keylen;
11991 PyObject *key;
11992 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011993
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 if (dict == NULL) {
11995 PyErr_SetString(PyExc_TypeError,
11996 "format requires a mapping");
11997 goto onError;
11998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 /* Skip over balanced parentheses */
12003 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 if (fmtcnt < 0 || pcount > 0) {
12012 PyErr_SetString(PyExc_ValueError,
12013 "incomplete format key");
12014 goto onError;
12015 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012016 key = PyUnicode_Substring((PyObject*)uformat,
12017 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 if (key == NULL)
12019 goto onError;
12020 if (args_owned) {
12021 Py_DECREF(args);
12022 args_owned = 0;
12023 }
12024 args = PyObject_GetItem(dict, key);
12025 Py_DECREF(key);
12026 if (args == NULL) {
12027 goto onError;
12028 }
12029 args_owned = 1;
12030 arglen = -1;
12031 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 case '-': flags |= F_LJUST; continue;
12036 case '+': flags |= F_SIGN; continue;
12037 case ' ': flags |= F_BLANK; continue;
12038 case '#': flags |= F_ALT; continue;
12039 case '0': flags |= F_ZERO; continue;
12040 }
12041 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012042 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 if (c == '*') {
12044 v = getnextarg(args, arglen, &argidx);
12045 if (v == NULL)
12046 goto onError;
12047 if (!PyLong_Check(v)) {
12048 PyErr_SetString(PyExc_TypeError,
12049 "* wants int");
12050 goto onError;
12051 }
12052 width = PyLong_AsLong(v);
12053 if (width == -1 && PyErr_Occurred())
12054 goto onError;
12055 if (width < 0) {
12056 flags |= F_LJUST;
12057 width = -width;
12058 }
12059 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 }
12062 else if (c >= '0' && c <= '9') {
12063 width = c - '0';
12064 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 if (c < '0' || c > '9')
12067 break;
12068 if ((width*10) / 10 != width) {
12069 PyErr_SetString(PyExc_ValueError,
12070 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012071 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 }
12073 width = width*10 + (c - '0');
12074 }
12075 }
12076 if (c == '.') {
12077 prec = 0;
12078 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 if (c == '*') {
12081 v = getnextarg(args, arglen, &argidx);
12082 if (v == NULL)
12083 goto onError;
12084 if (!PyLong_Check(v)) {
12085 PyErr_SetString(PyExc_TypeError,
12086 "* wants int");
12087 goto onError;
12088 }
12089 prec = PyLong_AsLong(v);
12090 if (prec == -1 && PyErr_Occurred())
12091 goto onError;
12092 if (prec < 0)
12093 prec = 0;
12094 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 }
12097 else if (c >= '0' && c <= '9') {
12098 prec = c - '0';
12099 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 if (c < '0' || c > '9')
12102 break;
12103 if ((prec*10) / 10 != prec) {
12104 PyErr_SetString(PyExc_ValueError,
12105 "prec too big");
12106 goto onError;
12107 }
12108 prec = prec*10 + (c - '0');
12109 }
12110 }
12111 } /* prec */
12112 if (fmtcnt >= 0) {
12113 if (c == 'h' || c == 'l' || c == 'L') {
12114 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 }
12117 }
12118 if (fmtcnt < 0) {
12119 PyErr_SetString(PyExc_ValueError,
12120 "incomplete format");
12121 goto onError;
12122 }
12123 if (c != '%') {
12124 v = getnextarg(args, arglen, &argidx);
12125 if (v == NULL)
12126 goto onError;
12127 }
12128 sign = 0;
12129 fill = ' ';
12130 switch (c) {
12131
12132 case '%':
12133 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 len = 1;
12138 break;
12139
12140 case 's':
12141 case 'r':
12142 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012143 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 temp = v;
12145 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 }
12147 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 if (c == 's')
12149 temp = PyObject_Str(v);
12150 else if (c == 'r')
12151 temp = PyObject_Repr(v);
12152 else
12153 temp = PyObject_ASCII(v);
12154 if (temp == NULL)
12155 goto onError;
12156 if (PyUnicode_Check(temp))
12157 /* nothing to do */;
12158 else {
12159 Py_DECREF(temp);
12160 PyErr_SetString(PyExc_TypeError,
12161 "%s argument has non-string str()");
12162 goto onError;
12163 }
12164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (PyUnicode_READY(temp) == -1) {
12166 Py_CLEAR(temp);
12167 goto onError;
12168 }
12169 pbuf = PyUnicode_DATA(temp);
12170 kind = PyUnicode_KIND(temp);
12171 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 if (prec >= 0 && len > prec)
12173 len = prec;
12174 break;
12175
12176 case 'i':
12177 case 'd':
12178 case 'u':
12179 case 'o':
12180 case 'x':
12181 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 isnumok = 0;
12183 if (PyNumber_Check(v)) {
12184 PyObject *iobj=NULL;
12185
12186 if (PyLong_Check(v)) {
12187 iobj = v;
12188 Py_INCREF(iobj);
12189 }
12190 else {
12191 iobj = PyNumber_Long(v);
12192 }
12193 if (iobj!=NULL) {
12194 if (PyLong_Check(iobj)) {
12195 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012196 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 Py_DECREF(iobj);
12198 if (!temp)
12199 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if (PyUnicode_READY(temp) == -1) {
12201 Py_CLEAR(temp);
12202 goto onError;
12203 }
12204 pbuf = PyUnicode_DATA(temp);
12205 kind = PyUnicode_KIND(temp);
12206 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 sign = 1;
12208 }
12209 else {
12210 Py_DECREF(iobj);
12211 }
12212 }
12213 }
12214 if (!isnumok) {
12215 PyErr_Format(PyExc_TypeError,
12216 "%%%c format: a number is required, "
12217 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12218 goto onError;
12219 }
12220 if (flags & F_ZERO)
12221 fill = '0';
12222 break;
12223
12224 case 'e':
12225 case 'E':
12226 case 'f':
12227 case 'F':
12228 case 'g':
12229 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012230 temp = formatfloat(v, flags, prec, c);
12231 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 if (PyUnicode_READY(temp) == -1) {
12234 Py_CLEAR(temp);
12235 goto onError;
12236 }
12237 pbuf = PyUnicode_DATA(temp);
12238 kind = PyUnicode_KIND(temp);
12239 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 sign = 1;
12241 if (flags & F_ZERO)
12242 fill = '0';
12243 break;
12244
12245 case 'c':
12246 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012248 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 if (len < 0)
12250 goto onError;
12251 break;
12252
12253 default:
12254 PyErr_Format(PyExc_ValueError,
12255 "unsupported format character '%c' (0x%x) "
12256 "at index %zd",
12257 (31<=c && c<=126) ? (char)c : '?',
12258 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 goto onError;
12261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 /* pbuf is initialized here. */
12263 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12266 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12267 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012268 len--;
12269 }
12270 else if (flags & F_SIGN)
12271 sign = '+';
12272 else if (flags & F_BLANK)
12273 sign = ' ';
12274 else
12275 sign = 0;
12276 }
12277 if (width < len)
12278 width = len;
12279 if (rescnt - (sign != 0) < width) {
12280 reslen -= rescnt;
12281 rescnt = width + fmtcnt + 100;
12282 reslen += rescnt;
12283 if (reslen < 0) {
12284 Py_XDECREF(temp);
12285 PyErr_NoMemory();
12286 goto onError;
12287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12289 if (res0 == 0) {
12290 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 Py_XDECREF(temp);
12292 goto onError;
12293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 }
12296 if (sign) {
12297 if (fill != ' ')
12298 *res++ = sign;
12299 rescnt--;
12300 if (width > len)
12301 width--;
12302 }
12303 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12305 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12308 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 }
12310 rescnt -= 2;
12311 width -= 2;
12312 if (width < 0)
12313 width = 0;
12314 len -= 2;
12315 }
12316 if (width > len && !(flags & F_LJUST)) {
12317 do {
12318 --rescnt;
12319 *res++ = fill;
12320 } while (--width > len);
12321 }
12322 if (fill == ' ') {
12323 if (sign)
12324 *res++ = sign;
12325 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12327 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12328 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12329 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 }
12331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 /* Copy all characters, preserving len */
12333 len1 = len;
12334 while (len1--) {
12335 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12336 rescnt--;
12337 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 while (--width >= len) {
12339 --rescnt;
12340 *res++ = ' ';
12341 }
12342 if (dict && (argidx < arglen) && c != '%') {
12343 PyErr_SetString(PyExc_TypeError,
12344 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012345 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 goto onError;
12347 }
12348 Py_XDECREF(temp);
12349 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350 } /* until end */
12351 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 PyErr_SetString(PyExc_TypeError,
12353 "not all arguments converted during string formatting");
12354 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 }
12356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357
12358 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12359 if (*res > max)
12360 max = *res;
12361 result = PyUnicode_New(reslen - rescnt, max);
12362 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 kind = PyUnicode_KIND(result);
12365 for (res = res0; res < res0+reslen-rescnt; res++)
12366 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12367 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370 }
12371 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372 return (PyObject *)result;
12373
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 Py_DECREF(uformat);
12377 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 }
12380 return NULL;
12381}
12382
Jeremy Hylton938ace62002-07-17 16:30:39 +000012383static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012384unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12385
Tim Peters6d6c1a32001-08-02 04:15:00 +000012386static PyObject *
12387unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12388{
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012390 static char *kwlist[] = {"object", "encoding", "errors", 0};
12391 char *encoding = NULL;
12392 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012393
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 if (type != &PyUnicode_Type)
12395 return unicode_subtype_new(type, args, kwds);
12396 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 return NULL;
12399 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 if (encoding == NULL && errors == NULL)
12402 return PyObject_Str(x);
12403 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012405}
12406
Guido van Rossume023fe02001-08-30 03:12:59 +000012407static PyObject *
12408unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12409{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012410 PyUnicodeObject *unicode, *self;
12411 Py_ssize_t length, char_size;
12412 int share_wstr, share_utf8;
12413 unsigned int kind;
12414 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012415
Benjamin Peterson14339b62009-01-31 16:36:08 +000012416 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012417
12418 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12419 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012420 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012421 assert(PyUnicode_Check(unicode));
12422 if (PyUnicode_READY(unicode))
12423 return NULL;
12424
12425 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12426 if (self == NULL) {
12427 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 return NULL;
12429 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012430 kind = PyUnicode_KIND(unicode);
12431 length = PyUnicode_GET_LENGTH(unicode);
12432
12433 _PyUnicode_LENGTH(self) = length;
12434 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12435 _PyUnicode_STATE(self).interned = 0;
12436 _PyUnicode_STATE(self).kind = kind;
12437 _PyUnicode_STATE(self).compact = 0;
12438 _PyUnicode_STATE(self).ascii = 0;
12439 _PyUnicode_STATE(self).ready = 1;
12440 _PyUnicode_WSTR(self) = NULL;
12441 _PyUnicode_UTF8_LENGTH(self) = 0;
12442 _PyUnicode_UTF8(self) = NULL;
12443 _PyUnicode_WSTR_LENGTH(self) = 0;
12444 self->data.any = NULL;
12445
12446 share_utf8 = 0;
12447 share_wstr = 0;
12448 if (kind == PyUnicode_1BYTE_KIND) {
12449 char_size = 1;
12450 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12451 share_utf8 = 1;
12452 }
12453 else if (kind == PyUnicode_2BYTE_KIND) {
12454 char_size = 2;
12455 if (sizeof(wchar_t) == 2)
12456 share_wstr = 1;
12457 }
12458 else {
12459 assert(kind == PyUnicode_4BYTE_KIND);
12460 char_size = 4;
12461 if (sizeof(wchar_t) == 4)
12462 share_wstr = 1;
12463 }
12464
12465 /* Ensure we won't overflow the length. */
12466 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12467 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012470 data = PyObject_MALLOC((length + 1) * char_size);
12471 if (data == NULL) {
12472 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 goto onError;
12474 }
12475
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012476 self->data.any = data;
12477 if (share_utf8) {
12478 _PyUnicode_UTF8_LENGTH(self) = length;
12479 _PyUnicode_UTF8(self) = data;
12480 }
12481 if (share_wstr) {
12482 _PyUnicode_WSTR_LENGTH(self) = length;
12483 _PyUnicode_WSTR(self) = (wchar_t *)data;
12484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012486 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12487 PyUnicode_KIND_SIZE(kind, length + 1));
12488 Py_DECREF(unicode);
12489 return (PyObject *)self;
12490
12491onError:
12492 Py_DECREF(unicode);
12493 Py_DECREF(self);
12494 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012495}
12496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012499\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012500Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012501encoding defaults to the current default string encoding.\n\
12502errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012503
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012504static PyObject *unicode_iter(PyObject *seq);
12505
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012507 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 "str", /* tp_name */
12509 sizeof(PyUnicodeObject), /* tp_size */
12510 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012512 (destructor)unicode_dealloc, /* tp_dealloc */
12513 0, /* tp_print */
12514 0, /* tp_getattr */
12515 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012516 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 unicode_repr, /* tp_repr */
12518 &unicode_as_number, /* tp_as_number */
12519 &unicode_as_sequence, /* tp_as_sequence */
12520 &unicode_as_mapping, /* tp_as_mapping */
12521 (hashfunc) unicode_hash, /* tp_hash*/
12522 0, /* tp_call*/
12523 (reprfunc) unicode_str, /* tp_str */
12524 PyObject_GenericGetAttr, /* tp_getattro */
12525 0, /* tp_setattro */
12526 0, /* tp_as_buffer */
12527 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012529 unicode_doc, /* tp_doc */
12530 0, /* tp_traverse */
12531 0, /* tp_clear */
12532 PyUnicode_RichCompare, /* tp_richcompare */
12533 0, /* tp_weaklistoffset */
12534 unicode_iter, /* tp_iter */
12535 0, /* tp_iternext */
12536 unicode_methods, /* tp_methods */
12537 0, /* tp_members */
12538 0, /* tp_getset */
12539 &PyBaseObject_Type, /* tp_base */
12540 0, /* tp_dict */
12541 0, /* tp_descr_get */
12542 0, /* tp_descr_set */
12543 0, /* tp_dictoffset */
12544 0, /* tp_init */
12545 0, /* tp_alloc */
12546 unicode_new, /* tp_new */
12547 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548};
12549
12550/* Initialize the Unicode implementation */
12551
Thomas Wouters78890102000-07-22 19:25:51 +000012552void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012554 int i;
12555
Thomas Wouters477c8d52006-05-27 19:21:47 +000012556 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558 0x000A, /* LINE FEED */
12559 0x000D, /* CARRIAGE RETURN */
12560 0x001C, /* FILE SEPARATOR */
12561 0x001D, /* GROUP SEPARATOR */
12562 0x001E, /* RECORD SEPARATOR */
12563 0x0085, /* NEXT LINE */
12564 0x2028, /* LINE SEPARATOR */
12565 0x2029, /* PARAGRAPH SEPARATOR */
12566 };
12567
Fred Drakee4315f52000-05-09 19:53:39 +000012568 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012570 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012572
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012573 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012574 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012575 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012577
12578 /* initialize the linebreak bloom filter */
12579 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012581 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012582
12583 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
12586/* Finalize the Unicode implementation */
12587
/* Nothing is released by this implementation; the function
   unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12593
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594void
Thomas Wouters78890102000-07-22 19:25:51 +000012595_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012597 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012599 Py_XDECREF(unicode_empty);
12600 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012601
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012602 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 if (unicode_latin1[i]) {
12604 Py_DECREF(unicode_latin1[i]);
12605 unicode_latin1[i] = NULL;
12606 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012607 }
Christian Heimesa156e092008-02-16 07:38:31 +000012608 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012610
Walter Dörwald16807132007-05-25 13:52:07 +000012611void
12612PyUnicode_InternInPlace(PyObject **p)
12613{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12615 PyObject *t;
12616 if (s == NULL || !PyUnicode_Check(s))
12617 Py_FatalError(
12618 "PyUnicode_InternInPlace: unicode strings only please!");
12619 /* If it's a subclass, we don't really know what putting
12620 it in the interned dict might do. */
12621 if (!PyUnicode_CheckExact(s))
12622 return;
12623 if (PyUnicode_CHECK_INTERNED(s))
12624 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 if (PyUnicode_READY(s) == -1) {
12626 assert(0 && "ready fail in intern...");
12627 return;
12628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012629 if (interned == NULL) {
12630 interned = PyDict_New();
12631 if (interned == NULL) {
12632 PyErr_Clear(); /* Don't leave an exception */
12633 return;
12634 }
12635 }
12636 /* It might be that the GetItem call fails even
12637 though the key is present in the dictionary,
12638 namely when this happens during a stack overflow. */
12639 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012641 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012642
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 if (t) {
12644 Py_INCREF(t);
12645 Py_DECREF(*p);
12646 *p = t;
12647 return;
12648 }
Walter Dörwald16807132007-05-25 13:52:07 +000012649
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 PyThreadState_GET()->recursion_critical = 1;
12651 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12652 PyErr_Clear();
12653 PyThreadState_GET()->recursion_critical = 0;
12654 return;
12655 }
12656 PyThreadState_GET()->recursion_critical = 0;
12657 /* The two references in interned are not counted by refcnt.
12658 The deallocator will take care of this */
12659 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012661}
12662
12663void
12664PyUnicode_InternImmortal(PyObject **p)
12665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12667
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 PyUnicode_InternInPlace(p);
12669 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 Py_INCREF(*p);
12672 }
Walter Dörwald16807132007-05-25 13:52:07 +000012673}
12674
12675PyObject *
12676PyUnicode_InternFromString(const char *cp)
12677{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012678 PyObject *s = PyUnicode_FromString(cp);
12679 if (s == NULL)
12680 return NULL;
12681 PyUnicode_InternInPlace(&s);
12682 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012683}
12684
Alexander Belopolsky40018472011-02-26 01:02:56 +000012685void
12686_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012687{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 PyObject *keys;
12689 PyUnicodeObject *s;
12690 Py_ssize_t i, n;
12691 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012692
Benjamin Peterson14339b62009-01-31 16:36:08 +000012693 if (interned == NULL || !PyDict_Check(interned))
12694 return;
12695 keys = PyDict_Keys(interned);
12696 if (keys == NULL || !PyList_Check(keys)) {
12697 PyErr_Clear();
12698 return;
12699 }
Walter Dörwald16807132007-05-25 13:52:07 +000012700
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12702 detector, interned unicode strings are not forcibly deallocated;
12703 rather, we give them their stolen references back, and then clear
12704 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012705
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 n = PyList_GET_SIZE(keys);
12707 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012709 for (i = 0; i < n; i++) {
12710 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 if (PyUnicode_READY(s) == -1)
12712 fprintf(stderr, "could not ready string\n");
12713 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 case SSTATE_NOT_INTERNED:
12715 /* XXX Shouldn't happen */
12716 break;
12717 case SSTATE_INTERNED_IMMORTAL:
12718 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 break;
12721 case SSTATE_INTERNED_MORTAL:
12722 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012724 break;
12725 default:
12726 Py_FatalError("Inconsistent interned string state.");
12727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 }
12730 fprintf(stderr, "total size of all interned strings: "
12731 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12732 "mortal/immortal\n", mortal_size, immortal_size);
12733 Py_DECREF(keys);
12734 PyDict_Clear(interned);
12735 Py_DECREF(interned);
12736 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012737}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012738
12739
12740/********************* Unicode Iterator **************************/
12741
12742typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 PyObject_HEAD
12744 Py_ssize_t it_index;
12745 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012746} unicodeiterobject;
12747
12748static void
12749unicodeiter_dealloc(unicodeiterobject *it)
12750{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012751 _PyObject_GC_UNTRACK(it);
12752 Py_XDECREF(it->it_seq);
12753 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012754}
12755
12756static int
12757unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12758{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 Py_VISIT(it->it_seq);
12760 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012761}
12762
12763static PyObject *
12764unicodeiter_next(unicodeiterobject *it)
12765{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 PyUnicodeObject *seq;
12767 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012768
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 assert(it != NULL);
12770 seq = it->it_seq;
12771 if (seq == NULL)
12772 return NULL;
12773 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12776 int kind = PyUnicode_KIND(seq);
12777 void *data = PyUnicode_DATA(seq);
12778 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12779 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 if (item != NULL)
12781 ++it->it_index;
12782 return item;
12783 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012784
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 Py_DECREF(seq);
12786 it->it_seq = NULL;
12787 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012788}
12789
12790static PyObject *
12791unicodeiter_len(unicodeiterobject *it)
12792{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012793 Py_ssize_t len = 0;
12794 if (it->it_seq)
12795 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12796 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012797}
12798
12799PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12800
12801static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012802 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012805};
12806
12807PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12809 "str_iterator", /* tp_name */
12810 sizeof(unicodeiterobject), /* tp_basicsize */
12811 0, /* tp_itemsize */
12812 /* methods */
12813 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12814 0, /* tp_print */
12815 0, /* tp_getattr */
12816 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012817 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 0, /* tp_repr */
12819 0, /* tp_as_number */
12820 0, /* tp_as_sequence */
12821 0, /* tp_as_mapping */
12822 0, /* tp_hash */
12823 0, /* tp_call */
12824 0, /* tp_str */
12825 PyObject_GenericGetAttr, /* tp_getattro */
12826 0, /* tp_setattro */
12827 0, /* tp_as_buffer */
12828 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12829 0, /* tp_doc */
12830 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12831 0, /* tp_clear */
12832 0, /* tp_richcompare */
12833 0, /* tp_weaklistoffset */
12834 PyObject_SelfIter, /* tp_iter */
12835 (iternextfunc)unicodeiter_next, /* tp_iternext */
12836 unicodeiter_methods, /* tp_methods */
12837 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012838};
12839
12840static PyObject *
12841unicode_iter(PyObject *seq)
12842{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012843 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012844
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 if (!PyUnicode_Check(seq)) {
12846 PyErr_BadInternalCall();
12847 return NULL;
12848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 if (PyUnicode_READY(seq) == -1)
12850 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012851 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12852 if (it == NULL)
12853 return NULL;
12854 it->it_index = 0;
12855 Py_INCREF(seq);
12856 it->it_seq = (PyUnicodeObject *)seq;
12857 _PyObject_GC_TRACK(it);
12858 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012859}
12860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861#define UNIOP(x) Py_UNICODE_##x
12862#define UNIOP_t Py_UNICODE
12863#include "uniops.h"
12864#undef UNIOP
12865#undef UNIOP_t
12866#define UNIOP(x) Py_UCS4_##x
12867#define UNIOP_t Py_UCS4
12868#include "uniops.h"
12869#undef UNIOP
12870#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012871
Victor Stinner71133ff2010-09-01 23:43:53 +000012872Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012873PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012874{
12875 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12876 Py_UNICODE *copy;
12877 Py_ssize_t size;
12878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 if (!PyUnicode_Check(unicode)) {
12880 PyErr_BadArgument();
12881 return NULL;
12882 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012883 /* Ensure we won't overflow the size. */
12884 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12885 PyErr_NoMemory();
12886 return NULL;
12887 }
12888 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12889 size *= sizeof(Py_UNICODE);
12890 copy = PyMem_Malloc(size);
12891 if (copy == NULL) {
12892 PyErr_NoMemory();
12893 return NULL;
12894 }
12895 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12896 return copy;
12897}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012898
Georg Brandl66c221e2010-10-14 07:04:07 +000012899/* A _string module, to export formatter_parser and formatter_field_name_split
12900 to the string.Formatter class implemented in Python. */
12901
12902static PyMethodDef _string_methods[] = {
12903 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12904 METH_O, PyDoc_STR("split the argument as a field name")},
12905 {"formatter_parser", (PyCFunction) formatter_parser,
12906 METH_O, PyDoc_STR("parse the argument as a format string")},
12907 {NULL, NULL}
12908};
12909
12910static struct PyModuleDef _string_module = {
12911 PyModuleDef_HEAD_INIT,
12912 "_string",
12913 PyDoc_STR("string helper module"),
12914 0,
12915 _string_methods,
12916 NULL,
12917 NULL,
12918 NULL,
12919 NULL
12920};
12921
12922PyMODINIT_FUNC
12923PyInit__string(void)
12924{
12925 return PyModule_Create(&_string_module);
12926}
12927
12928
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012929#ifdef __cplusplus
12930}
12931#endif