/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Maximum number of entries kept on the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   For every object on the free list whose size is below this limit,
   the implementation keeps the allocated Unicode memory intact.  This
   reduces malloc() overhead for small Unicode objects.

   In the worst case this results in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (end is exclusive) and should be of type
   "from_type *".  to points to the buffer receiving the converted
   characters; it is cast to "to_type *".

   Note: the end argument is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        to_type *to_ = (to_type *)(to);                              \
        while (iter_ < (end)) {                                      \
            *to_ = (to_type)*iter_;                                  \
            ++iter_;                                                 \
            ++to_;                                                   \
        }                                                            \
    } while (0)
106
/* Field accessors for string objects.

   The underscore-prefixed macros expand to a plain struct member access
   after casting op to the relevant object layout; the member-access forms
   can therefore be used as assignment targets.  The PyUnicode_UTF8* forms
   additionally assert that op is a ready unicode object and, for compact
   ASCII strings, use the data stored directly behind the PyASCIIObject
   header instead of the separate utf8 fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: invalidate the cached hash by
   resetting it to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
144 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000145 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
150static PyUnicodeObject *unicode_empty;
151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
154static PyUnicodeObject *unicode_latin1[256];
155
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when ASCII code point c is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Fast detection of ASCII line break characters: entry c is 1 when ASCII
   code point c is a line break, 0 otherwise.  The table is only ever read,
   so it is declared const (like _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
244/* stuff to implement simple "bloom filters" for Unicode characters.
245 to keep things simple, we use a single bitmask, using the least 5
246 bits from each unicode characters as the bit index. */
247
248/* the linebreak mask is set up by Unicode_Init below */
249
Antoine Pitrouf068f942010-01-13 14:19:12 +0000250#if LONG_BIT >= 128
251#define BLOOM_WIDTH 128
252#elif LONG_BIT >= 64
253#define BLOOM_WIDTH 64
254#elif LONG_BIT >= 32
255#define BLOOM_WIDTH 32
256#else
257#error "LONG_BIT is smaller than 32"
258#endif
259
Thomas Wouters477c8d52006-05-27 19:21:47 +0000260#define BLOOM_MASK unsigned long
261
262static BLOOM_MASK bloom_linebreak;
263
Antoine Pitrouf068f942010-01-13 14:19:12 +0000264#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
265#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266
Benjamin Peterson29060642009-01-31 22:14:21 +0000267#define BLOOM_LINEBREAK(ch) \
268 ((ch) < 128U ? ascii_linebreak[(ch)] : \
269 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* Membership test for chr in str: the bloom mask is consulted first as a
   cheap filter, and str is only searched (forward, over its whole length)
   when the mask bit is set.  Presumably mask was built from str -- confirm
   at the call sites.  Note that chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) != 0                                              \
     && PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
/* We allocate one more character (not byte) to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
401 return unicode_empty;
402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
444 unicode->data.any = NULL;
445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
#ifdef Py_DEBUG
/* Counter incremented by PyUnicode_New() in debug builds. */
int unicode_new_new_calls = 0;

/* Plain functions wrapping the accessor macros, so that they can be
   called from a debugger. */

/* Wraps PyUnicode_UTF8(). */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}

/* Wraps _PyUnicode_COMPACT_DATA(). */
void *_PyUnicode_compact_data(void *unicode) {
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}

/* Prints layout information about the object to stdout, then returns
   PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    /* Addresses just past the two possible object headers. */
    void *after_ascii = ((void*)((PyASCIIObject*)(unicode) + 1));
    void *after_compact = ((void*)((PyCompactUnicodeObject*)(unicode) + 1));

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
479
480PyObject *
481PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
482{
483 PyObject *obj;
484 PyCompactUnicodeObject *unicode;
485 void *data;
486 int kind_state;
487 int is_sharing = 0, is_ascii = 0;
488 Py_ssize_t char_size;
489 Py_ssize_t struct_size;
490
491 /* Optimization for empty strings */
492 if (size == 0 && unicode_empty != NULL) {
493 Py_INCREF(unicode_empty);
494 return (PyObject *)unicode_empty;
495 }
496
497#ifdef Py_DEBUG
498 ++unicode_new_new_calls;
499#endif
500
501 struct_size = sizeof(PyCompactUnicodeObject);
502 if (maxchar < 128) {
503 kind_state = PyUnicode_1BYTE_KIND;
504 char_size = 1;
505 is_ascii = 1;
506 struct_size = sizeof(PyASCIIObject);
507 }
508 else if (maxchar < 256) {
509 kind_state = PyUnicode_1BYTE_KIND;
510 char_size = 1;
511 }
512 else if (maxchar < 65536) {
513 kind_state = PyUnicode_2BYTE_KIND;
514 char_size = 2;
515 if (sizeof(wchar_t) == 2)
516 is_sharing = 1;
517 }
518 else {
519 kind_state = PyUnicode_4BYTE_KIND;
520 char_size = 4;
521 if (sizeof(wchar_t) == 4)
522 is_sharing = 1;
523 }
524
525 /* Ensure we won't overflow the size. */
526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
528 "Negative size passed to PyUnicode_New");
529 return NULL;
530 }
531 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
532 return PyErr_NoMemory();
533
534 /* Duplicated allocation code from _PyObject_New() instead of a call to
535 * PyObject_New() so we are able to allocate space for the object and
536 * it's data buffer.
537 */
538 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
539 if (obj == NULL)
540 return PyErr_NoMemory();
541 obj = PyObject_INIT(obj, &PyUnicode_Type);
542 if (obj == NULL)
543 return NULL;
544
545 unicode = (PyCompactUnicodeObject *)obj;
546 if (is_ascii)
547 data = ((PyASCIIObject*)obj) + 1;
548 else
549 data = unicode + 1;
550 _PyUnicode_LENGTH(unicode) = size;
551 _PyUnicode_HASH(unicode) = -1;
552 _PyUnicode_STATE(unicode).interned = 0;
553 _PyUnicode_STATE(unicode).kind = kind_state;
554 _PyUnicode_STATE(unicode).compact = 1;
555 _PyUnicode_STATE(unicode).ready = 1;
556 _PyUnicode_STATE(unicode).ascii = is_ascii;
557 if (is_ascii) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 }
561 else if (kind_state == PyUnicode_1BYTE_KIND) {
562 ((char*)data)[size] = 0;
563 _PyUnicode_WSTR(unicode) = NULL;
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 unicode->utf8_length = 0;
566 unicode->utf8 = NULL;
567 }
568 else {
569 unicode->utf8 = NULL;
570 if (kind_state == PyUnicode_2BYTE_KIND)
571 ((Py_UCS2*)data)[size] = 0;
572 else /* kind_state == PyUnicode_4BYTE_KIND */
573 ((Py_UCS4*)data)[size] = 0;
574 if (is_sharing) {
575 _PyUnicode_WSTR_LENGTH(unicode) = size;
576 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
577 }
578 else {
579 _PyUnicode_WSTR_LENGTH(unicode) = 0;
580 _PyUnicode_WSTR(unicode) = NULL;
581 }
582 }
583 return obj;
584}
585
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate directly followed by a low surrogate is decoded into
   one code point; every other unit (including a lone surrogate) is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   begin/end delimit the source units (end is exclusive); the result is
   written to the 4-byte data buffer of unicode.  This function assumes
   that unicode can hold one more code point than wstr characters for a
   terminating null character.  Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
624
Victor Stinnercd9950f2011-10-02 00:34:53 +0200625static int
626_PyUnicode_Dirty(PyObject *unicode)
627{
628 assert(PyUnicode_Check(unicode));
629 if (Py_REFCNT(unicode) != 1) {
630 PyErr_SetString(PyExc_ValueError,
631 "Cannot modify a string having more than 1 reference");
632 return -1;
633 }
634 _PyUnicode_DIRTY(unicode);
635 return 0;
636}
637
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
640 PyObject *from, Py_ssize_t from_start,
641 Py_ssize_t how_many)
642{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200643 unsigned int from_kind, to_kind;
644 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Victor Stinnerb1536152011-09-30 02:26:10 +0200646 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
647 PyErr_BadInternalCall();
648 return -1;
649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200650
651 if (PyUnicode_READY(from))
652 return -1;
653 if (PyUnicode_READY(to))
654 return -1;
655
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200656 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200657 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
658 PyErr_Format(PyExc_ValueError,
659 "Cannot write %zi characters at %zi "
660 "in a string of %zi characters",
661 how_many, to_start, PyUnicode_GET_LENGTH(to));
662 return -1;
663 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200664 if (how_many == 0)
665 return 0;
666
Victor Stinnercd9950f2011-10-02 00:34:53 +0200667 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200668 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200671 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200673 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200674
675 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200677 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200679 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200680 + PyUnicode_KIND_SIZE(from_kind, from_start),
681 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200683 else if (from_kind == PyUnicode_1BYTE_KIND
684 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 {
686 _PyUnicode_CONVERT_BYTES(
687 Py_UCS1, Py_UCS2,
688 PyUnicode_1BYTE_DATA(from) + from_start,
689 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
690 PyUnicode_2BYTE_DATA(to) + to_start
691 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200692 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200693 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200694 && to_kind == PyUnicode_4BYTE_KIND)
695 {
696 _PyUnicode_CONVERT_BYTES(
697 Py_UCS1, Py_UCS4,
698 PyUnicode_1BYTE_DATA(from) + from_start,
699 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
700 PyUnicode_4BYTE_DATA(to) + to_start
701 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200702 }
703 else if (from_kind == PyUnicode_2BYTE_KIND
704 && to_kind == PyUnicode_4BYTE_KIND)
705 {
706 _PyUnicode_CONVERT_BYTES(
707 Py_UCS2, Py_UCS4,
708 PyUnicode_2BYTE_DATA(from) + from_start,
709 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
710 PyUnicode_4BYTE_DATA(to) + to_start
711 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200712 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200713 else {
714 int invalid_kinds;
715 if (from_kind > to_kind) {
716 /* slow path to check for character overflow */
717 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
718 Py_UCS4 ch, maxchar;
719 Py_ssize_t i;
720
721 maxchar = 0;
722 invalid_kinds = 0;
723 for (i=0; i < how_many; i++) {
724 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
725 if (ch > maxchar) {
726 maxchar = ch;
727 if (maxchar > to_maxchar) {
728 invalid_kinds = 1;
729 break;
730 }
731 }
732 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
733 }
734 }
735 else
736 invalid_kinds = 1;
737 if (invalid_kinds) {
738 PyErr_Format(PyExc_ValueError,
739 "Cannot copy UCS%u characters "
740 "into a string of UCS%u characters",
741 1 << (from_kind - 1),
742 1 << (to_kind -1));
743 return -1;
744 }
745 }
746 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747}
748
Victor Stinner17222162011-09-28 22:15:37 +0200749/* Find the maximum code point and count the number of surrogate pairs so a
750 correct string length can be computed before converting a string to UCS4.
751 This function counts single surrogates as a character and not as a pair.
752
753 Return 0 on success, or -1 on error. */
754static int
755find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
756 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757{
758 const wchar_t *iter;
759
760 if (num_surrogates == NULL || maxchar == NULL) {
761 PyErr_SetString(PyExc_SystemError,
762 "unexpected NULL arguments to "
763 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
764 return -1;
765 }
766
767 *num_surrogates = 0;
768 *maxchar = 0;
769
770 for (iter = begin; iter < end; ) {
771 if (*iter > *maxchar)
772 *maxchar = *iter;
773#if SIZEOF_WCHAR_T == 2
774 if (*iter >= 0xD800 && *iter <= 0xDBFF
775 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
776 {
777 Py_UCS4 surrogate_val;
778 surrogate_val = (((iter[0] & 0x3FF)<<10)
779 | (iter[1] & 0x3FF)) + 0x10000;
780 ++(*num_surrogates);
781 if (surrogate_val > *maxchar)
782 *maxchar = surrogate_val;
783 iter += 2;
784 }
785 else
786 iter++;
787#else
788 iter++;
789#endif
790 }
791 return 0;
792}
793
794#ifdef Py_DEBUG
795int unicode_ready_calls = 0;
796#endif
797
798int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200799_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 wchar_t *end;
803 Py_UCS4 maxchar = 0;
804 Py_ssize_t num_surrogates;
805#if SIZEOF_WCHAR_T == 2
806 Py_ssize_t length_wo_surrogates;
807#endif
808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200810 strings were created using _PyObject_New() and where no canonical
811 representation (the str field) has been set yet aka strings
812 which are not yet ready. */
813 assert(PyUnicode_Check(obj));
814 assert(!PyUnicode_IS_READY(obj));
815 assert(!PyUnicode_IS_COMPACT(obj));
816 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200818 assert(unicode->data.any == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200820 /* Actually, it should neither be interned nor be anything else: */
821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
823#ifdef Py_DEBUG
824 ++unicode_ready_calls;
825#endif
826
827 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200828 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200829 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831
832 if (maxchar < 256) {
833 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
834 if (!unicode->data.any) {
835 PyErr_NoMemory();
836 return -1;
837 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200838 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_WSTR(unicode), end,
840 PyUnicode_1BYTE_DATA(unicode));
841 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
843 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
844 if (maxchar < 128) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = unicode->data.any;
846 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 }
848 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200849 _PyUnicode_UTF8(unicode) = NULL;
850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 }
852 PyObject_FREE(_PyUnicode_WSTR(unicode));
853 _PyUnicode_WSTR(unicode) = NULL;
854 _PyUnicode_WSTR_LENGTH(unicode) = 0;
855 }
856 /* In this case we might have to convert down from 4-byte native
857 wchar_t to 2-byte unicode. */
858 else if (maxchar < 65536) {
859 assert(num_surrogates == 0 &&
860 "FindMaxCharAndNumSurrogatePairs() messed up");
861
Victor Stinner506f5922011-09-28 22:34:18 +0200862#if SIZEOF_WCHAR_T == 2
863 /* We can share representations and are done. */
864 unicode->data.any = _PyUnicode_WSTR(unicode);
865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200868 _PyUnicode_UTF8(unicode) = NULL;
869 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200870#else
871 /* sizeof(wchar_t) == 4 */
872 unicode->data.any = PyObject_MALLOC(
873 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
874 if (!unicode->data.any) {
875 PyErr_NoMemory();
876 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
Victor Stinner506f5922011-09-28 22:34:18 +0200878 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
879 _PyUnicode_WSTR(unicode), end,
880 PyUnicode_2BYTE_DATA(unicode));
881 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
882 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
883 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200884 _PyUnicode_UTF8(unicode) = NULL;
885 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200886 PyObject_FREE(_PyUnicode_WSTR(unicode));
887 _PyUnicode_WSTR(unicode) = NULL;
888 _PyUnicode_WSTR_LENGTH(unicode) = 0;
889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 }
891 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
892 else {
893#if SIZEOF_WCHAR_T == 2
894 /* in case the native representation is 2-bytes, we need to allocate a
895 new normalized 4-byte version. */
896 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
897 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
898 if (!unicode->data.any) {
899 PyErr_NoMemory();
900 return -1;
901 }
902 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
903 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 _PyUnicode_UTF8(unicode) = NULL;
905 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
907 unicode) < 0) {
908 assert(0 && "ConvertWideCharToUCS4 failed");
909 return -1;
910 }
911 PyObject_FREE(_PyUnicode_WSTR(unicode));
912 _PyUnicode_WSTR(unicode) = NULL;
913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
914#else
915 assert(num_surrogates == 0);
916
917 unicode->data.any = _PyUnicode_WSTR(unicode);
918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
922#endif
923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
924 }
925 _PyUnicode_STATE(unicode).ready = 1;
926 return 0;
927}
928
Alexander Belopolsky40018472011-02-26 01:02:56 +0000929static void
930unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931{
Walter Dörwald16807132007-05-25 13:52:07 +0000932 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 case SSTATE_NOT_INTERNED:
934 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000935
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 case SSTATE_INTERNED_MORTAL:
937 /* revive dead object temporarily for DelItem */
938 Py_REFCNT(unicode) = 3;
939 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
940 Py_FatalError(
941 "deletion of interned string failed");
942 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000943
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 case SSTATE_INTERNED_IMMORTAL:
945 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000946
Benjamin Peterson29060642009-01-31 22:14:21 +0000947 default:
948 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000949 }
950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 if (_PyUnicode_WSTR(unicode) &&
952 (!PyUnicode_IS_READY(unicode) ||
953 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
954 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200955 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
956 && _PyUnicode_UTF8(unicode)
957 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
958 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959
960 if (PyUnicode_IS_COMPACT(unicode)) {
961 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 }
963 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 if (unicode->data.any)
965 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000966 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
968}
969
Alexander Belopolsky40018472011-02-26 01:02:56 +0000970static int
971_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000972{
973 register PyUnicodeObject *v;
974
975 /* Argument checks */
976 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000977 PyErr_BadInternalCall();
978 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000979 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000980 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
982 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 PyErr_BadInternalCall();
984 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000985 }
986
987 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 possible since these are being shared.
989 The same goes for new-representation unicode objects or objects which
990 have already been readied.
991 For these, we simply return a fresh copy with the same Unicode content.
992 */
993 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
994 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
995 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000996 PyUnicodeObject *w = _PyUnicode_New(length);
997 if (w == NULL)
998 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1000 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001001 Py_DECREF(*unicode);
1002 *unicode = w;
1003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001004 }
1005
1006 /* Note that we don't have to modify *unicode for unshared Unicode
1007 objects, since we can modify them in-place. */
1008 return unicode_resize(v, length);
1009}
1010
Alexander Belopolsky40018472011-02-26 01:02:56 +00001011int
1012PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001013{
1014 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1015}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017static PyObject*
1018get_latin1_char(unsigned char ch)
1019{
1020 PyUnicodeObject *unicode = unicode_latin1[ch];
1021 if (!unicode) {
1022 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1023 if (!unicode)
1024 return NULL;
1025 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1026 unicode_latin1[ch] = unicode;
1027 }
1028 Py_INCREF(unicode);
1029 return (PyObject *)unicode;
1030}
1031
Alexander Belopolsky40018472011-02-26 01:02:56 +00001032PyObject *
1033PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 Py_UCS4 maxchar = 0;
1037 Py_ssize_t num_surrogates;
1038
1039 if (u == NULL)
1040 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001042 /* If the Unicode data is known at construction time, we can apply
1043 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 /* Optimization for empty strings */
1046 if (size == 0 && unicode_empty != NULL) {
1047 Py_INCREF(unicode_empty);
1048 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001049 }
Tim Petersced69f82003-09-16 20:30:58 +00001050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 /* Single character Unicode objects in the Latin-1 range are
1052 shared when using this constructor */
1053 if (size == 1 && *u < 256)
1054 return get_latin1_char((unsigned char)*u);
1055
1056 /* If not empty and not single character, copy the Unicode data
1057 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001058 if (find_maxchar_surrogates(u, u + size,
1059 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return NULL;
1061
1062 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1063 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 if (!unicode)
1065 return NULL;
1066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 switch (PyUnicode_KIND(unicode)) {
1068 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001069 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1071 break;
1072 case PyUnicode_2BYTE_KIND:
1073#if Py_UNICODE_SIZE == 2
1074 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1075#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001076 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1078#endif
1079 break;
1080 case PyUnicode_4BYTE_KIND:
1081#if SIZEOF_WCHAR_T == 2
1082 /* This is the only case which has to process surrogates, thus
1083 a simple copy loop is not enough and we need a function. */
1084 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1085 Py_DECREF(unicode);
1086 return NULL;
1087 }
1088#else
1089 assert(num_surrogates == 0);
1090 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1091#endif
1092 break;
1093 default:
1094 assert(0 && "Impossible state");
1095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096
1097 return (PyObject *)unicode;
1098}
1099
Alexander Belopolsky40018472011-02-26 01:02:56 +00001100PyObject *
1101PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102{
1103 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001104
Benjamin Peterson14339b62009-01-31 16:36:08 +00001105 if (size < 0) {
1106 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 return NULL;
1109 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001110
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112 some optimizations which share commonly used objects.
1113 Also, this means the input must be UTF-8, so fall back to the
1114 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001115 if (u != NULL) {
1116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 /* Optimization for empty strings */
1118 if (size == 0 && unicode_empty != NULL) {
1119 Py_INCREF(unicode_empty);
1120 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001122
1123 /* Single characters are shared when using this constructor.
1124 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 if (size == 1 && Py_CHARMASK(*u) < 128)
1126 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001127
1128 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001129 }
1130
Walter Dörwald55507312007-05-18 13:12:10 +00001131 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001132 if (!unicode)
1133 return NULL;
1134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001135 return (PyObject *)unicode;
1136}
1137
Alexander Belopolsky40018472011-02-26 01:02:56 +00001138PyObject *
1139PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001140{
1141 size_t size = strlen(u);
1142 if (size > PY_SSIZE_T_MAX) {
1143 PyErr_SetString(PyExc_OverflowError, "input too long");
1144 return NULL;
1145 }
1146
1147 return PyUnicode_FromStringAndSize(u, size);
1148}
1149
Victor Stinnere57b1c02011-09-28 22:20:48 +02001150static PyObject*
1151_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 PyObject *res;
1154 unsigned char max = 127;
1155 Py_ssize_t i;
1156 for (i = 0; i < size; i++) {
1157 if (u[i] & 0x80) {
1158 max = 255;
1159 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001160 }
1161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 res = PyUnicode_New(size, max);
1163 if (!res)
1164 return NULL;
1165 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1166 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001167}
1168
Victor Stinnere57b1c02011-09-28 22:20:48 +02001169static PyObject*
1170_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171{
1172 PyObject *res;
1173 Py_UCS2 max = 0;
1174 Py_ssize_t i;
1175 for (i = 0; i < size; i++)
1176 if (u[i] > max)
1177 max = u[i];
1178 res = PyUnicode_New(size, max);
1179 if (!res)
1180 return NULL;
1181 if (max >= 256)
1182 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1183 else
1184 for (i = 0; i < size; i++)
1185 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1186 return res;
1187}
1188
Victor Stinnere57b1c02011-09-28 22:20:48 +02001189static PyObject*
1190_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191{
1192 PyObject *res;
1193 Py_UCS4 max = 0;
1194 Py_ssize_t i;
1195 for (i = 0; i < size; i++)
1196 if (u[i] > max)
1197 max = u[i];
1198 res = PyUnicode_New(size, max);
1199 if (!res)
1200 return NULL;
1201 if (max >= 0x10000)
1202 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1203 else {
1204 int kind = PyUnicode_KIND(res);
1205 void *data = PyUnicode_DATA(res);
1206 for (i = 0; i < size; i++)
1207 PyUnicode_WRITE(kind, data, i, u[i]);
1208 }
1209 return res;
1210}
1211
1212PyObject*
1213PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1214{
1215 switch(kind) {
1216 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001217 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001219 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001221 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001223 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 return NULL;
1225}
1226
Victor Stinner034f6cf2011-09-30 02:26:44 +02001227PyObject*
1228PyUnicode_Copy(PyObject *unicode)
1229{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001230 Py_ssize_t size;
1231 PyObject *copy;
1232 void *data;
1233
Victor Stinner034f6cf2011-09-30 02:26:44 +02001234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadInternalCall();
1236 return NULL;
1237 }
1238 if (PyUnicode_READY(unicode))
1239 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001240
1241 size = PyUnicode_GET_LENGTH(unicode);
1242 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1243 if (!copy)
1244 return NULL;
1245 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1246
1247 data = PyUnicode_DATA(unicode);
1248 switch (PyUnicode_KIND(unicode))
1249 {
1250 case PyUnicode_1BYTE_KIND:
1251 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1252 break;
1253 case PyUnicode_2BYTE_KIND:
1254 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1255 break;
1256 case PyUnicode_4BYTE_KIND:
1257 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1258 break;
1259 default:
1260 assert(0);
1261 break;
1262 }
1263 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001264}
1265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266
Victor Stinnerbc603d12011-10-02 01:00:40 +02001267/* Widen Unicode objects to larger buffers. Don't write terminating null
1268 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269
1270void*
1271_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1272{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001273 Py_ssize_t len;
1274 void *result;
1275 unsigned int skind;
1276
1277 if (PyUnicode_READY(s))
1278 return NULL;
1279
1280 len = PyUnicode_GET_LENGTH(s);
1281 skind = PyUnicode_KIND(s);
1282 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1284 return NULL;
1285 }
1286 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001287 case PyUnicode_2BYTE_KIND:
1288 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1289 if (!result)
1290 return PyErr_NoMemory();
1291 assert(skind == PyUnicode_1BYTE_KIND);
1292 _PyUnicode_CONVERT_BYTES(
1293 Py_UCS1, Py_UCS2,
1294 PyUnicode_1BYTE_DATA(s),
1295 PyUnicode_1BYTE_DATA(s) + len,
1296 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001298 case PyUnicode_4BYTE_KIND:
1299 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1300 if (!result)
1301 return PyErr_NoMemory();
1302 if (skind == PyUnicode_2BYTE_KIND) {
1303 _PyUnicode_CONVERT_BYTES(
1304 Py_UCS2, Py_UCS4,
1305 PyUnicode_2BYTE_DATA(s),
1306 PyUnicode_2BYTE_DATA(s) + len,
1307 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001309 else {
1310 assert(skind == PyUnicode_1BYTE_KIND);
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS1, Py_UCS4,
1313 PyUnicode_1BYTE_DATA(s),
1314 PyUnicode_1BYTE_DATA(s) + len,
1315 result);
1316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001318 default:
1319 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001321 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 return NULL;
1323}
1324
1325static Py_UCS4*
1326as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1327 int copy_null)
1328{
1329 int kind;
1330 void *data;
1331 Py_ssize_t len, targetlen;
1332 if (PyUnicode_READY(string) == -1)
1333 return NULL;
1334 kind = PyUnicode_KIND(string);
1335 data = PyUnicode_DATA(string);
1336 len = PyUnicode_GET_LENGTH(string);
1337 targetlen = len;
1338 if (copy_null)
1339 targetlen++;
1340 if (!target) {
1341 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1342 PyErr_NoMemory();
1343 return NULL;
1344 }
1345 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1346 if (!target) {
1347 PyErr_NoMemory();
1348 return NULL;
1349 }
1350 }
1351 else {
1352 if (targetsize < targetlen) {
1353 PyErr_Format(PyExc_SystemError,
1354 "string is longer than the buffer");
1355 if (copy_null && 0 < targetsize)
1356 target[0] = 0;
1357 return NULL;
1358 }
1359 }
1360 if (kind != PyUnicode_4BYTE_KIND) {
1361 Py_ssize_t i;
1362 for (i = 0; i < len; i++)
1363 target[i] = PyUnicode_READ(kind, data, i);
1364 }
1365 else
1366 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1367 if (copy_null)
1368 target[len] = 0;
1369 return target;
1370}
1371
1372Py_UCS4*
1373PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1374 int copy_null)
1375{
1376 if (target == NULL || targetsize < 1) {
1377 PyErr_BadInternalCall();
1378 return NULL;
1379 }
1380 return as_ucs4(string, target, targetsize, copy_null);
1381}
1382
1383Py_UCS4*
1384PyUnicode_AsUCS4Copy(PyObject *string)
1385{
1386 return as_ucs4(string, NULL, 0, 1);
1387}
1388
1389#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001390
Alexander Belopolsky40018472011-02-26 01:02:56 +00001391PyObject *
1392PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001395 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_BadInternalCall();
1398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 }
1400
Martin v. Löwis790465f2008-04-05 20:41:37 +00001401 if (size == -1) {
1402 size = wcslen(w);
1403 }
1404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406}
1407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001409
Walter Dörwald346737f2007-05-31 10:44:43 +00001410static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001411makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1412 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001413{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001414 *fmt++ = '%';
1415 if (width) {
1416 if (zeropad)
1417 *fmt++ = '0';
1418 fmt += sprintf(fmt, "%d", width);
1419 }
1420 if (precision)
1421 fmt += sprintf(fmt, ".%d", precision);
1422 if (longflag)
1423 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001424 else if (longlongflag) {
1425 /* longlongflag should only ever be nonzero on machines with
1426 HAVE_LONG_LONG defined */
1427#ifdef HAVE_LONG_LONG
1428 char *f = PY_FORMAT_LONG_LONG;
1429 while (*f)
1430 *fmt++ = *f++;
1431#else
1432 /* we shouldn't ever get here */
1433 assert(0);
1434 *fmt++ = 'l';
1435#endif
1436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001437 else if (size_tflag) {
1438 char *f = PY_FORMAT_SIZE_T;
1439 while (*f)
1440 *fmt++ = *f++;
1441 }
1442 *fmt++ = c;
1443 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001444}
1445
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' that starts a format specification.  Parse the
   optional width, the optional ".precision" and an optional length modifier
   ("l", "ll" when HAVE_LONG_LONG is defined, or "z") that is directly
   followed by 'd', 'u' or 'i'.  Each result is stored through its output
   pointer when that pointer is not NULL.

   Return a pointer to the last character consumed as part of the
   width/precision/modifier, i.e. normally the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (f++; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, and the size_t flag. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Publish the results the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1510
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1518
Walter Dörwaldd2034312007-05-18 16:29:38 +00001519PyObject *
1520PyUnicode_FromFormatV(const char *format, va_list vargs)
1521{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001522 va_list count;
1523 Py_ssize_t callcount = 0;
1524 PyObject **callresults = NULL;
1525 PyObject **callresult = NULL;
1526 Py_ssize_t n = 0;
1527 int width = 0;
1528 int precision = 0;
1529 int zeropad;
1530 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001532 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001533 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1535 Py_UCS4 argmaxchar;
1536 Py_ssize_t numbersize = 0;
1537 char *numberresults = NULL;
1538 char *numberresult = NULL;
1539 Py_ssize_t i;
1540 int kind;
1541 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001542
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001543 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001544 /* step 1: count the number of %S/%R/%A/%s format specifications
1545 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1546 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 * result in an array)
1548 * also esimate a upper bound for all the number formats in the string,
1549 * numbers will be formated in step 3 and be keept in a '\0'-separated
1550 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001551 for (f = format; *f; f++) {
1552 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001553 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1555 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1556 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1557 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001560#ifdef HAVE_LONG_LONG
1561 if (longlongflag) {
1562 if (width < MAX_LONG_LONG_CHARS)
1563 width = MAX_LONG_LONG_CHARS;
1564 }
1565 else
1566#endif
1567 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1568 including sign. Decimal takes the most space. This
1569 isn't enough for octal. If a width is specified we
1570 need more (which we allocate later). */
1571 if (width < MAX_LONG_CHARS)
1572 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
1574 /* account for the size + '\0' to separate numbers
1575 inside of the numberresults buffer */
1576 numbersize += (width + 1);
1577 }
1578 }
1579 else if ((unsigned char)*f > 127) {
1580 PyErr_Format(PyExc_ValueError,
1581 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1582 "string, got a non-ASCII byte: 0x%02x",
1583 (unsigned char)*f);
1584 return NULL;
1585 }
1586 }
1587 /* step 2: allocate memory for the results of
1588 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1589 if (callcount) {
1590 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1591 if (!callresults) {
1592 PyErr_NoMemory();
1593 return NULL;
1594 }
1595 callresult = callresults;
1596 }
1597 /* step 2.5: allocate memory for the results of formating numbers */
1598 if (numbersize) {
1599 numberresults = PyObject_Malloc(numbersize);
1600 if (!numberresults) {
1601 PyErr_NoMemory();
1602 goto fail;
1603 }
1604 numberresult = numberresults;
1605 }
1606
1607 /* step 3: format numbers and figure out how large a buffer we need */
1608 for (f = format; *f; f++) {
1609 if (*f == '%') {
1610 const char* p;
1611 int longflag;
1612 int longlongflag;
1613 int size_tflag;
1614 int numprinted;
1615
1616 p = f;
1617 zeropad = (f[1] == '0');
1618 f = parse_format_flags(f, &width, &precision,
1619 &longflag, &longlongflag, &size_tflag);
1620 switch (*f) {
1621 case 'c':
1622 {
1623 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001624 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625 n++;
1626 break;
1627 }
1628 case '%':
1629 n++;
1630 break;
1631 case 'i':
1632 case 'd':
1633 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1634 width, precision, *f);
1635 if (longflag)
1636 numprinted = sprintf(numberresult, fmt,
1637 va_arg(count, long));
1638#ifdef HAVE_LONG_LONG
1639 else if (longlongflag)
1640 numprinted = sprintf(numberresult, fmt,
1641 va_arg(count, PY_LONG_LONG));
1642#endif
1643 else if (size_tflag)
1644 numprinted = sprintf(numberresult, fmt,
1645 va_arg(count, Py_ssize_t));
1646 else
1647 numprinted = sprintf(numberresult, fmt,
1648 va_arg(count, int));
1649 n += numprinted;
1650 /* advance by +1 to skip over the '\0' */
1651 numberresult += (numprinted + 1);
1652 assert(*(numberresult - 1) == '\0');
1653 assert(*(numberresult - 2) != '\0');
1654 assert(numprinted >= 0);
1655 assert(numberresult <= numberresults + numbersize);
1656 break;
1657 case 'u':
1658 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1659 width, precision, 'u');
1660 if (longflag)
1661 numprinted = sprintf(numberresult, fmt,
1662 va_arg(count, unsigned long));
1663#ifdef HAVE_LONG_LONG
1664 else if (longlongflag)
1665 numprinted = sprintf(numberresult, fmt,
1666 va_arg(count, unsigned PY_LONG_LONG));
1667#endif
1668 else if (size_tflag)
1669 numprinted = sprintf(numberresult, fmt,
1670 va_arg(count, size_t));
1671 else
1672 numprinted = sprintf(numberresult, fmt,
1673 va_arg(count, unsigned int));
1674 n += numprinted;
1675 numberresult += (numprinted + 1);
1676 assert(*(numberresult - 1) == '\0');
1677 assert(*(numberresult - 2) != '\0');
1678 assert(numprinted >= 0);
1679 assert(numberresult <= numberresults + numbersize);
1680 break;
1681 case 'x':
1682 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1683 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1684 n += numprinted;
1685 numberresult += (numprinted + 1);
1686 assert(*(numberresult - 1) == '\0');
1687 assert(*(numberresult - 2) != '\0');
1688 assert(numprinted >= 0);
1689 assert(numberresult <= numberresults + numbersize);
1690 break;
1691 case 'p':
1692 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1693 /* %p is ill-defined: ensure leading 0x. */
1694 if (numberresult[1] == 'X')
1695 numberresult[1] = 'x';
1696 else if (numberresult[1] != 'x') {
1697 memmove(numberresult + 2, numberresult,
1698 strlen(numberresult) + 1);
1699 numberresult[0] = '0';
1700 numberresult[1] = 'x';
1701 numprinted += 2;
1702 }
1703 n += numprinted;
1704 numberresult += (numprinted + 1);
1705 assert(*(numberresult - 1) == '\0');
1706 assert(*(numberresult - 2) != '\0');
1707 assert(numprinted >= 0);
1708 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001709 break;
1710 case 's':
1711 {
1712 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001713 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001714 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1715 if (!str)
1716 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717 /* since PyUnicode_DecodeUTF8 returns already flexible
1718 unicode objects, there is no need to call ready on them */
1719 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001720 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001722 /* Remember the str and switch to the next slot */
1723 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 break;
1725 }
1726 case 'U':
1727 {
1728 PyObject *obj = va_arg(count, PyObject *);
1729 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 if (PyUnicode_READY(obj) == -1)
1731 goto fail;
1732 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001733 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001735 break;
1736 }
1737 case 'V':
1738 {
1739 PyObject *obj = va_arg(count, PyObject *);
1740 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001741 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 assert(obj || str);
1743 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001744 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 if (PyUnicode_READY(obj) == -1)
1746 goto fail;
1747 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001748 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001750 *callresult++ = NULL;
1751 }
1752 else {
1753 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1754 if (!str_obj)
1755 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001757 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001759 *callresult++ = str_obj;
1760 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 break;
1762 }
1763 case 'S':
1764 {
1765 PyObject *obj = va_arg(count, PyObject *);
1766 PyObject *str;
1767 assert(obj);
1768 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001770 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001772 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001774 /* Remember the str and switch to the next slot */
1775 *callresult++ = str;
1776 break;
1777 }
1778 case 'R':
1779 {
1780 PyObject *obj = va_arg(count, PyObject *);
1781 PyObject *repr;
1782 assert(obj);
1783 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001785 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001787 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 /* Remember the repr and switch to the next slot */
1790 *callresult++ = repr;
1791 break;
1792 }
1793 case 'A':
1794 {
1795 PyObject *obj = va_arg(count, PyObject *);
1796 PyObject *ascii;
1797 assert(obj);
1798 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001802 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 /* Remember the repr and switch to the next slot */
1805 *callresult++ = ascii;
1806 break;
1807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 default:
1809 /* if we stumble upon an unknown
1810 formatting code, copy the rest of
1811 the format string to the output
1812 string. (we cannot just skip the
1813 code, since there's no way to know
1814 what's in the argument list) */
1815 n += strlen(p);
1816 goto expand;
1817 }
1818 } else
1819 n++;
1820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001822 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001824 we don't have to resize the string.
1825 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 if (!string)
1828 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 kind = PyUnicode_KIND(string);
1830 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001831 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001836 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001837
1838 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1840 /* checking for == because the last argument could be a empty
1841 string, which causes i to point to end, the assert at the end of
1842 the loop */
1843 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001844
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 switch (*f) {
1846 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001847 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 const int ordinal = va_arg(vargs, int);
1849 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001851 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001852 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001854 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001855 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 case 'p':
1857 /* unused, since we already have the result */
1858 if (*f == 'p')
1859 (void) va_arg(vargs, void *);
1860 else
1861 (void) va_arg(vargs, int);
1862 /* extract the result from numberresults and append. */
1863 for (; *numberresult; ++i, ++numberresult)
1864 PyUnicode_WRITE(kind, data, i, *numberresult);
1865 /* skip over the separating '\0' */
1866 assert(*numberresult == '\0');
1867 numberresult++;
1868 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001869 break;
1870 case 's':
1871 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001872 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001874 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 size = PyUnicode_GET_LENGTH(*callresult);
1876 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001877 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1878 *callresult, 0,
1879 size) < 0)
1880 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001882 /* We're done with the unicode()/repr() => forget it */
1883 Py_DECREF(*callresult);
1884 /* switch to next unicode()/repr() result */
1885 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 break;
1887 }
1888 case 'U':
1889 {
1890 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 Py_ssize_t size;
1892 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1893 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001894 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1895 obj, 0,
1896 size) < 0)
1897 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 break;
1900 }
1901 case 'V':
1902 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001905 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 size = PyUnicode_GET_LENGTH(obj);
1908 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001909 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1910 obj, 0,
1911 size) < 0)
1912 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001914 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 size = PyUnicode_GET_LENGTH(*callresult);
1916 assert(PyUnicode_KIND(*callresult) <=
1917 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001918 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1919 *callresult,
1920 0, size) < 0)
1921 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001923 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001924 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001925 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 break;
1927 }
1928 case 'S':
1929 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001930 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001931 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001932 /* unused, since we already have the result */
1933 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001935 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1936 *callresult, 0,
1937 PyUnicode_GET_LENGTH(*callresult)) < 0)
1938 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001940 /* We're done with the unicode()/repr() => forget it */
1941 Py_DECREF(*callresult);
1942 /* switch to next unicode()/repr() result */
1943 ++callresult;
1944 break;
1945 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001946 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001948 break;
1949 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 for (; *p; ++p, ++i)
1951 PyUnicode_WRITE(kind, data, i, *p);
1952 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 goto end;
1954 }
Victor Stinner1205f272010-09-11 00:54:47 +00001955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 else {
1957 assert(i < PyUnicode_GET_LENGTH(string));
1958 PyUnicode_WRITE(kind, data, i++, *f);
1959 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001964 if (callresults)
1965 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 if (numberresults)
1967 PyObject_Free(numberresults);
1968 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001969 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001970 if (callresults) {
1971 PyObject **callresult2 = callresults;
1972 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001973 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001974 ++callresult2;
1975 }
1976 PyObject_Free(callresults);
1977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (numberresults)
1979 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001980 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001981}
1982
Walter Dörwaldd2034312007-05-18 16:29:38 +00001983PyObject *
1984PyUnicode_FromFormat(const char *format, ...)
1985{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001986 PyObject* ret;
1987 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001988
1989#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001991#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001992 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001993#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001994 ret = PyUnicode_FromFormatV(format, vargs);
1995 va_end(vargs);
1996 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001997}
1998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999#ifdef HAVE_WCHAR_H
2000
Victor Stinner5593d8a2010-10-02 11:11:27 +00002001/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2002 convert a Unicode object to a wide character string.
2003
Victor Stinnerd88d9832011-09-06 02:00:05 +02002004 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002005 character) required to convert the unicode object. Ignore size argument.
2006
Victor Stinnerd88d9832011-09-06 02:00:05 +02002007 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002008 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002009 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002010static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002011unicode_aswidechar(PyUnicodeObject *unicode,
2012 wchar_t *w,
2013 Py_ssize_t size)
2014{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002015 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 const wchar_t *wstr;
2017
2018 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2019 if (wstr == NULL)
2020 return -1;
2021
Victor Stinner5593d8a2010-10-02 11:11:27 +00002022 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002023 if (size > res)
2024 size = res + 1;
2025 else
2026 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002028 return res;
2029 }
2030 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002032}
2033
2034Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002035PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002036 wchar_t *w,
2037 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038{
2039 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 PyErr_BadInternalCall();
2041 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002043 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044}
2045
Victor Stinner137c34c2010-09-29 10:25:54 +00002046wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002047PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002048 Py_ssize_t *size)
2049{
2050 wchar_t* buffer;
2051 Py_ssize_t buflen;
2052
2053 if (unicode == NULL) {
2054 PyErr_BadInternalCall();
2055 return NULL;
2056 }
2057
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002058 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 if (buflen == -1)
2060 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002061 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002062 PyErr_NoMemory();
2063 return NULL;
2064 }
2065
Victor Stinner137c34c2010-09-29 10:25:54 +00002066 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2067 if (buffer == NULL) {
2068 PyErr_NoMemory();
2069 return NULL;
2070 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002071 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 if (buflen == -1)
2073 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002074 if (size != NULL)
2075 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002076 return buffer;
2077}
2078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080
Alexander Belopolsky40018472011-02-26 01:02:56 +00002081PyObject *
2082PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002085 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002086 PyErr_SetString(PyExc_ValueError,
2087 "chr() arg not in range(0x110000)");
2088 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002089 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (ordinal < 256)
2092 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 v = PyUnicode_New(1, ordinal);
2095 if (v == NULL)
2096 return NULL;
2097 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2098 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002099}
2100
Alexander Belopolsky40018472011-02-26 01:02:56 +00002101PyObject *
2102PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002104 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002106 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002107 if (PyUnicode_READY(obj))
2108 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 Py_INCREF(obj);
2110 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002111 }
2112 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 /* For a Unicode subtype that's not a Unicode object,
2114 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002115 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002116 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002117 PyErr_Format(PyExc_TypeError,
2118 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002119 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002121}
2122
Alexander Belopolsky40018472011-02-26 01:02:56 +00002123PyObject *
2124PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002125 const char *encoding,
2126 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002127{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002128 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002129 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002130
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002132 PyErr_BadInternalCall();
2133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002135
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002136 /* Decoding bytes objects is the most common case and should be fast */
2137 if (PyBytes_Check(obj)) {
2138 if (PyBytes_GET_SIZE(obj) == 0) {
2139 Py_INCREF(unicode_empty);
2140 v = (PyObject *) unicode_empty;
2141 }
2142 else {
2143 v = PyUnicode_Decode(
2144 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2145 encoding, errors);
2146 }
2147 return v;
2148 }
2149
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002150 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002151 PyErr_SetString(PyExc_TypeError,
2152 "decoding str is not supported");
2153 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002155
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002156 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2157 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2158 PyErr_Format(PyExc_TypeError,
2159 "coercing to str: need bytes, bytearray "
2160 "or buffer-like object, %.80s found",
2161 Py_TYPE(obj)->tp_name);
2162 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002163 }
Tim Petersced69f82003-09-16 20:30:58 +00002164
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002165 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002167 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 }
Tim Petersced69f82003-09-16 20:30:58 +00002169 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002170 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002172 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002173 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174}
2175
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and every '_' becomes '-', so that spellings such
   as UTF_8 are recognised.  lower_len is the size of the lower buffer.

   Returns 1 on success, with lower null-terminated.  Returns 0 if encoding
   does not fit (it is longer than lower_len-1 characters); lower is then
   left holding a partial copy without a terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating null */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2208
Alexander Belopolsky40018472011-02-26 01:02:56 +00002209PyObject *
2210PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002211 Py_ssize_t size,
2212 const char *encoding,
2213 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002214{
2215 PyObject *buffer = NULL, *unicode;
2216 Py_buffer info;
2217 char lower[11]; /* Enough for any encoding shortcut */
2218
2219 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002220 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002221
2222 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002223 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002224 if ((strcmp(lower, "utf-8") == 0) ||
2225 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002226 return PyUnicode_DecodeUTF8(s, size, errors);
2227 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002228 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002229 (strcmp(lower, "iso-8859-1") == 0))
2230 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002231#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002232 else if (strcmp(lower, "mbcs") == 0)
2233 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002234#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002235 else if (strcmp(lower, "ascii") == 0)
2236 return PyUnicode_DecodeASCII(s, size, errors);
2237 else if (strcmp(lower, "utf-16") == 0)
2238 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2239 else if (strcmp(lower, "utf-32") == 0)
2240 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242
2243 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002244 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002245 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002246 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002247 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (buffer == NULL)
2249 goto onError;
2250 unicode = PyCodec_Decode(buffer, encoding, errors);
2251 if (unicode == NULL)
2252 goto onError;
2253 if (!PyUnicode_Check(unicode)) {
2254 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002255 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002256 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 Py_DECREF(unicode);
2258 goto onError;
2259 }
2260 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 if (PyUnicode_READY(unicode)) {
2262 Py_DECREF(unicode);
2263 return NULL;
2264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002266
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 Py_XDECREF(buffer);
2269 return NULL;
2270}
2271
Alexander Belopolsky40018472011-02-26 01:02:56 +00002272PyObject *
2273PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002274 const char *encoding,
2275 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002276{
2277 PyObject *v;
2278
2279 if (!PyUnicode_Check(unicode)) {
2280 PyErr_BadArgument();
2281 goto onError;
2282 }
2283
2284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002286
2287 /* Decode via the codec registry */
2288 v = PyCodec_Decode(unicode, encoding, errors);
2289 if (v == NULL)
2290 goto onError;
2291 return v;
2292
Benjamin Peterson29060642009-01-31 22:14:21 +00002293 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002294 return NULL;
2295}
2296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
2298PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002299 const char *encoding,
2300 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002301{
2302 PyObject *v;
2303
2304 if (!PyUnicode_Check(unicode)) {
2305 PyErr_BadArgument();
2306 goto onError;
2307 }
2308
2309 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002310 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002311
2312 /* Decode via the codec registry */
2313 v = PyCodec_Decode(unicode, encoding, errors);
2314 if (v == NULL)
2315 goto onError;
2316 if (!PyUnicode_Check(v)) {
2317 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002318 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002319 Py_TYPE(v)->tp_name);
2320 Py_DECREF(v);
2321 goto onError;
2322 }
2323 return v;
2324
Benjamin Peterson29060642009-01-31 22:14:21 +00002325 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002326 return NULL;
2327}
2328
Alexander Belopolsky40018472011-02-26 01:02:56 +00002329PyObject *
2330PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002331 Py_ssize_t size,
2332 const char *encoding,
2333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334{
2335 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002336
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 unicode = PyUnicode_FromUnicode(s, size);
2338 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2341 Py_DECREF(unicode);
2342 return v;
2343}
2344
Alexander Belopolsky40018472011-02-26 01:02:56 +00002345PyObject *
2346PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002347 const char *encoding,
2348 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002349{
2350 PyObject *v;
2351
2352 if (!PyUnicode_Check(unicode)) {
2353 PyErr_BadArgument();
2354 goto onError;
2355 }
2356
2357 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002359
2360 /* Encode via the codec registry */
2361 v = PyCodec_Encode(unicode, encoding, errors);
2362 if (v == NULL)
2363 goto onError;
2364 return v;
2365
Benjamin Peterson29060642009-01-31 22:14:21 +00002366 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002367 return NULL;
2368}
2369
Victor Stinnerad158722010-10-27 00:25:46 +00002370PyObject *
2371PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002372{
Victor Stinner99b95382011-07-04 14:23:54 +02002373#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002374 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode),
2376 NULL);
2377#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002379#else
Victor Stinner793b5312011-04-27 00:24:21 +02002380 PyInterpreterState *interp = PyThreadState_GET()->interp;
2381 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2382 cannot use it to encode and decode filenames before it is loaded. Load
2383 the Python codec requires to encode at least its own filename. Use the C
2384 version of the locale codec until the codec registry is initialized and
2385 the Python codec is loaded.
2386
2387 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2388 cannot only rely on it: check also interp->fscodec_initialized for
2389 subinterpreters. */
2390 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002391 return PyUnicode_AsEncodedString(unicode,
2392 Py_FileSystemDefaultEncoding,
2393 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002394 }
2395 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002396 /* locale encoding with surrogateescape */
2397 wchar_t *wchar;
2398 char *bytes;
2399 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002400 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002401
2402 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2403 if (wchar == NULL)
2404 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002405 bytes = _Py_wchar2char(wchar, &error_pos);
2406 if (bytes == NULL) {
2407 if (error_pos != (size_t)-1) {
2408 char *errmsg = strerror(errno);
2409 PyObject *exc = NULL;
2410 if (errmsg == NULL)
2411 errmsg = "Py_wchar2char() failed";
2412 raise_encode_exception(&exc,
2413 "filesystemencoding",
2414 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2415 error_pos, error_pos+1,
2416 errmsg);
2417 Py_XDECREF(exc);
2418 }
2419 else
2420 PyErr_NoMemory();
2421 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002422 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002423 }
2424 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002425
2426 bytes_obj = PyBytes_FromString(bytes);
2427 PyMem_Free(bytes);
2428 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002429 }
Victor Stinnerad158722010-10-27 00:25:46 +00002430#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002431}
2432
Alexander Belopolsky40018472011-02-26 01:02:56 +00002433PyObject *
2434PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002435 const char *encoding,
2436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437{
2438 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002439 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 if (!PyUnicode_Check(unicode)) {
2442 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 }
Fred Drakee4315f52000-05-09 19:53:39 +00002445
Victor Stinner2f283c22011-03-02 01:21:46 +00002446 if (encoding == NULL) {
2447 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002449 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002451 }
Fred Drakee4315f52000-05-09 19:53:39 +00002452
2453 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002454 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002455 if ((strcmp(lower, "utf-8") == 0) ||
2456 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002457 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002458 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002460 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002462 }
Victor Stinner37296e82010-06-10 13:36:23 +00002463 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002464 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002465 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002467#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002468 else if (strcmp(lower, "mbcs") == 0)
2469 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2470 PyUnicode_GET_SIZE(unicode),
2471 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002472#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002473 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476
2477 /* Encode via the codec registry */
2478 v = PyCodec_Encode(unicode, encoding, errors);
2479 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002480 return NULL;
2481
2482 /* The normal path */
2483 if (PyBytes_Check(v))
2484 return v;
2485
2486 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002487 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002488 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002489 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002490
2491 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2492 "encoder %s returned bytearray instead of bytes",
2493 encoding);
2494 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002495 Py_DECREF(v);
2496 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002497 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002498
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002499 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2500 Py_DECREF(v);
2501 return b;
2502 }
2503
2504 PyErr_Format(PyExc_TypeError,
2505 "encoder did not return a bytes object (type=%.400s)",
2506 Py_TYPE(v)->tp_name);
2507 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002508 return NULL;
2509}
2510
Alexander Belopolsky40018472011-02-26 01:02:56 +00002511PyObject *
2512PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002513 const char *encoding,
2514 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002515{
2516 PyObject *v;
2517
2518 if (!PyUnicode_Check(unicode)) {
2519 PyErr_BadArgument();
2520 goto onError;
2521 }
2522
2523 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002525
2526 /* Encode via the codec registry */
2527 v = PyCodec_Encode(unicode, encoding, errors);
2528 if (v == NULL)
2529 goto onError;
2530 if (!PyUnicode_Check(v)) {
2531 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002532 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002533 Py_TYPE(v)->tp_name);
2534 Py_DECREF(v);
2535 goto onError;
2536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002538
Benjamin Peterson29060642009-01-31 22:14:21 +00002539 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 return NULL;
2541}
2542
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002543PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002544PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002545 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002546 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2547}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002548
Christian Heimes5894ba72007-11-04 11:43:14 +00002549PyObject*
2550PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2551{
Victor Stinner99b95382011-07-04 14:23:54 +02002552#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002553 return PyUnicode_DecodeMBCS(s, size, NULL);
2554#elif defined(__APPLE__)
2555 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2556#else
Victor Stinner793b5312011-04-27 00:24:21 +02002557 PyInterpreterState *interp = PyThreadState_GET()->interp;
2558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2559 cannot use it to encode and decode filenames before it is loaded. Load
2560 the Python codec requires to encode at least its own filename. Use the C
2561 version of the locale codec until the codec registry is initialized and
2562 the Python codec is loaded.
2563
2564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2565 cannot only rely on it: check also interp->fscodec_initialized for
2566 subinterpreters. */
2567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002568 return PyUnicode_Decode(s, size,
2569 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002570 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002571 }
2572 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002573 /* locale encoding with surrogateescape */
2574 wchar_t *wchar;
2575 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002576 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002577
2578 if (s[size] != '\0' || size != strlen(s)) {
2579 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2580 return NULL;
2581 }
2582
Victor Stinner168e1172010-10-16 23:16:16 +00002583 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002584 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002585 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002586
Victor Stinner168e1172010-10-16 23:16:16 +00002587 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002588 PyMem_Free(wchar);
2589 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002590 }
Victor Stinnerad158722010-10-27 00:25:46 +00002591#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002592}
2593
Martin v. Löwis011e8422009-05-05 04:43:17 +00002594
2595int
2596PyUnicode_FSConverter(PyObject* arg, void* addr)
2597{
2598 PyObject *output = NULL;
2599 Py_ssize_t size;
2600 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002601 if (arg == NULL) {
2602 Py_DECREF(*(PyObject**)addr);
2603 return 1;
2604 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002605 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002606 output = arg;
2607 Py_INCREF(output);
2608 }
2609 else {
2610 arg = PyUnicode_FromObject(arg);
2611 if (!arg)
2612 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002613 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002614 Py_DECREF(arg);
2615 if (!output)
2616 return 0;
2617 if (!PyBytes_Check(output)) {
2618 Py_DECREF(output);
2619 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2620 return 0;
2621 }
2622 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002623 size = PyBytes_GET_SIZE(output);
2624 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002625 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002626 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002627 Py_DECREF(output);
2628 return 0;
2629 }
2630 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002631 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002632}
2633
2634
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002635int
2636PyUnicode_FSDecoder(PyObject* arg, void* addr)
2637{
2638 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002639 if (arg == NULL) {
2640 Py_DECREF(*(PyObject**)addr);
2641 return 1;
2642 }
2643 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 if (PyUnicode_READY(arg))
2645 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002646 output = arg;
2647 Py_INCREF(output);
2648 }
2649 else {
2650 arg = PyBytes_FromObject(arg);
2651 if (!arg)
2652 return 0;
2653 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2654 PyBytes_GET_SIZE(arg));
2655 Py_DECREF(arg);
2656 if (!output)
2657 return 0;
2658 if (!PyUnicode_Check(output)) {
2659 Py_DECREF(output);
2660 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2661 return 0;
2662 }
2663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2665 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002666 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2667 Py_DECREF(output);
2668 return 0;
2669 }
2670 *(PyObject**)addr = output;
2671 return Py_CLEANUP_SUPPORTED;
2672}
2673
2674
Martin v. Löwis5b222132007-06-10 09:51:05 +00002675char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002677{
Christian Heimesf3863112007-11-22 07:46:41 +00002678 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2680
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002681 if (!PyUnicode_Check(unicode)) {
2682 PyErr_BadArgument();
2683 return NULL;
2684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002686 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002688 if (PyUnicode_UTF8(unicode) == NULL) {
2689 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2691 if (bytes == NULL)
2692 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002693 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2694 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 Py_DECREF(bytes);
2696 return NULL;
2697 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002698 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2699 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 Py_DECREF(bytes);
2701 }
2702
2703 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002704 *psize = PyUnicode_UTF8_LENGTH(unicode);
2705 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002706}
2707
2708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2712}
2713
#if defined(Py_DEBUG)
/* Debug-build statistic: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
2717
2718
2719Py_UNICODE *
2720PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2721{
2722 PyUnicodeObject *u;
2723 const unsigned char *one_byte;
2724#if SIZEOF_WCHAR_T == 4
2725 const Py_UCS2 *two_bytes;
2726#else
2727 const Py_UCS4 *four_bytes;
2728 const Py_UCS4 *ucs4_end;
2729 Py_ssize_t num_surrogates;
2730#endif
2731 wchar_t *w;
2732 wchar_t *wchar_end;
2733
2734 if (!PyUnicode_Check(unicode)) {
2735 PyErr_BadArgument();
2736 return NULL;
2737 }
2738 u = (PyUnicodeObject*)unicode;
2739 if (_PyUnicode_WSTR(u) == NULL) {
2740 /* Non-ASCII compact unicode object */
2741 assert(_PyUnicode_KIND(u) != 0);
2742 assert(PyUnicode_IS_READY(u));
2743
2744#ifdef Py_DEBUG
2745 ++unicode_as_unicode_calls;
2746#endif
2747
2748 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2749#if SIZEOF_WCHAR_T == 2
2750 four_bytes = PyUnicode_4BYTE_DATA(u);
2751 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2752 num_surrogates = 0;
2753
2754 for (; four_bytes < ucs4_end; ++four_bytes) {
2755 if (*four_bytes > 0xFFFF)
2756 ++num_surrogates;
2757 }
2758
2759 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2760 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2761 if (!_PyUnicode_WSTR(u)) {
2762 PyErr_NoMemory();
2763 return NULL;
2764 }
2765 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2766
2767 w = _PyUnicode_WSTR(u);
2768 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2769 four_bytes = PyUnicode_4BYTE_DATA(u);
2770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2771 if (*four_bytes > 0xFFFF) {
2772 /* encode surrogate pair in this case */
2773 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2774 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2775 }
2776 else
2777 *w = *four_bytes;
2778
2779 if (w > wchar_end) {
2780 assert(0 && "Miscalculated string end");
2781 }
2782 }
2783 *w = 0;
2784#else
2785 /* sizeof(wchar_t) == 4 */
2786 Py_FatalError("Impossible unicode object state, wstr and str "
2787 "should share memory already.");
2788 return NULL;
2789#endif
2790 }
2791 else {
2792 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2793 (_PyUnicode_LENGTH(u) + 1));
2794 if (!_PyUnicode_WSTR(u)) {
2795 PyErr_NoMemory();
2796 return NULL;
2797 }
2798 if (!PyUnicode_IS_COMPACT_ASCII(u))
2799 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2800 w = _PyUnicode_WSTR(u);
2801 wchar_end = w + _PyUnicode_LENGTH(u);
2802
2803 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2804 one_byte = PyUnicode_1BYTE_DATA(u);
2805 for (; w < wchar_end; ++one_byte, ++w)
2806 *w = *one_byte;
2807 /* null-terminate the wstr */
2808 *w = 0;
2809 }
2810 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2811#if SIZEOF_WCHAR_T == 4
2812 two_bytes = PyUnicode_2BYTE_DATA(u);
2813 for (; w < wchar_end; ++two_bytes, ++w)
2814 *w = *two_bytes;
2815 /* null-terminate the wstr */
2816 *w = 0;
2817#else
2818 /* sizeof(wchar_t) == 2 */
2819 PyObject_FREE(_PyUnicode_WSTR(u));
2820 _PyUnicode_WSTR(u) = NULL;
2821 Py_FatalError("Impossible unicode object state, wstr "
2822 "and str should share memory already.");
2823 return NULL;
2824#endif
2825 }
2826 else {
2827 assert(0 && "This should never happen.");
2828 }
2829 }
2830 }
2831 if (size != NULL)
2832 *size = PyUnicode_WSTR_LENGTH(u);
2833 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002834}
2835
Alexander Belopolsky40018472011-02-26 01:02:56 +00002836Py_UNICODE *
2837PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840}
2841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843Py_ssize_t
2844PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845{
2846 if (!PyUnicode_Check(unicode)) {
2847 PyErr_BadArgument();
2848 goto onError;
2849 }
2850 return PyUnicode_GET_SIZE(unicode);
2851
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 return -1;
2854}
2855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856Py_ssize_t
2857PyUnicode_GetLength(PyObject *unicode)
2858{
Victor Stinner5a706cf2011-10-02 00:36:53 +02002859 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002860 PyErr_BadArgument();
2861 return -1;
2862 }
2863
2864 return PyUnicode_GET_LENGTH(unicode);
2865}
2866
2867Py_UCS4
2868PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2869{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002870 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2871 PyErr_BadArgument();
2872 return (Py_UCS4)-1;
2873 }
2874 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2875 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002876 return (Py_UCS4)-1;
2877 }
2878 return PyUnicode_READ_CHAR(unicode, index);
2879}
2880
2881int
2882PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2883{
2884 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002885 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 return -1;
2887 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002888 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2889 PyErr_SetString(PyExc_IndexError, "string index out of range");
2890 return -1;
2891 }
2892 if (_PyUnicode_Dirty(unicode))
2893 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2895 index, ch);
2896 return 0;
2897}
2898
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2904
Victor Stinner554f3f02010-06-16 23:33:54 +00002905/* create or adjust a UnicodeDecodeError */
2906static void
2907make_decode_exception(PyObject **exceptionObject,
2908 const char *encoding,
2909 const char *input, Py_ssize_t length,
2910 Py_ssize_t startpos, Py_ssize_t endpos,
2911 const char *reason)
2912{
2913 if (*exceptionObject == NULL) {
2914 *exceptionObject = PyUnicodeDecodeError_Create(
2915 encoding, input, length, startpos, endpos, reason);
2916 }
2917 else {
2918 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2919 goto onError;
2920 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2921 goto onError;
2922 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2923 goto onError;
2924 }
2925 return;
2926
2927onError:
2928 Py_DECREF(*exceptionObject);
2929 *exceptionObject = NULL;
2930}
2931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932/* error handling callback helper:
2933 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002934 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 and adjust various state variables.
2936 return 0 on success, -1 on error
2937*/
2938
Alexander Belopolsky40018472011-02-26 01:02:56 +00002939static int
2940unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002941 const char *encoding, const char *reason,
2942 const char **input, const char **inend, Py_ssize_t *startinpos,
2943 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2944 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002946 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947
2948 PyObject *restuple = NULL;
2949 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002950 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002951 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002952 Py_ssize_t requiredsize;
2953 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002955 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002956 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 int res = -1;
2958
2959 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 *errorHandler = PyCodec_LookupError(errors);
2961 if (*errorHandler == NULL)
2962 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 }
2964
Victor Stinner554f3f02010-06-16 23:33:54 +00002965 make_decode_exception(exceptionObject,
2966 encoding,
2967 *input, *inend - *input,
2968 *startinpos, *endinpos,
2969 reason);
2970 if (*exceptionObject == NULL)
2971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972
2973 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2974 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002977 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 }
2980 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002982
2983 /* Copy back the bytes variables, which might have been modified by the
2984 callback */
2985 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2986 if (!inputobj)
2987 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002988 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002990 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002991 *input = PyBytes_AS_STRING(inputobj);
2992 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002993 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002994 /* we can DECREF safely, as the exception has another reference,
2995 so the object won't go away. */
2996 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003000 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3002 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003003 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004
3005 /* need more space? (at least enough for what we
3006 have+the replacement+the rest of the string (starting
3007 at the new input position), so we won't have to check space
3008 when there are no errors in the rest of the string) */
3009 repptr = PyUnicode_AS_UNICODE(repunicode);
3010 repsize = PyUnicode_GET_SIZE(repunicode);
3011 requiredsize = *outpos + repsize + insize-newpos;
3012 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 if (requiredsize<2*outsize)
3014 requiredsize = 2*outsize;
3015 if (_PyUnicode_Resize(output, requiredsize) < 0)
3016 goto onError;
3017 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 }
3019 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003020 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 Py_UNICODE_COPY(*outptr, repptr, repsize);
3022 *outptr += repsize;
3023 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* we made it! */
3026 res = 0;
3027
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 Py_XDECREF(restuple);
3030 return res;
3031}
3032
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003033/* --- UTF-7 Codec -------------------------------------------------------- */
3034
Antoine Pitrou244651a2009-05-04 18:56:13 +00003035/* See RFC2152 for details. We encode conservatively and decode liberally. */
3036
3037/* Three simple macros defining base-64. */
3038
3039/* Is c a base-64 character? */
3040
/* Non-zero iff c belongs to the base-64 alphabet: '+', '/', a digit or an
   ASCII letter.  Note that the argument is evaluated more than once. */
#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
3046
3047/* given that c is a base-64 character, what is its base-64 value? */
3048
/* Map a base-64 character to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, anything else -> 63.
   The caller is expected to have checked IS_BASE64(c) first, so the
   fall-through value 63 stands for '/'. */
#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)
3054
3055/* What is the base-64 character of the bottom 6 bits of n? */
3056
/* Base-64 character for the low 6 bits of n; higher bits are masked off. */
#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])
3059
3060/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
3061 * decoded as itself. We are permissive on decoding; the only ASCII
3062 * byte not decoding to itself is the + which begins a base64
3063 * string. */
3064
/* Non-zero iff byte c stands for itself when met outside a shift
   sequence: any value up to 127 except the shift-in character '+'. */
#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3067
3068/* The UTF-7 encoder treats ASCII characters differently according to
3069 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3070 * the above). See RFC2152. This array identifies these different
3071 * sets:
3072 * 0 : "Set D"
3073 * alphanumeric and '(),-./:?
3074 * 1 : "Set O"
3075 * !"#$%&*;<=>@[]^_`{|}
3076 * 2 : "whitespace"
3077 * ht nl cr sp
3078 * 3 : special (must be base64 encoded)
3079 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3080 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003081
/* Encoder category of every ASCII code (index = character value):
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base-64 encoded.
   The table is only ever read, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3101
Antoine Pitrou244651a2009-05-04 18:56:13 +00003102/* ENCODE_DIRECT: this character should be encoded as itself. The
3103 * answer depends on whether we are encoding set O as itself, and also
3104 * on whether we are encoding whitespace as itself. RFC2152 makes it
3105 * clear that the answers to these questions vary between
3106 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00003107
/* Non-zero iff character c may be written as itself by the encoder.
   c must be in 1..127 (checked before utf7_category is indexed) and be
   either Set D, or whitespace when directWS is true, or Set O when
   directO is true.  The flag arguments are parenthesised so that
   compound expressions such as `a || b` keep their meaning. */
#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003113
Alexander Belopolsky40018472011-02-26 01:02:56 +00003114PyObject *
3115PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003116 Py_ssize_t size,
3117 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003118{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003119 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3120}
3121
Antoine Pitrou244651a2009-05-04 18:56:13 +00003122/* The decoder. The only state we preserve is our read position,
3123 * i.e. how many characters we have consumed. So if we end in the
3124 * middle of a shift sequence we have to back off the read position
3125 * and the output to the beginning of the sequence, otherwise we lose
3126 * all the shift state (seen bits, number of bits seen, high
3127 * surrogate). */
3128
Alexander Belopolsky40018472011-02-26 01:02:56 +00003129PyObject *
3130PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003131 Py_ssize_t size,
3132 const char *errors,
3133 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003136 Py_ssize_t startinpos;
3137 Py_ssize_t endinpos;
3138 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 const char *e;
3140 PyUnicodeObject *unicode;
3141 Py_UNICODE *p;
3142 const char *errmsg = "";
3143 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003144 Py_UNICODE *shiftOutStart;
3145 unsigned int base64bits = 0;
3146 unsigned long base64buffer = 0;
3147 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148 PyObject *errorHandler = NULL;
3149 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150
3151 unicode = _PyUnicode_New(size);
3152 if (!unicode)
3153 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003154 if (size == 0) {
3155 if (consumed)
3156 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003157 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003158 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003161 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003162 e = s + size;
3163
3164 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003167 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003168
Antoine Pitrou244651a2009-05-04 18:56:13 +00003169 if (inShift) { /* in a base-64 section */
3170 if (IS_BASE64(ch)) { /* consume a base-64 character */
3171 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3172 base64bits += 6;
3173 s++;
3174 if (base64bits >= 16) {
3175 /* we have enough bits for a UTF-16 value */
3176 Py_UNICODE outCh = (Py_UNICODE)
3177 (base64buffer >> (base64bits-16));
3178 base64bits -= 16;
3179 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3180 if (surrogate) {
3181 /* expecting a second surrogate */
3182 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3183#ifdef Py_UNICODE_WIDE
3184 *p++ = (((surrogate & 0x3FF)<<10)
3185 | (outCh & 0x3FF)) + 0x10000;
3186#else
3187 *p++ = surrogate;
3188 *p++ = outCh;
3189#endif
3190 surrogate = 0;
3191 }
3192 else {
3193 surrogate = 0;
3194 errmsg = "second surrogate missing";
3195 goto utf7Error;
3196 }
3197 }
3198 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3199 /* first surrogate */
3200 surrogate = outCh;
3201 }
3202 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3203 errmsg = "unexpected second surrogate";
3204 goto utf7Error;
3205 }
3206 else {
3207 *p++ = outCh;
3208 }
3209 }
3210 }
3211 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003212 inShift = 0;
3213 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003214 if (surrogate) {
3215 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003216 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003218 if (base64bits > 0) { /* left-over bits */
3219 if (base64bits >= 6) {
3220 /* We've seen at least one base-64 character */
3221 errmsg = "partial character in shift sequence";
3222 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003224 else {
3225 /* Some bits remain; they should be zero */
3226 if (base64buffer != 0) {
3227 errmsg = "non-zero padding bits in shift sequence";
3228 goto utf7Error;
3229 }
3230 }
3231 }
3232 if (ch != '-') {
3233 /* '-' is absorbed; other terminating
3234 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 *p++ = ch;
3236 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003237 }
3238 }
3239 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003241 s++; /* consume '+' */
3242 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243 s++;
3244 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003245 }
3246 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003248 shiftOutStart = p;
3249 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003250 }
3251 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003252 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003253 *p++ = ch;
3254 s++;
3255 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003256 else {
3257 startinpos = s-starts;
3258 s++;
3259 errmsg = "unexpected special character";
3260 goto utf7Error;
3261 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003263utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 outpos = p-PyUnicode_AS_UNICODE(unicode);
3265 endinpos = s-starts;
3266 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 errors, &errorHandler,
3268 "utf7", errmsg,
3269 &starts, &e, &startinpos, &endinpos, &exc, &s,
3270 &unicode, &outpos, &p))
3271 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272 }
3273
Antoine Pitrou244651a2009-05-04 18:56:13 +00003274 /* end of string */
3275
3276 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3277 /* if we're in an inconsistent state, that's an error */
3278 if (surrogate ||
3279 (base64bits >= 6) ||
3280 (base64bits > 0 && base64buffer != 0)) {
3281 outpos = p-PyUnicode_AS_UNICODE(unicode);
3282 endinpos = size;
3283 if (unicode_decode_call_errorhandler(
3284 errors, &errorHandler,
3285 "utf7", "unterminated shift sequence",
3286 &starts, &e, &startinpos, &endinpos, &exc, &s,
3287 &unicode, &outpos, &p))
3288 goto onError;
3289 if (s < e)
3290 goto restart;
3291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003293
3294 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003295 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003296 if (inShift) {
3297 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003298 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 }
3300 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003301 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003302 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003303 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003304
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003305 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003306 goto onError;
3307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 Py_XDECREF(errorHandler);
3309 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 if (PyUnicode_READY(unicode) == -1) {
3311 Py_DECREF(unicode);
3312 return NULL;
3313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 return (PyObject *)unicode;
3315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 Py_XDECREF(errorHandler);
3318 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003319 Py_DECREF(unicode);
3320 return NULL;
3321}
3322
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 Py_ssize_t size,
3327 int base64SetO,
3328 int base64WhiteSpace,
3329 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003330{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003331 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003332 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003333 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003334 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003335 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003336 unsigned int base64bits = 0;
3337 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338 char * out;
3339 char * start;
3340
3341 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003343
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003344 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003345 return PyErr_NoMemory();
3346
Antoine Pitrou244651a2009-05-04 18:56:13 +00003347 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003348 if (v == NULL)
3349 return NULL;
3350
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003351 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003352 for (;i < size; ++i) {
3353 Py_UNICODE ch = s[i];
3354
Antoine Pitrou244651a2009-05-04 18:56:13 +00003355 if (inShift) {
3356 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3357 /* shifting out */
3358 if (base64bits) { /* output remaining bits */
3359 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3360 base64buffer = 0;
3361 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003362 }
3363 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003364 /* Characters not in the BASE64 set implicitly unshift the sequence
3365 so no '-' is required, except if the character is itself a '-' */
3366 if (IS_BASE64(ch) || ch == '-') {
3367 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003368 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003369 *out++ = (char) ch;
3370 }
3371 else {
3372 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003373 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003374 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003375 else { /* not in a shift sequence */
3376 if (ch == '+') {
3377 *out++ = '+';
3378 *out++ = '-';
3379 }
3380 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3381 *out++ = (char) ch;
3382 }
3383 else {
3384 *out++ = '+';
3385 inShift = 1;
3386 goto encode_char;
3387 }
3388 }
3389 continue;
3390encode_char:
3391#ifdef Py_UNICODE_WIDE
3392 if (ch >= 0x10000) {
3393 /* code first surrogate */
3394 base64bits += 16;
3395 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3396 while (base64bits >= 6) {
3397 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3398 base64bits -= 6;
3399 }
3400 /* prepare second surrogate */
3401 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3402 }
3403#endif
3404 base64bits += 16;
3405 base64buffer = (base64buffer << 16) | ch;
3406 while (base64bits >= 6) {
3407 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3408 base64bits -= 6;
3409 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003410 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003411 if (base64bits)
3412 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3413 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003414 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003415 if (_PyBytes_Resize(&v, out - start) < 0)
3416 return NULL;
3417 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003418}
3419
Antoine Pitrou244651a2009-05-04 18:56:13 +00003420#undef IS_BASE64
3421#undef FROM_BASE64
3422#undef TO_BASE64
3423#undef DECODE_DIRECT
3424#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003425
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426/* --- UTF-8 Codec -------------------------------------------------------- */
3427
/* Map a UTF-8 encoded prefix byte to the total length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3449
Alexander Belopolsky40018472011-02-26 01:02:56 +00003450PyObject *
3451PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003452 Py_ssize_t size,
3453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
Walter Dörwald69652032004-09-07 20:24:22 +00003455 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3456}
3457
Antoine Pitrouab868312009-01-10 15:40:25 +00003458/* Mask to check or force alignment of a pointer to C 'long' boundaries */
/* Fully parenthesised so the expansion behaves as a single operand
   wherever it is used (e.g. as the operand of sizeof). */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
3460
3461/* Mask to quickly check whether a C 'long' contains a
3462 non-ASCII, UTF8-encoded char. */
3463#if (SIZEOF_LONG == 8)
3464# define ASCII_CHAR_MASK 0x8080808080808080L
3465#elif (SIZEOF_LONG == 4)
3466# define ASCII_CHAR_MASK 0x80808080L
3467#else
3468# error C 'long' size should be either 4 or 8!
3469#endif
3470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471/* Scans a UTF-8 string and returns the maximum character to be expected,
3472 the size of the decoded unicode string and if any major errors were
3473 encountered.
3474
3475 This function does check basic UTF-8 sanity, it does however NOT CHECK
3476 if the string contains surrogates, and if all continuation bytes are
3477 within the correct ranges, these checks are performed in
3478 PyUnicode_DecodeUTF8Stateful.
3479
3480 If it sets has_errors to 1, it means the value of unicode_size and max_char
3481 will be bogus and you should not rely on useful information in them.
3482 */
3483static Py_UCS4
3484utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3485 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3486 int *has_errors)
3487{
3488 Py_ssize_t n;
3489 Py_ssize_t char_count = 0;
3490 Py_UCS4 max_char = 127, new_max;
3491 Py_UCS4 upper_bound;
3492 const unsigned char *p = (const unsigned char *)s;
3493 const unsigned char *end = p + string_size;
3494 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3495 int err = 0;
3496
3497 for (; p < end && !err; ++p, ++char_count) {
3498 /* Only check value if it's not a ASCII char... */
3499 if (*p < 0x80) {
3500 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3501 an explanation. */
3502 if (!((size_t) p & LONG_PTR_MASK)) {
3503 /* Help register allocation */
3504 register const unsigned char *_p = p;
3505 while (_p < aligned_end) {
3506 unsigned long value = *(unsigned long *) _p;
3507 if (value & ASCII_CHAR_MASK)
3508 break;
3509 _p += SIZEOF_LONG;
3510 char_count += SIZEOF_LONG;
3511 }
3512 p = _p;
3513 if (p == end)
3514 break;
3515 }
3516 }
3517 if (*p >= 0x80) {
3518 n = utf8_code_length[*p];
3519 new_max = max_char;
3520 switch (n) {
3521 /* invalid start byte */
3522 case 0:
3523 err = 1;
3524 break;
3525 case 2:
3526 /* Code points between 0x00FF and 0x07FF inclusive.
3527 Approximate the upper bound of the code point,
3528 if this flips over 255 we can be sure it will be more
3529 than 255 and the string will need 2 bytes per code coint,
3530 if it stays under or equal to 255, we can be sure 1 byte
3531 is enough.
3532 ((*p & 0b00011111) << 6) | 0b00111111 */
3533 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3534 if (max_char < upper_bound)
3535 new_max = upper_bound;
3536 /* Ensure we track at least that we left ASCII space. */
3537 if (new_max < 128)
3538 new_max = 128;
3539 break;
3540 case 3:
3541 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3542 always > 255 and <= 65535 and will always need 2 bytes. */
3543 if (max_char < 65535)
3544 new_max = 65535;
3545 break;
3546 case 4:
3547 /* Code point will be above 0xFFFF for sure in this case. */
3548 new_max = 65537;
3549 break;
3550 /* Internal error, this should be caught by the first if */
3551 case 1:
3552 default:
3553 assert(0 && "Impossible case in utf8_max_char_and_size");
3554 err = 1;
3555 }
3556 /* Instead of number of overall bytes for this code point,
3557 n containts the number of following bytes: */
3558 --n;
3559 /* Check if the follow up chars are all valid continuation bytes */
3560 if (n >= 1) {
3561 const unsigned char *cont;
3562 if ((p + n) >= end) {
3563 if (consumed == 0)
3564 /* incomplete data, non-incremental decoding */
3565 err = 1;
3566 break;
3567 }
3568 for (cont = p + 1; cont < (p + n); ++cont) {
3569 if ((*cont & 0xc0) != 0x80) {
3570 err = 1;
3571 break;
3572 }
3573 }
3574 p += n;
3575 }
3576 else
3577 err = 1;
3578 max_char = new_max;
3579 }
3580 }
3581
3582 if (unicode_size)
3583 *unicode_size = char_count;
3584 if (has_errors)
3585 *has_errors = err;
3586 return max_char;
3587}
3588
3589/* Similar to PyUnicode_WRITE but can also write into wstr field
3590 of the legacy unicode representation */
/* Store `value` at position `index` of `buf`, treating `buf` as an
   array whose element type is selected by `kind`: Py_UNICODE for
   PyUnicode_WCHAR_KIND, unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2
   for PyUnicode_2BYTE_KIND and Py_UCS4 for any other kind.  The value
   is converted to the element type with a cast. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((int)(kind)) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3603
Alexander Belopolsky40018472011-02-26 01:02:56 +00003604PyObject *
3605PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 Py_ssize_t size,
3607 const char *errors,
3608 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003612 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 Py_ssize_t startinpos;
3614 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003615 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003617 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 PyObject *errorHandler = NULL;
3619 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003620 Py_UCS4 maxchar = 0;
3621 Py_ssize_t unicode_size;
3622 Py_ssize_t i;
3623 int kind;
3624 void *data;
3625 int has_errors;
3626 Py_UNICODE *error_outptr;
3627#if SIZEOF_WCHAR_T == 2
3628 Py_ssize_t wchar_offset = 0;
3629#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630
Walter Dörwald69652032004-09-07 20:24:22 +00003631 if (size == 0) {
3632 if (consumed)
3633 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003634 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003636 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3637 consumed, &has_errors);
3638 if (has_errors) {
3639 unicode = _PyUnicode_New(size);
3640 if (!unicode)
3641 return NULL;
3642 kind = PyUnicode_WCHAR_KIND;
3643 data = PyUnicode_AS_UNICODE(unicode);
3644 assert(data != NULL);
3645 }
3646 else {
3647 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3648 if (!unicode)
3649 return NULL;
3650 /* When the string is ASCII only, just use memcpy and return.
3651 unicode_size may be != size if there is an incomplete UTF-8
3652 sequence at the end of the ASCII block. */
3653 if (maxchar < 128 && size == unicode_size) {
3654 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3655 return (PyObject *)unicode;
3656 }
3657 kind = PyUnicode_KIND(unicode);
3658 data = PyUnicode_DATA(unicode);
3659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003663 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664
3665 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003666 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667
3668 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003669 /* Fast path for runs of ASCII characters. Given that common UTF-8
3670 input will consist of an overwhelming majority of ASCII
3671 characters, we try to optimize for this case by checking
3672 as many characters as a C 'long' can contain.
3673 First, check if we can do an aligned read, as most CPUs have
3674 a penalty for unaligned reads.
3675 */
3676 if (!((size_t) s & LONG_PTR_MASK)) {
3677 /* Help register allocation */
3678 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003680 while (_s < aligned_end) {
3681 /* Read a whole long at a time (either 4 or 8 bytes),
3682 and do a fast unrolled copy if it only contains ASCII
3683 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684 unsigned long value = *(unsigned long *) _s;
3685 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3688 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3689 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3690 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003691#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3693 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3694 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3695 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003696#endif
3697 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003699 }
3700 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003702 if (s == e)
3703 break;
3704 ch = (unsigned char)*s;
3705 }
3706 }
3707
3708 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 s++;
3711 continue;
3712 }
3713
3714 n = utf8_code_length[ch];
3715
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003716 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 if (consumed)
3718 break;
3719 else {
3720 errmsg = "unexpected end of data";
3721 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003722 endinpos = startinpos+1;
3723 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3724 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 goto utf8Error;
3726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728
3729 switch (n) {
3730
3731 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003732 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 startinpos = s-starts;
3734 endinpos = startinpos+1;
3735 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736
3737 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003738 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003739 startinpos = s-starts;
3740 endinpos = startinpos+1;
3741 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742
3743 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003744 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003745 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003747 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 goto utf8Error;
3749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003751 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 break;
3754
3755 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003756 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3757 will result in surrogates in range d800-dfff. Surrogates are
3758 not valid UTF-8 so they are rejected.
3759 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3760 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003761 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003762 (s[2] & 0xc0) != 0x80 ||
3763 ((unsigned char)s[0] == 0xE0 &&
3764 (unsigned char)s[1] < 0xA0) ||
3765 ((unsigned char)s[0] == 0xED &&
3766 (unsigned char)s[1] > 0x9F)) {
3767 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003769 endinpos = startinpos + 1;
3770
3771 /* if s[1] first two bits are 1 and 0, then the invalid
3772 continuation byte is s[2], so increment endinpos by 1,
3773 if not, s[1] is invalid and endinpos doesn't need to
3774 be incremented. */
3775 if ((s[1] & 0xC0) == 0x80)
3776 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 goto utf8Error;
3778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003780 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003782 break;
3783
3784 case 4:
3785 if ((s[1] & 0xc0) != 0x80 ||
3786 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003787 (s[3] & 0xc0) != 0x80 ||
3788 ((unsigned char)s[0] == 0xF0 &&
3789 (unsigned char)s[1] < 0x90) ||
3790 ((unsigned char)s[0] == 0xF4 &&
3791 (unsigned char)s[1] > 0x8F)) {
3792 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003794 endinpos = startinpos + 1;
3795 if ((s[1] & 0xC0) == 0x80) {
3796 endinpos++;
3797 if ((s[2] & 0xC0) == 0x80)
3798 endinpos++;
3799 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 goto utf8Error;
3801 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003802 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003803 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3804 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 /* If the string is flexible or we have native UCS-4, write
3807 directly.. */
3808 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3809 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 else {
3812 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 /* translate from 10000..10FFFF to 0..FFFF */
3815 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 /* high surrogate = top 10 bits added to D800 */
3818 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3819 (Py_UNICODE)(0xD800 + (ch >> 10)));
3820
3821 /* low surrogate = bottom 10 bits added to DC00 */
3822 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3823 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3824 }
3825#if SIZEOF_WCHAR_T == 2
3826 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003827#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 }
3830 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003832
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 /* If this is not yet a resizable string, make it one.. */
3835 if (kind != PyUnicode_WCHAR_KIND) {
3836 const Py_UNICODE *u;
3837 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3838 if (!new_unicode)
3839 goto onError;
3840 u = PyUnicode_AsUnicode((PyObject *)unicode);
3841 if (!u)
3842 goto onError;
3843#if SIZEOF_WCHAR_T == 2
3844 i += wchar_offset;
3845#endif
3846 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3847 Py_DECREF(unicode);
3848 unicode = new_unicode;
3849 kind = 0;
3850 data = PyUnicode_AS_UNICODE(new_unicode);
3851 assert(data != NULL);
3852 }
3853 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003854 if (unicode_decode_call_errorhandler(
3855 errors, &errorHandler,
3856 "utf8", errmsg,
3857 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 /* Update data because unicode_decode_call_errorhandler might have
3861 re-created or resized the unicode object. */
3862 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 /* Ensure the unicode_size calculation above was correct: */
3866 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3867
Walter Dörwald69652032004-09-07 20:24:22 +00003868 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 /* Adjust length and ready string when it contained errors and
3872 is of the old resizable kind. */
3873 if (kind == PyUnicode_WCHAR_KIND) {
3874 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3875 PyUnicode_READY(unicode) == -1)
3876 goto onError;
3877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 Py_XDECREF(errorHandler);
3880 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 if (PyUnicode_READY(unicode) == -1) {
3882 Py_DECREF(unicode);
3883 return NULL;
3884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 return (PyObject *)unicode;
3886
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 Py_XDECREF(errorHandler);
3889 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 Py_DECREF(unicode);
3891 return NULL;
3892}
3893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003895
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s     -- UTF-8 encoded input bytes; only the first `size` bytes are read
   size  -- number of bytes to decode

   Returns a NUL-terminated wchar_t buffer allocated with PyMem_Malloc(),
   or NULL on failure.  PyErr_NoMemory() is called only when the buffer
   size computation would overflow; a failing PyMem_Malloc() returns NULL
   without setting an exception.

   Every byte that does not begin a well-formed UTF-8 sequence is emitted
   as the single code unit 0xDC00 + byte, and decoding resumes at the
   following byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch keeps the lead byte until a sequence is fully validated, so
           the surrogateescape label below always sees the raw byte. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths rather than computing s + n: forming a pointer
           more than one element past the end of the input is undefined
           behaviour. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Escape exactly one undecodable byte and retry from the next. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011/* Primary internal function which creates utf8 encoded bytes objects.
4012
4013 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004014 and allocate exactly as much space needed at the end. Else allocate the
4015 maximum possible needed (4 result bytes per Unicode character), and return
4016 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004017*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004018PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
Tim Peters602f7402002-04-27 18:03:26 +00004021#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004022
Guido van Rossum98297ee2007-11-06 21:34:58 +00004023 Py_ssize_t i; /* index into s of next input byte */
4024 PyObject *result; /* result string object */
4025 char *p; /* next free byte in output buffer */
4026 Py_ssize_t nallocated; /* number of result bytes allocated */
4027 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004028 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004029 PyObject *errorHandler = NULL;
4030 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 int kind;
4032 void *data;
4033 Py_ssize_t size;
4034 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4035#if SIZEOF_WCHAR_T == 2
4036 Py_ssize_t wchar_offset = 0;
4037#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 if (!PyUnicode_Check(unicode)) {
4040 PyErr_BadArgument();
4041 return NULL;
4042 }
4043
4044 if (PyUnicode_READY(unicode) == -1)
4045 return NULL;
4046
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004047 if (PyUnicode_UTF8(unicode))
4048 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4049 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050
4051 kind = PyUnicode_KIND(unicode);
4052 data = PyUnicode_DATA(unicode);
4053 size = PyUnicode_GET_LENGTH(unicode);
4054
Tim Peters602f7402002-04-27 18:03:26 +00004055 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
Tim Peters602f7402002-04-27 18:03:26 +00004057 if (size <= MAX_SHORT_UNICHARS) {
4058 /* Write into the stack buffer; nallocated can't overflow.
4059 * At the end, we'll allocate exactly as much heap space as it
4060 * turns out we need.
4061 */
4062 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004063 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004064 p = stackbuf;
4065 }
4066 else {
4067 /* Overallocate on the heap, and give the excess back at the end. */
4068 nallocated = size * 4;
4069 if (nallocated / 4 != size) /* overflow! */
4070 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004071 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004072 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004073 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004074 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004075 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004076
Tim Peters602f7402002-04-27 18:03:26 +00004077 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004079
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004080 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004081 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004083
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004085 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004086 *p++ = (char)(0xc0 | (ch >> 6));
4087 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004088 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 Py_ssize_t newpos;
4090 PyObject *rep;
4091 Py_ssize_t repsize, k, startpos;
4092 startpos = i-1;
4093#if SIZEOF_WCHAR_T == 2
4094 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004095#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 rep = unicode_encode_call_errorhandler(
4097 errors, &errorHandler, "utf-8", "surrogates not allowed",
4098 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4099 &exc, startpos, startpos+1, &newpos);
4100 if (!rep)
4101 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 if (PyBytes_Check(rep))
4104 repsize = PyBytes_GET_SIZE(rep);
4105 else
4106 repsize = PyUnicode_GET_SIZE(rep);
4107
4108 if (repsize > 4) {
4109 Py_ssize_t offset;
4110
4111 if (result == NULL)
4112 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004113 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4117 /* integer overflow */
4118 PyErr_NoMemory();
4119 goto error;
4120 }
4121 nallocated += repsize - 4;
4122 if (result != NULL) {
4123 if (_PyBytes_Resize(&result, nallocated) < 0)
4124 goto error;
4125 } else {
4126 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004127 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 goto error;
4129 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4130 }
4131 p = PyBytes_AS_STRING(result) + offset;
4132 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 if (PyBytes_Check(rep)) {
4135 char *prep = PyBytes_AS_STRING(rep);
4136 for(k = repsize; k > 0; k--)
4137 *p++ = *prep++;
4138 } else /* rep is unicode */ {
4139 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4140 Py_UNICODE c;
4141
4142 for(k=0; k<repsize; k++) {
4143 c = prep[k];
4144 if (0x80 <= c) {
4145 raise_encode_exception(&exc, "utf-8",
4146 PyUnicode_AS_UNICODE(unicode),
4147 size, i-1, i,
4148 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004149 goto error;
4150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004152 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004155 } else if (ch < 0x10000) {
4156 *p++ = (char)(0xe0 | (ch >> 12));
4157 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4158 *p++ = (char)(0x80 | (ch & 0x3f));
4159 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004160 /* Encode UCS4 Unicode ordinals */
4161 *p++ = (char)(0xf0 | (ch >> 18));
4162 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4163 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4164 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165#if SIZEOF_WCHAR_T == 2
4166 wchar_offset++;
4167#endif
Tim Peters602f7402002-04-27 18:03:26 +00004168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004170
Guido van Rossum98297ee2007-11-06 21:34:58 +00004171 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004172 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004173 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004174 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004175 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004176 }
4177 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004178 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004179 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004180 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004181 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004184 Py_XDECREF(errorHandler);
4185 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004186 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004187 error:
4188 Py_XDECREF(errorHandler);
4189 Py_XDECREF(exc);
4190 Py_XDECREF(result);
4191 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004192
Tim Peters602f7402002-04-27 18:03:26 +00004193#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194}
4195
Alexander Belopolsky40018472011-02-26 01:02:56 +00004196PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4198 Py_ssize_t size,
4199 const char *errors)
4200{
4201 PyObject *v, *unicode;
4202
4203 unicode = PyUnicode_FromUnicode(s, size);
4204 if (unicode == NULL)
4205 return NULL;
4206 v = _PyUnicode_AsUTF8String(unicode, errors);
4207 Py_DECREF(unicode);
4208 return v;
4209}
4210
4211PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004212PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215}
4216
Walter Dörwald41980ca2007-08-16 21:55:45 +00004217/* --- UTF-32 Codec ------------------------------------------------------- */
4218
4219PyObject *
4220PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 Py_ssize_t size,
4222 const char *errors,
4223 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004224{
4225 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4226}
4227
4228PyObject *
4229PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 Py_ssize_t size,
4231 const char *errors,
4232 int *byteorder,
4233 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004234{
4235 const char *starts = s;
4236 Py_ssize_t startinpos;
4237 Py_ssize_t endinpos;
4238 Py_ssize_t outpos;
4239 PyUnicodeObject *unicode;
4240 Py_UNICODE *p;
4241#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004242 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004243 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004244#else
4245 const int pairs = 0;
4246#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004247 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004248 int bo = 0; /* assume native ordering by default */
4249 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004250 /* Offsets from q for retrieving bytes in the right order. */
4251#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4252 int iorder[] = {0, 1, 2, 3};
4253#else
4254 int iorder[] = {3, 2, 1, 0};
4255#endif
4256 PyObject *errorHandler = NULL;
4257 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004258
Walter Dörwald41980ca2007-08-16 21:55:45 +00004259 q = (unsigned char *)s;
4260 e = q + size;
4261
4262 if (byteorder)
4263 bo = *byteorder;
4264
4265 /* Check for BOM marks (U+FEFF) in the input and adjust current
4266 byte order setting accordingly. In native mode, the leading BOM
4267 mark is skipped, in all other modes, it is copied to the output
4268 stream as-is (giving a ZWNBSP character). */
4269 if (bo == 0) {
4270 if (size >= 4) {
4271 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 if (bom == 0x0000FEFF) {
4275 q += 4;
4276 bo = -1;
4277 }
4278 else if (bom == 0xFFFE0000) {
4279 q += 4;
4280 bo = 1;
4281 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004282#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 if (bom == 0x0000FEFF) {
4284 q += 4;
4285 bo = 1;
4286 }
4287 else if (bom == 0xFFFE0000) {
4288 q += 4;
4289 bo = -1;
4290 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004293 }
4294
4295 if (bo == -1) {
4296 /* force LE */
4297 iorder[0] = 0;
4298 iorder[1] = 1;
4299 iorder[2] = 2;
4300 iorder[3] = 3;
4301 }
4302 else if (bo == 1) {
4303 /* force BE */
4304 iorder[0] = 3;
4305 iorder[1] = 2;
4306 iorder[2] = 1;
4307 iorder[3] = 0;
4308 }
4309
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004310 /* On narrow builds we split characters outside the BMP into two
4311 codepoints => count how much extra space we need. */
4312#ifndef Py_UNICODE_WIDE
4313 for (qq = q; qq < e; qq += 4)
4314 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4315 pairs++;
4316#endif
4317
4318 /* This might be one to much, because of a BOM */
4319 unicode = _PyUnicode_New((size+3)/4+pairs);
4320 if (!unicode)
4321 return NULL;
4322 if (size == 0)
4323 return (PyObject *)unicode;
4324
4325 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004326 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004327
Walter Dörwald41980ca2007-08-16 21:55:45 +00004328 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 Py_UCS4 ch;
4330 /* remaining bytes at the end? (size should be divisible by 4) */
4331 if (e-q<4) {
4332 if (consumed)
4333 break;
4334 errmsg = "truncated data";
4335 startinpos = ((const char *)q)-starts;
4336 endinpos = ((const char *)e)-starts;
4337 goto utf32Error;
4338 /* The remaining input chars are ignored if the callback
4339 chooses to skip the input */
4340 }
4341 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4342 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 if (ch >= 0x110000)
4345 {
4346 errmsg = "codepoint not in range(0x110000)";
4347 startinpos = ((const char *)q)-starts;
4348 endinpos = startinpos+4;
4349 goto utf32Error;
4350 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004351#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 if (ch >= 0x10000)
4353 {
4354 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4355 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4356 }
4357 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 *p++ = ch;
4360 q += 4;
4361 continue;
4362 utf32Error:
4363 outpos = p-PyUnicode_AS_UNICODE(unicode);
4364 if (unicode_decode_call_errorhandler(
4365 errors, &errorHandler,
4366 "utf32", errmsg,
4367 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4368 &unicode, &outpos, &p))
4369 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370 }
4371
4372 if (byteorder)
4373 *byteorder = bo;
4374
4375 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004377
4378 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004380 goto onError;
4381
4382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004384 if (PyUnicode_READY(unicode) == -1) {
4385 Py_DECREF(unicode);
4386 return NULL;
4387 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004388 return (PyObject *)unicode;
4389
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004391 Py_DECREF(unicode);
4392 Py_XDECREF(errorHandler);
4393 Py_XDECREF(exc);
4394 return NULL;
4395}
4396
4397PyObject *
4398PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 Py_ssize_t size,
4400 const char *errors,
4401 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004402{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004403 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004404 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004405 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004406#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004407 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004408#else
4409 const int pairs = 0;
4410#endif
4411 /* Offsets from p for storing byte pairs in the right order. */
4412#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4413 int iorder[] = {0, 1, 2, 3};
4414#else
4415 int iorder[] = {3, 2, 1, 0};
4416#endif
4417
Benjamin Peterson29060642009-01-31 22:14:21 +00004418#define STORECHAR(CH) \
4419 do { \
4420 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4421 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4422 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4423 p[iorder[0]] = (CH) & 0xff; \
4424 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004425 } while(0)
4426
4427 /* In narrow builds we can output surrogate pairs as one codepoint,
4428 so we need less space. */
4429#ifndef Py_UNICODE_WIDE
4430 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4432 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4433 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004434#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004435 nsize = (size - pairs + (byteorder == 0));
4436 bytesize = nsize * 4;
4437 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004439 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004440 if (v == NULL)
4441 return NULL;
4442
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004443 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004444 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004446 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004447 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004448
4449 if (byteorder == -1) {
4450 /* force LE */
4451 iorder[0] = 0;
4452 iorder[1] = 1;
4453 iorder[2] = 2;
4454 iorder[3] = 3;
4455 }
4456 else if (byteorder == 1) {
4457 /* force BE */
4458 iorder[0] = 3;
4459 iorder[1] = 2;
4460 iorder[2] = 1;
4461 iorder[3] = 0;
4462 }
4463
4464 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004466#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4468 Py_UCS4 ch2 = *s;
4469 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4470 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4471 s++;
4472 size--;
4473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004474 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004475#endif
4476 STORECHAR(ch);
4477 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004478
4479 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004480 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004481#undef STORECHAR
4482}
4483
Alexander Belopolsky40018472011-02-26 01:02:56 +00004484PyObject *
4485PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004486{
4487 if (!PyUnicode_Check(unicode)) {
4488 PyErr_BadArgument();
4489 return NULL;
4490 }
4491 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 PyUnicode_GET_SIZE(unicode),
4493 NULL,
4494 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004495}
4496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497/* --- UTF-16 Codec ------------------------------------------------------- */
4498
Tim Peters772747b2001-08-09 22:21:55 +00004499PyObject *
4500PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 Py_ssize_t size,
4502 const char *errors,
4503 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504{
Walter Dörwald69652032004-09-07 20:24:22 +00004505 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4506}
4507
Antoine Pitrouab868312009-01-10 15:40:25 +00004508/* Two masks for fast checking of whether a C 'long' may contain
4509 UTF16-encoded surrogate characters. This is an efficient heuristic,
4510 assuming that non-surrogate characters with a code point >= 0x8000 are
4511 rare in most input.
4512 FAST_CHAR_MASK is used when the input is in native byte ordering,
4513 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004514*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004515#if (SIZEOF_LONG == 8)
4516# define FAST_CHAR_MASK 0x8000800080008000L
4517# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4518#elif (SIZEOF_LONG == 4)
4519# define FAST_CHAR_MASK 0x80008000L
4520# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4521#else
4522# error C 'long' size should be either 4 or 8!
4523#endif
4524
Walter Dörwald69652032004-09-07 20:24:22 +00004525PyObject *
4526PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 Py_ssize_t size,
4528 const char *errors,
4529 int *byteorder,
4530 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t startinpos;
4534 Py_ssize_t endinpos;
4535 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 PyUnicodeObject *unicode;
4537 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004538 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004539 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004540 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004541 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004542 /* Offsets from q for retrieving byte pairs in the right order. */
4543#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4544 int ihi = 1, ilo = 0;
4545#else
4546 int ihi = 0, ilo = 1;
4547#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
4551 /* Note: size will always be longer than the resulting Unicode
4552 character count */
4553 unicode = _PyUnicode_New(size);
4554 if (!unicode)
4555 return NULL;
4556 if (size == 0)
4557 return (PyObject *)unicode;
4558
4559 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004560 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004561 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004562 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563
4564 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004565 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004567 /* Check for BOM marks (U+FEFF) in the input and adjust current
4568 byte order setting accordingly. In native mode, the leading BOM
4569 mark is skipped, in all other modes, it is copied to the output
4570 stream as-is (giving a ZWNBSP character). */
4571 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004572 if (size >= 2) {
4573 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004574#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 if (bom == 0xFEFF) {
4576 q += 2;
4577 bo = -1;
4578 }
4579 else if (bom == 0xFFFE) {
4580 q += 2;
4581 bo = 1;
4582 }
Tim Petersced69f82003-09-16 20:30:58 +00004583#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 if (bom == 0xFEFF) {
4585 q += 2;
4586 bo = 1;
4587 }
4588 else if (bom == 0xFFFE) {
4589 q += 2;
4590 bo = -1;
4591 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004592#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595
Tim Peters772747b2001-08-09 22:21:55 +00004596 if (bo == -1) {
4597 /* force LE */
4598 ihi = 1;
4599 ilo = 0;
4600 }
4601 else if (bo == 1) {
4602 /* force BE */
4603 ihi = 0;
4604 ilo = 1;
4605 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004606#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4607 native_ordering = ilo < ihi;
4608#else
4609 native_ordering = ilo > ihi;
4610#endif
Tim Peters772747b2001-08-09 22:21:55 +00004611
Antoine Pitrouab868312009-01-10 15:40:25 +00004612 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004613 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004615 /* First check for possible aligned read of a C 'long'. Unaligned
4616 reads are more expensive, better to defer to another iteration. */
4617 if (!((size_t) q & LONG_PTR_MASK)) {
4618 /* Fast path for runs of non-surrogate chars. */
4619 register const unsigned char *_q = q;
4620 Py_UNICODE *_p = p;
4621 if (native_ordering) {
4622 /* Native ordering is simple: as long as the input cannot
4623 possibly contain a surrogate char, do an unrolled copy
4624 of several 16-bit code points to the target object.
4625 The non-surrogate check is done on several input bytes
4626 at a time (as many as a C 'long' can contain). */
4627 while (_q < aligned_end) {
4628 unsigned long data = * (unsigned long *) _q;
4629 if (data & FAST_CHAR_MASK)
4630 break;
4631 _p[0] = ((unsigned short *) _q)[0];
4632 _p[1] = ((unsigned short *) _q)[1];
4633#if (SIZEOF_LONG == 8)
4634 _p[2] = ((unsigned short *) _q)[2];
4635 _p[3] = ((unsigned short *) _q)[3];
4636#endif
4637 _q += SIZEOF_LONG;
4638 _p += SIZEOF_LONG / 2;
4639 }
4640 }
4641 else {
4642 /* Byteswapped ordering is similar, but we must decompose
4643 the copy bytewise, and take care of zero'ing out the
4644 upper bytes if the target object is in 32-bit units
4645 (that is, in UCS-4 builds). */
4646 while (_q < aligned_end) {
4647 unsigned long data = * (unsigned long *) _q;
4648 if (data & SWAPPED_FAST_CHAR_MASK)
4649 break;
4650 /* Zero upper bytes in UCS-4 builds */
4651#if (Py_UNICODE_SIZE > 2)
4652 _p[0] = 0;
4653 _p[1] = 0;
4654#if (SIZEOF_LONG == 8)
4655 _p[2] = 0;
4656 _p[3] = 0;
4657#endif
4658#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004659 /* Issue #4916; UCS-4 builds on big endian machines must
4660 fill the two last bytes of each 4-byte unit. */
4661#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4662# define OFF 2
4663#else
4664# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004665#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004666 ((unsigned char *) _p)[OFF + 1] = _q[0];
4667 ((unsigned char *) _p)[OFF + 0] = _q[1];
4668 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4669 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4670#if (SIZEOF_LONG == 8)
4671 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4672 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4673 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4674 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4675#endif
4676#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004677 _q += SIZEOF_LONG;
4678 _p += SIZEOF_LONG / 2;
4679 }
4680 }
4681 p = _p;
4682 q = _q;
4683 if (q >= e)
4684 break;
4685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687
Benjamin Peterson14339b62009-01-31 16:36:08 +00004688 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004689
4690 if (ch < 0xD800 || ch > 0xDFFF) {
4691 *p++ = ch;
4692 continue;
4693 }
4694
4695 /* UTF-16 code pair: */
4696 if (q > e) {
4697 errmsg = "unexpected end of data";
4698 startinpos = (((const char *)q) - 2) - starts;
4699 endinpos = ((const char *)e) + 1 - starts;
4700 goto utf16Error;
4701 }
4702 if (0xD800 <= ch && ch <= 0xDBFF) {
4703 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4704 q += 2;
4705 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004706#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 *p++ = ch;
4708 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004709#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004711#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 continue;
4713 }
4714 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004715 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 startinpos = (((const char *)q)-4)-starts;
4717 endinpos = startinpos+2;
4718 goto utf16Error;
4719 }
4720
Benjamin Peterson14339b62009-01-31 16:36:08 +00004721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 errmsg = "illegal encoding";
4723 startinpos = (((const char *)q)-2)-starts;
4724 endinpos = startinpos+2;
4725 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004726
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 utf16Error:
4728 outpos = p - PyUnicode_AS_UNICODE(unicode);
4729 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004730 errors,
4731 &errorHandler,
4732 "utf16", errmsg,
4733 &starts,
4734 (const char **)&e,
4735 &startinpos,
4736 &endinpos,
4737 &exc,
4738 (const char **)&q,
4739 &unicode,
4740 &outpos,
4741 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004744 /* remaining byte at the end? (size should be even) */
4745 if (e == q) {
4746 if (!consumed) {
4747 errmsg = "truncated data";
4748 startinpos = ((const char *)q) - starts;
4749 endinpos = ((const char *)e) + 1 - starts;
4750 outpos = p - PyUnicode_AS_UNICODE(unicode);
4751 if (unicode_decode_call_errorhandler(
4752 errors,
4753 &errorHandler,
4754 "utf16", errmsg,
4755 &starts,
4756 (const char **)&e,
4757 &startinpos,
4758 &endinpos,
4759 &exc,
4760 (const char **)&q,
4761 &unicode,
4762 &outpos,
4763 &p))
4764 goto onError;
4765 /* The remaining input chars are ignored if the callback
4766 chooses to skip the input */
4767 }
4768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769
4770 if (byteorder)
4771 *byteorder = bo;
4772
Walter Dörwald69652032004-09-07 20:24:22 +00004773 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004775
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 goto onError;
4779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 if (PyUnicode_READY(unicode) == -1) {
4783 Py_DECREF(unicode);
4784 return NULL;
4785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 return (PyObject *)unicode;
4787
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 Py_XDECREF(errorHandler);
4791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 return NULL;
4793}
4794
Antoine Pitrouab868312009-01-10 15:40:25 +00004795#undef FAST_CHAR_MASK
4796#undef SWAPPED_FAST_CHAR_MASK
4797
Tim Peters772747b2001-08-09 22:21:55 +00004798PyObject *
4799PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 Py_ssize_t size,
4801 const char *errors,
4802 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004804 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004805 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004806 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004807#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004808 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004809#else
4810 const int pairs = 0;
4811#endif
Tim Peters772747b2001-08-09 22:21:55 +00004812 /* Offsets from p for storing byte pairs in the right order. */
4813#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4814 int ihi = 1, ilo = 0;
4815#else
4816 int ihi = 0, ilo = 1;
4817#endif
4818
Benjamin Peterson29060642009-01-31 22:14:21 +00004819#define STORECHAR(CH) \
4820 do { \
4821 p[ihi] = ((CH) >> 8) & 0xff; \
4822 p[ilo] = (CH) & 0xff; \
4823 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004824 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004826#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004827 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 if (s[i] >= 0x10000)
4829 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004830#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004831 /* 2 * (size + pairs + (byteorder == 0)) */
4832 if (size > PY_SSIZE_T_MAX ||
4833 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004835 nsize = size + pairs + (byteorder == 0);
4836 bytesize = nsize * 2;
4837 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004839 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 if (v == NULL)
4841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004843 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004846 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004848
4849 if (byteorder == -1) {
4850 /* force LE */
4851 ihi = 1;
4852 ilo = 0;
4853 }
4854 else if (byteorder == 1) {
4855 /* force BE */
4856 ihi = 0;
4857 ilo = 1;
4858 }
4859
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004860 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_UNICODE ch = *s++;
4862 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004863#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 if (ch >= 0x10000) {
4865 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4866 ch = 0xD800 | ((ch-0x10000) >> 10);
4867 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004868#endif
Tim Peters772747b2001-08-09 22:21:55 +00004869 STORECHAR(ch);
4870 if (ch2)
4871 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004872 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004873
4874 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004875 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004876#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877}
4878
Alexander Belopolsky40018472011-02-26 01:02:56 +00004879PyObject *
4880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881{
4882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 PyUnicode_GET_SIZE(unicode),
4888 NULL,
4889 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890}
4891
4892/* --- Unicode Escape Codec ----------------------------------------------- */
4893
/* Helper function for PyUnicode_DecodeUnicodeEscape.  Scans the size bytes
   at s and determines whether the decoded result is guaranteed to be a
   pure ASCII string.

   Returns the number of characters the decoded string will have, or -1
   if that cannot be guaranteed, namely when
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present (these may produce
       non-ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    if (size < 0)
        return -1;
    /* Only form the end pointer once size is known to be non-negative. */
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: the backslash and the other character
                   are both kept */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4948
/* Store value at position index of buf.  Comparable to PyUnicode_WRITE,
   but only two layouts are handled: a Py_UNICODE (wstr) buffer when kind
   is PyUnicode_WCHAR_KIND, and a one-byte-per-character buffer for any
   other kind.  Each argument is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store value at position index of a Py_UNICODE (wstr) buffer.  The
   expansion is a single parenthesized expression; it asserts on the
   variable named "kind" that must be in scope at the point of use. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4962
4963
/* Character-name lookup API used for \N{...} escapes.  Starts out NULL and
   is filled in by PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import()
   the first time a \N escape is decoded. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00004964static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004965
Alexander Belopolsky40018472011-02-26 01:02:56 +00004966PyObject *
4967PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004968 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t startinpos;
4973 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004974 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004978 char* message;
4979 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 PyObject *errorHandler = NULL;
4981 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004982 Py_ssize_t ascii_length;
4983 Py_ssize_t i;
4984 int kind;
4985 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004987 ascii_length = length_of_escaped_ascii_string(s, size);
4988
4989 /* After length_of_escaped_ascii_string() there are two alternatives,
4990 either the string is pure ASCII with named escapes like \n, etc.
4991 and we determined it's exact size (common case)
4992 or it contains \x, \u, ... escape sequences. then we create a
4993 legacy wchar string and resize it at the end of this function. */
4994 if (ascii_length >= 0) {
4995 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4996 if (!v)
4997 goto onError;
4998 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4999 kind = PyUnicode_1BYTE_KIND;
5000 data = PyUnicode_DATA(v);
5001 }
5002 else {
5003 /* Escaped strings will always be longer than the resulting
5004 Unicode string, so we start with size here and then reduce the
5005 length after conversion to the true value.
5006 (but if the error callback returns a long replacement string
5007 we'll have to allocate more space) */
5008 v = _PyUnicode_New(size);
5009 if (!v)
5010 goto onError;
5011 kind = PyUnicode_WCHAR_KIND;
5012 data = PyUnicode_AS_UNICODE(v);
5013 }
5014
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 if (size == 0)
5016 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 while (s < end) {
5021 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005022 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005025 if (kind == PyUnicode_WCHAR_KIND) {
5026 assert(i < _PyUnicode_WSTR_LENGTH(v));
5027 }
5028 else {
5029 /* The only case in which i == ascii_length is a backslash
5030 followed by a newline. */
5031 assert(i <= ascii_length);
5032 }
5033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 /* Non-escape characters are interpreted as Unicode ordinals */
5035 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 continue;
5038 }
5039
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 /* \ - Escapes */
5042 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005043 c = *s++;
5044 if (s > end)
5045 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005046
5047 if (kind == PyUnicode_WCHAR_KIND) {
5048 assert(i < _PyUnicode_WSTR_LENGTH(v));
5049 }
5050 else {
5051 /* The only case in which i == ascii_length is a backslash
5052 followed by a newline. */
5053 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5054 }
5055
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005056 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5061 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5062 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5063 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5064 /* FF */
5065 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5066 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5067 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5068 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5069 /* VT */
5070 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5071 /* BEL, not classic C */
5072 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 case '0': case '1': case '2': case '3':
5076 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005077 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005078 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005079 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005080 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005081 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 break;
5085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 /* hex escapes */
5087 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005089 digits = 2;
5090 message = "truncated \\xXX escape";
5091 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005095 digits = 4;
5096 message = "truncated \\uXXXX escape";
5097 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005100 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005101 digits = 8;
5102 message = "truncated \\UXXXXXXXX escape";
5103 hexescape:
5104 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005105 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 if (s+digits>end) {
5107 endinpos = size;
5108 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 errors, &errorHandler,
5110 "unicodeescape", "end of string in escape sequence",
5111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005114 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 goto nextByte;
5116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 for (j = 0; j < digits; ++j) {
5118 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005119 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 endinpos = (s+j+1)-starts;
5121 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 errors, &errorHandler,
5124 "unicodeescape", message,
5125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005127 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005130 }
5131 chr = (chr<<4) & ~0xF;
5132 if (c >= '0' && c <= '9')
5133 chr += c - '0';
5134 else if (c >= 'a' && c <= 'f')
5135 chr += 10 + c - 'a';
5136 else
5137 chr += 10 + c - 'A';
5138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005140 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 /* _decoding_error will have already written into the
5142 target buffer. */
5143 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005144 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005145 /* when we get here, chr is a 32-bit unicode character */
5146 if (chr <= 0xffff)
5147 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005149 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005150 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005151 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005152#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005154#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005155 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5157 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005158#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005159 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 errors, &errorHandler,
5164 "unicodeescape", "illegal Unicode character",
5165 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005167 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005169 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005170 break;
5171
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005173 case 'N':
5174 message = "malformed \\N character escape";
5175 if (ucnhash_CAPI == NULL) {
5176 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5178 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005179 if (ucnhash_CAPI == NULL)
5180 goto ucnhashError;
5181 }
5182 if (*s == '{') {
5183 const char *start = s+1;
5184 /* look for the closing brace */
5185 while (*s != '}' && s < end)
5186 s++;
5187 if (s > start && s < end && *s == '}') {
5188 /* found a name. look it up in the unicode database */
5189 message = "unknown Unicode character name";
5190 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5192 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005193 goto store;
5194 }
5195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005197 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 errors, &errorHandler,
5200 "unicodeescape", message,
5201 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005202 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005203 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005205 break;
5206
5207 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005208 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005209 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 message = "\\ at end of string";
5211 s--;
5212 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 errors, &errorHandler,
5216 "unicodeescape", message,
5217 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005218 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005219 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005221 }
5222 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5224 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005225 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005226 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005231 /* Ensure the length prediction worked in case of ASCII strings */
5232 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5233
5234 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5235 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005237 Py_XDECREF(errorHandler);
5238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005240
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005242 PyErr_SetString(
5243 PyExc_UnicodeError,
5244 "\\N escapes not supported (can't load unicodedata module)"
5245 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005246 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005249 return NULL;
5250
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 Py_XDECREF(errorHandler);
5254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 return NULL;
5256}
5257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005258#undef WRITE_ASCII_OR_WSTR
5259#undef WRITE_WSTR
5260
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder below takes only the buffer and its length; it has no
   `quotes` argument and does not wrap its output in quotes.

*/
5267
Walter Dörwald79e913e2007-05-12 11:08:06 +00005268static const char *hexdigits = "0123456789abcdef";
5269
Alexander Belopolsky40018472011-02-26 01:02:56 +00005270PyObject *
5271PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005272 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005274 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005277#ifdef Py_UNICODE_WIDE
5278 const Py_ssize_t expandsize = 10;
5279#else
5280 const Py_ssize_t expandsize = 6;
5281#endif
5282
Thomas Wouters89f507f2006-12-13 04:49:30 +00005283 /* XXX(nnorwitz): rather than over-allocating, it would be
5284 better to choose a different scheme. Perhaps scan the
5285 first N-chars of the string and allocate based on that size.
5286 */
5287 /* Initial allocation is based on the longest-possible unichr
5288 escape.
5289
5290 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5291 unichr, so in this case it's the longest unichr escape. In
5292 narrow (UTF-16) builds this is five chars per source unichr
5293 since there are two unichrs in the surrogate pair, so in narrow
5294 (UTF-16) builds it's not the longest unichr escape.
5295
5296 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5297 so in the narrow (UTF-16) build case it's the longest unichr
5298 escape.
5299 */
5300
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005301 if (size == 0)
5302 return PyBytes_FromStringAndSize(NULL, 0);
5303
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005304 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005306
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005307 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 2
5309 + expandsize*size
5310 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 if (repr == NULL)
5312 return NULL;
5313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005314 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 while (size-- > 0) {
5317 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005318
Walter Dörwald79e913e2007-05-12 11:08:06 +00005319 /* Escape backslashes */
5320 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 *p++ = '\\';
5322 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005323 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005324 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005325
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005326#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005327 /* Map 21-bit characters to '\U00xxxxxx' */
5328 else if (ch >= 0x10000) {
5329 *p++ = '\\';
5330 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005331 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5332 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5333 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5334 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5335 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5336 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5337 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5338 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005340 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005341#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5343 else if (ch >= 0xD800 && ch < 0xDC00) {
5344 Py_UNICODE ch2;
5345 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 ch2 = *s++;
5348 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005349 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5351 *p++ = '\\';
5352 *p++ = 'U';
5353 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5354 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5355 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5356 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5357 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5358 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5359 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5360 *p++ = hexdigits[ucs & 0x0000000F];
5361 continue;
5362 }
5363 /* Fall through: isolated surrogates are copied as-is */
5364 s--;
5365 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005367#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005370 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 *p++ = '\\';
5372 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005373 *p++ = hexdigits[(ch >> 12) & 0x000F];
5374 *p++ = hexdigits[(ch >> 8) & 0x000F];
5375 *p++ = hexdigits[(ch >> 4) & 0x000F];
5376 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005378
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005379 /* Map special whitespace to '\t', \n', '\r' */
5380 else if (ch == '\t') {
5381 *p++ = '\\';
5382 *p++ = 't';
5383 }
5384 else if (ch == '\n') {
5385 *p++ = '\\';
5386 *p++ = 'n';
5387 }
5388 else if (ch == '\r') {
5389 *p++ = '\\';
5390 *p++ = 'r';
5391 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005392
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005393 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005394 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005396 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005397 *p++ = hexdigits[(ch >> 4) & 0x000F];
5398 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005399 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Copy everything else as-is */
5402 else
5403 *p++ = (char) ch;
5404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005406 assert(p - PyBytes_AS_STRING(repr) > 0);
5407 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5408 return NULL;
5409 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410}
5411
Alexander Belopolsky40018472011-02-26 01:02:56 +00005412PyObject *
5413PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005415 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 if (!PyUnicode_Check(unicode)) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005420 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5421 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005422 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423}
5424
5425/* --- Raw Unicode Escape Codec ------------------------------------------- */
5426
Alexander Belopolsky40018472011-02-26 01:02:56 +00005427PyObject *
5428PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005429 Py_ssize_t size,
5430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 Py_ssize_t startinpos;
5434 Py_ssize_t endinpos;
5435 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 const char *end;
5439 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 PyObject *errorHandler = NULL;
5441 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 /* Escaped strings will always be longer than the resulting
5444 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 length after conversion to the true value. (But decoding error
5446 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 v = _PyUnicode_New(size);
5448 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 end = s + size;
5454 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 unsigned char c;
5456 Py_UCS4 x;
5457 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005458 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 /* Non-escape characters are interpreted as Unicode ordinals */
5461 if (*s != '\\') {
5462 *p++ = (unsigned char)*s++;
5463 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005464 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 startinpos = s-starts;
5466
5467 /* \u-escapes are only interpreted iff the number of leading
5468 backslashes if odd */
5469 bs = s;
5470 for (;s < end;) {
5471 if (*s != '\\')
5472 break;
5473 *p++ = (unsigned char)*s++;
5474 }
5475 if (((s - bs) & 1) == 0 ||
5476 s >= end ||
5477 (*s != 'u' && *s != 'U')) {
5478 continue;
5479 }
5480 p--;
5481 count = *s=='u' ? 4 : 8;
5482 s++;
5483
5484 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5485 outpos = p-PyUnicode_AS_UNICODE(v);
5486 for (x = 0, i = 0; i < count; ++i, ++s) {
5487 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005488 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 endinpos = s-starts;
5490 if (unicode_decode_call_errorhandler(
5491 errors, &errorHandler,
5492 "rawunicodeescape", "truncated \\uXXXX",
5493 &starts, &end, &startinpos, &endinpos, &exc, &s,
5494 &v, &outpos, &p))
5495 goto onError;
5496 goto nextByte;
5497 }
5498 x = (x<<4) & ~0xF;
5499 if (c >= '0' && c <= '9')
5500 x += c - '0';
5501 else if (c >= 'a' && c <= 'f')
5502 x += 10 + c - 'a';
5503 else
5504 x += 10 + c - 'A';
5505 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005506 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 /* UCS-2 character */
5508 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005509 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 /* UCS-4 character. Either store directly, or as
5511 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005512#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005514#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 x -= 0x10000L;
5516 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5517 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005518#endif
5519 } else {
5520 endinpos = s-starts;
5521 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005522 if (unicode_decode_call_errorhandler(
5523 errors, &errorHandler,
5524 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 &starts, &end, &startinpos, &endinpos, &exc, &s,
5526 &v, &outpos, &p))
5527 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 nextByte:
5530 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005532 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005534 Py_XDECREF(errorHandler);
5535 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 if (PyUnicode_READY(v) == -1) {
5537 Py_DECREF(v);
5538 return NULL;
5539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005541
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 Py_XDECREF(errorHandler);
5545 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 return NULL;
5547}
5548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005551 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005553 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 char *p;
5555 char *q;
5556
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005557#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005558 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005559#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005560 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005561#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005562
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005563 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005565
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005566 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 if (repr == NULL)
5568 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005569 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005570 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005572 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 while (size-- > 0) {
5574 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005575#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* Map 32-bit characters to '\Uxxxxxxxx' */
5577 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005578 *p++ = '\\';
5579 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005580 *p++ = hexdigits[(ch >> 28) & 0xf];
5581 *p++ = hexdigits[(ch >> 24) & 0xf];
5582 *p++ = hexdigits[(ch >> 20) & 0xf];
5583 *p++ = hexdigits[(ch >> 16) & 0xf];
5584 *p++ = hexdigits[(ch >> 12) & 0xf];
5585 *p++ = hexdigits[(ch >> 8) & 0xf];
5586 *p++ = hexdigits[(ch >> 4) & 0xf];
5587 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005588 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005589 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005590#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5592 if (ch >= 0xD800 && ch < 0xDC00) {
5593 Py_UNICODE ch2;
5594 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005595
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ch2 = *s++;
5597 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5600 *p++ = '\\';
5601 *p++ = 'U';
5602 *p++ = hexdigits[(ucs >> 28) & 0xf];
5603 *p++ = hexdigits[(ucs >> 24) & 0xf];
5604 *p++ = hexdigits[(ucs >> 20) & 0xf];
5605 *p++ = hexdigits[(ucs >> 16) & 0xf];
5606 *p++ = hexdigits[(ucs >> 12) & 0xf];
5607 *p++ = hexdigits[(ucs >> 8) & 0xf];
5608 *p++ = hexdigits[(ucs >> 4) & 0xf];
5609 *p++ = hexdigits[ucs & 0xf];
5610 continue;
5611 }
5612 /* Fall through: isolated surrogates are copied as-is */
5613 s--;
5614 size++;
5615 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005616#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* Map 16-bit characters to '\uxxxx' */
5618 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 *p++ = '\\';
5620 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005621 *p++ = hexdigits[(ch >> 12) & 0xf];
5622 *p++ = hexdigits[(ch >> 8) & 0xf];
5623 *p++ = hexdigits[(ch >> 4) & 0xf];
5624 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* Copy everything else as-is */
5627 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 *p++ = (char) ch;
5629 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005630 size = p - q;
5631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005632 assert(size > 0);
5633 if (_PyBytes_Resize(&repr, size) < 0)
5634 return NULL;
5635 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636}
5637
Alexander Belopolsky40018472011-02-26 01:02:56 +00005638PyObject *
5639PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005641 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005643 PyErr_BadArgument();
5644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005646 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5647 PyUnicode_GET_SIZE(unicode));
5648
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005649 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650}
5651
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005652/* --- Unicode Internal Codec ------------------------------------------- */
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
5655_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005656 Py_ssize_t size,
5657 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005658{
5659 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 Py_ssize_t startinpos;
5661 Py_ssize_t endinpos;
5662 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005663 PyUnicodeObject *v;
5664 Py_UNICODE *p;
5665 const char *end;
5666 const char *reason;
5667 PyObject *errorHandler = NULL;
5668 PyObject *exc = NULL;
5669
Neal Norwitzd43069c2006-01-08 01:12:10 +00005670#ifdef Py_UNICODE_WIDE
5671 Py_UNICODE unimax = PyUnicode_GetMax();
5672#endif
5673
Thomas Wouters89f507f2006-12-13 04:49:30 +00005674 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005675 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5676 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5679 as string was created with the old API. */
5680 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005682 p = PyUnicode_AS_UNICODE(v);
5683 end = s + size;
5684
5685 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005686 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005687 /* We have to sanity check the raw data, otherwise doom looms for
5688 some malformed UCS-4 data. */
5689 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005690#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005691 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005692#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005693 end-s < Py_UNICODE_SIZE
5694 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005696 startinpos = s - starts;
5697 if (end-s < Py_UNICODE_SIZE) {
5698 endinpos = end-starts;
5699 reason = "truncated input";
5700 }
5701 else {
5702 endinpos = s - starts + Py_UNICODE_SIZE;
5703 reason = "illegal code point (> 0x10FFFF)";
5704 }
5705 outpos = p - PyUnicode_AS_UNICODE(v);
5706 if (unicode_decode_call_errorhandler(
5707 errors, &errorHandler,
5708 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005709 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005710 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005711 goto onError;
5712 }
5713 }
5714 else {
5715 p++;
5716 s += Py_UNICODE_SIZE;
5717 }
5718 }
5719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005720 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005721 goto onError;
5722 Py_XDECREF(errorHandler);
5723 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005724 if (PyUnicode_READY(v) == -1) {
5725 Py_DECREF(v);
5726 return NULL;
5727 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005728 return (PyObject *)v;
5729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005731 Py_XDECREF(v);
5732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
5734 return NULL;
5735}
5736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737/* --- Latin-1 Codec ------------------------------------------------------ */
5738
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739PyObject *
5740PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005741 Py_ssize_t size,
5742 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005745 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746}
5747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749static void
5750make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005751 const char *encoding,
5752 const Py_UNICODE *unicode, Py_ssize_t size,
5753 Py_ssize_t startpos, Py_ssize_t endpos,
5754 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 *exceptionObject = PyUnicodeEncodeError_Create(
5758 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
5760 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5762 goto onError;
5763 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5764 goto onError;
5765 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5766 goto onError;
5767 return;
5768 onError:
5769 Py_DECREF(*exceptionObject);
5770 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 }
5772}
5773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005775static void
5776raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005777 const char *encoding,
5778 const Py_UNICODE *unicode, Py_ssize_t size,
5779 Py_ssize_t startpos, Py_ssize_t endpos,
5780 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781{
5782 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786}
5787
5788/* error handling callback helper:
5789 build arguments, call the callback and check the arguments,
5790 put the result into newpos and return the replacement string, which
5791 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005792static PyObject *
5793unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005794 PyObject **errorHandler,
5795 const char *encoding, const char *reason,
5796 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5797 Py_ssize_t startpos, Py_ssize_t endpos,
5798 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005800 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801
5802 PyObject *restuple;
5803 PyObject *resunicode;
5804
5805 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 }
5810
5811 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815
5816 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005821 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 Py_DECREF(restuple);
5823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005825 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 &resunicode, newpos)) {
5827 Py_DECREF(restuple);
5828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005830 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5831 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5832 Py_DECREF(restuple);
5833 return NULL;
5834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005837 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5839 Py_DECREF(restuple);
5840 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 Py_INCREF(resunicode);
5843 Py_DECREF(restuple);
5844 return resunicode;
5845}
5846
Alexander Belopolsky40018472011-02-26 01:02:56 +00005847static PyObject *
5848unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005849 Py_ssize_t size,
5850 const char *errors,
5851 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852{
5853 /* output object */
5854 PyObject *res;
5855 /* pointers to the beginning and end+1 of input */
5856 const Py_UNICODE *startp = p;
5857 const Py_UNICODE *endp = p + size;
5858 /* pointer to the beginning of the unencodable characters */
5859 /* const Py_UNICODE *badp = NULL; */
5860 /* pointer into the output */
5861 char *str;
5862 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005864 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5865 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 PyObject *errorHandler = NULL;
5867 PyObject *exc = NULL;
5868 /* the following variable is used for caching string comparisons
5869 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5870 int known_errorHandler = -1;
5871
5872 /* allocate enough for a simple encoding without
5873 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005874 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005875 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005878 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005879 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 ressize = size;
5881
5882 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 /* can we encode this? */
5886 if (c<limit) {
5887 /* no overflow check, because we know that the space is enough */
5888 *str++ = (char)c;
5889 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 else {
5892 Py_ssize_t unicodepos = p-startp;
5893 Py_ssize_t requiredsize;
5894 PyObject *repunicode;
5895 Py_ssize_t repsize;
5896 Py_ssize_t newpos;
5897 Py_ssize_t respos;
5898 Py_UNICODE *uni2;
5899 /* startpos for collecting unencodable chars */
5900 const Py_UNICODE *collstart = p;
5901 const Py_UNICODE *collend = p;
5902 /* find all unecodable characters */
5903 while ((collend < endp) && ((*collend)>=limit))
5904 ++collend;
5905 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5906 if (known_errorHandler==-1) {
5907 if ((errors==NULL) || (!strcmp(errors, "strict")))
5908 known_errorHandler = 1;
5909 else if (!strcmp(errors, "replace"))
5910 known_errorHandler = 2;
5911 else if (!strcmp(errors, "ignore"))
5912 known_errorHandler = 3;
5913 else if (!strcmp(errors, "xmlcharrefreplace"))
5914 known_errorHandler = 4;
5915 else
5916 known_errorHandler = 0;
5917 }
5918 switch (known_errorHandler) {
5919 case 1: /* strict */
5920 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5921 goto onError;
5922 case 2: /* replace */
5923 while (collstart++<collend)
5924 *str++ = '?'; /* fall through */
5925 case 3: /* ignore */
5926 p = collend;
5927 break;
5928 case 4: /* xmlcharrefreplace */
5929 respos = str - PyBytes_AS_STRING(res);
5930 /* determine replacement size (temporarily (mis)uses p) */
5931 for (p = collstart, repsize = 0; p < collend; ++p) {
5932 if (*p<10)
5933 repsize += 2+1+1;
5934 else if (*p<100)
5935 repsize += 2+2+1;
5936 else if (*p<1000)
5937 repsize += 2+3+1;
5938 else if (*p<10000)
5939 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005940#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 else
5942 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005943#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 else if (*p<100000)
5945 repsize += 2+5+1;
5946 else if (*p<1000000)
5947 repsize += 2+6+1;
5948 else
5949 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005950#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 }
5952 requiredsize = respos+repsize+(endp-collend);
5953 if (requiredsize > ressize) {
5954 if (requiredsize<2*ressize)
5955 requiredsize = 2*ressize;
5956 if (_PyBytes_Resize(&res, requiredsize))
5957 goto onError;
5958 str = PyBytes_AS_STRING(res) + respos;
5959 ressize = requiredsize;
5960 }
5961 /* generate replacement (temporarily (mis)uses p) */
5962 for (p = collstart; p < collend; ++p) {
5963 str += sprintf(str, "&#%d;", (int)*p);
5964 }
5965 p = collend;
5966 break;
5967 default:
5968 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5969 encoding, reason, startp, size, &exc,
5970 collstart-startp, collend-startp, &newpos);
5971 if (repunicode == NULL)
5972 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005973 if (PyBytes_Check(repunicode)) {
5974 /* Directly copy bytes result to output. */
5975 repsize = PyBytes_Size(repunicode);
5976 if (repsize > 1) {
5977 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005978 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005979 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5980 Py_DECREF(repunicode);
5981 goto onError;
5982 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005983 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005984 ressize += repsize-1;
5985 }
5986 memcpy(str, PyBytes_AsString(repunicode), repsize);
5987 str += repsize;
5988 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005989 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005990 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005991 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* need more space? (at least enough for what we
5993 have+the replacement+the rest of the string, so
5994 we won't have to check space for encodable characters) */
5995 respos = str - PyBytes_AS_STRING(res);
5996 repsize = PyUnicode_GET_SIZE(repunicode);
5997 requiredsize = respos+repsize+(endp-collend);
5998 if (requiredsize > ressize) {
5999 if (requiredsize<2*ressize)
6000 requiredsize = 2*ressize;
6001 if (_PyBytes_Resize(&res, requiredsize)) {
6002 Py_DECREF(repunicode);
6003 goto onError;
6004 }
6005 str = PyBytes_AS_STRING(res) + respos;
6006 ressize = requiredsize;
6007 }
6008 /* check if there is anything unencodable in the replacement
6009 and copy it to the output */
6010 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6011 c = *uni2;
6012 if (c >= limit) {
6013 raise_encode_exception(&exc, encoding, startp, size,
6014 unicodepos, unicodepos+1, reason);
6015 Py_DECREF(repunicode);
6016 goto onError;
6017 }
6018 *str = (char)c;
6019 }
6020 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006021 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006022 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006023 }
6024 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 /* Resize if we allocated to much */
6026 size = str - PyBytes_AS_STRING(res);
6027 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006028 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006029 if (_PyBytes_Resize(&res, size) < 0)
6030 goto onError;
6031 }
6032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 return res;
6036
6037 onError:
6038 Py_XDECREF(res);
6039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
6041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042}
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
6045PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 Py_ssize_t size,
6047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Alexander Belopolsky40018472011-02-26 01:02:56 +00006052PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
6055 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 PyErr_BadArgument();
6057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 if (PyUnicode_READY(unicode) == -1)
6060 return NULL;
6061 /* Fast path: if it is a one-byte string, construct
6062 bytes object directly. */
6063 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6064 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6065 PyUnicode_GET_LENGTH(unicode));
6066 /* Non-Latin-1 characters present. Defer to above function to
6067 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006070 errors);
6071}
6072
6073PyObject*
6074PyUnicode_AsLatin1String(PyObject *unicode)
6075{
6076 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
6079/* --- 7-bit ASCII Codec -------------------------------------------------- */
6080
Alexander Belopolsky40018472011-02-26 01:02:56 +00006081PyObject *
6082PyUnicode_DecodeASCII(const char *s,
6083 Py_ssize_t size,
6084 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 PyUnicodeObject *v;
6088 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006089 Py_ssize_t startinpos;
6090 Py_ssize_t endinpos;
6091 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006093 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 PyObject *errorHandler = NULL;
6095 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006099 if (size == 1 && *(unsigned char*)s < 128)
6100 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6101
6102 /* Fast path. Assume the input actually *is* ASCII, and allocate
6103 a single-block Unicode object with that assumption. If there is
6104 an error, drop the object and start over. */
6105 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6106 if (v == NULL)
6107 goto onError;
6108 d = PyUnicode_1BYTE_DATA(v);
6109 for (i = 0; i < size; i++) {
6110 unsigned char ch = ((unsigned char*)s)[i];
6111 if (ch < 128)
6112 d[i] = ch;
6113 else
6114 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006116 if (i == size)
6117 return (PyObject*)v;
6118 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 v = _PyUnicode_New(size);
6121 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 e = s + size;
6127 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 register unsigned char c = (unsigned char)*s;
6129 if (c < 128) {
6130 *p++ = c;
6131 ++s;
6132 }
6133 else {
6134 startinpos = s-starts;
6135 endinpos = startinpos + 1;
6136 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6137 if (unicode_decode_call_errorhandler(
6138 errors, &errorHandler,
6139 "ascii", "ordinal not in range(128)",
6140 &starts, &e, &startinpos, &endinpos, &exc, &s,
6141 &v, &outpos, &p))
6142 goto onError;
6143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006145 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6147 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 Py_XDECREF(errorHandler);
6149 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006150 if (PyUnicode_READY(v) == -1) {
6151 Py_DECREF(v);
6152 return NULL;
6153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006155
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_XDECREF(errorHandler);
6159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return NULL;
6161}
6162
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163PyObject *
6164PyUnicode_EncodeASCII(const Py_UNICODE *p,
6165 Py_ssize_t size,
6166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169}
6170
Alexander Belopolsky40018472011-02-26 01:02:56 +00006171PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006172_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
6174 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 PyErr_BadArgument();
6176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006178 if (PyUnicode_READY(unicode) == -1)
6179 return NULL;
6180 /* Fast path: if it is an ASCII-only string, construct bytes object
6181 directly. Else defer to above function to raise the exception. */
6182 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6183 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6184 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006187 errors);
6188}
6189
6190PyObject *
6191PyUnicode_AsASCIIString(PyObject *unicode)
6192{
6193 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194}
6195
Victor Stinner99b95382011-07-04 14:23:54 +02006196#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006197
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006198/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006199
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006200#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006201#define NEED_RETRY
6202#endif
6203
6204/* XXX This code is limited to "true" double-byte encodings, as
6205 a) it assumes an incomplete character consists of a single byte, and
6206 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006208
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  It is then accepted
   when any of the following holds for the position CharPrev() reports
   as the start of the preceding character:
     - there is no preceding character (CharPrev returned `curr`),
     - the preceding character does not start with a lead byte,
     - the preceding character starts exactly two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr)) {
        return 0;
    }

    prev = CharPrev(s, curr);
    if (prev == curr) {
        return 1;
    }
    if (!IsDBCSLeadByte(*prev)) {
        return 1;
    }
    return curr - prev == 2;
}
6220
6221/*
6222 * Decode MBCS string into unicode object. If 'final' is set, converts
6223 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6224 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225static int
6226decode_mbcs(PyUnicodeObject **v,
6227 const char *s, /* MBCS string */
6228 int size, /* sizeof MBCS string */
6229 int final,
6230 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231{
6232 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006233 Py_ssize_t n;
6234 DWORD usize;
6235 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237 assert(size >= 0);
6238
Victor Stinner554f3f02010-06-16 23:33:54 +00006239 /* check and handle 'errors' arg */
6240 if (errors==NULL || strcmp(errors, "strict")==0)
6241 flags = MB_ERR_INVALID_CHARS;
6242 else if (strcmp(errors, "ignore")==0)
6243 flags = 0;
6244 else {
6245 PyErr_Format(PyExc_ValueError,
6246 "mbcs encoding does not support errors='%s'",
6247 errors);
6248 return -1;
6249 }
6250
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 /* Skip trailing lead-byte unless 'final' is set */
6252 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006254
6255 /* First get the size of the result */
6256 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006257 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6258 if (usize==0)
6259 goto mbcs_decode_error;
6260 } else
6261 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006262
6263 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 /* Create unicode object */
6265 *v = _PyUnicode_New(usize);
6266 if (*v == NULL)
6267 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006268 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006269 }
6270 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 /* Extend unicode object */
6272 n = PyUnicode_GET_SIZE(*v);
6273 if (_PyUnicode_Resize(v, n + usize) < 0)
6274 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006275 }
6276
6277 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006278 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006280 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6281 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006283 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006284 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006285
6286mbcs_decode_error:
6287 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6288 we raise a UnicodeDecodeError - else it is a 'generic'
6289 windows error
6290 */
6291 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6292 /* Ideally, we should get reason from FormatMessage - this
6293 is the Windows 2000 English version of the message
6294 */
6295 PyObject *exc = NULL;
6296 const char *reason = "No mapping for the Unicode character exists "
6297 "in the target multi-byte code page.";
6298 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6299 if (exc != NULL) {
6300 PyCodec_StrictErrors(exc);
6301 Py_DECREF(exc);
6302 }
6303 } else {
6304 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6305 }
6306 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006307}
6308
Alexander Belopolsky40018472011-02-26 01:02:56 +00006309PyObject *
6310PyUnicode_DecodeMBCSStateful(const char *s,
6311 Py_ssize_t size,
6312 const char *errors,
6313 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006314{
6315 PyUnicodeObject *v = NULL;
6316 int done;
6317
6318 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006320
6321#ifdef NEED_RETRY
6322 retry:
6323 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006325 else
6326#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006327 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006328
6329 if (done < 0) {
6330 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006332 }
6333
6334 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006336
6337#ifdef NEED_RETRY
6338 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 s += done;
6340 size -= done;
6341 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006342 }
6343#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006344 if (PyUnicode_READY(v) == -1) {
6345 Py_DECREF(v);
6346 return NULL;
6347 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 return (PyObject *)v;
6349}
6350
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351PyObject *
6352PyUnicode_DecodeMBCS(const char *s,
6353 Py_ssize_t size,
6354 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006355{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6357}
6358
6359/*
6360 * Convert unicode into string object (MBCS).
6361 * Returns 0 if succeed, -1 otherwise.
6362 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006363static int
6364encode_mbcs(PyObject **repr,
6365 const Py_UNICODE *p, /* unicode */
6366 int size, /* size of unicode */
6367 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368{
Victor Stinner554f3f02010-06-16 23:33:54 +00006369 BOOL usedDefaultChar = FALSE;
6370 BOOL *pusedDefaultChar;
6371 int mbcssize;
6372 Py_ssize_t n;
6373 PyObject *exc = NULL;
6374 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375
6376 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006377
Victor Stinner554f3f02010-06-16 23:33:54 +00006378 /* check and handle 'errors' arg */
6379 if (errors==NULL || strcmp(errors, "strict")==0) {
6380 flags = WC_NO_BEST_FIT_CHARS;
6381 pusedDefaultChar = &usedDefaultChar;
6382 } else if (strcmp(errors, "replace")==0) {
6383 flags = 0;
6384 pusedDefaultChar = NULL;
6385 } else {
6386 PyErr_Format(PyExc_ValueError,
6387 "mbcs encoding does not support errors='%s'",
6388 errors);
6389 return -1;
6390 }
6391
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006392 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006393 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006394 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6395 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (mbcssize == 0) {
6397 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6398 return -1;
6399 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006400 /* If we used a default char, then we failed! */
6401 if (pusedDefaultChar && *pusedDefaultChar)
6402 goto mbcs_encode_error;
6403 } else {
6404 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006405 }
6406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006407 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 /* Create string object */
6409 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6410 if (*repr == NULL)
6411 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006412 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006413 }
6414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 /* Extend string object */
6416 n = PyBytes_Size(*repr);
6417 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6418 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006419 }
6420
6421 /* Do the conversion */
6422 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006424 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6425 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6427 return -1;
6428 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006429 if (pusedDefaultChar && *pusedDefaultChar)
6430 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006431 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006432 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006433
6434mbcs_encode_error:
6435 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6436 Py_XDECREF(exc);
6437 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006438}
6439
Alexander Belopolsky40018472011-02-26 01:02:56 +00006440PyObject *
6441PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6442 Py_ssize_t size,
6443 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006444{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006445 PyObject *repr = NULL;
6446 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006447
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006448#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006450 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006451 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006452 else
6453#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006454 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006455
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006456 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 Py_XDECREF(repr);
6458 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006459 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006460
6461#ifdef NEED_RETRY
6462 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 p += INT_MAX;
6464 size -= INT_MAX;
6465 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006466 }
6467#endif
6468
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006469 return repr;
6470}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
6473PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006474{
6475 if (!PyUnicode_Check(unicode)) {
6476 PyErr_BadArgument();
6477 return NULL;
6478 }
6479 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 PyUnicode_GET_SIZE(unicode),
6481 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006482}
6483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006484#undef NEED_RETRY
6485
Victor Stinner99b95382011-07-04 14:23:54 +02006486#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488/* --- Character Mapping Codec -------------------------------------------- */
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491PyUnicode_DecodeCharmap(const char *s,
6492 Py_ssize_t size,
6493 PyObject *mapping,
6494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t startinpos;
6498 Py_ssize_t endinpos;
6499 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 PyUnicodeObject *v;
6502 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504 PyObject *errorHandler = NULL;
6505 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006506 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006508
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 /* Default to Latin-1 */
6510 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
6513 v = _PyUnicode_New(size);
6514 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006520 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 mapstring = PyUnicode_AS_UNICODE(mapping);
6522 maplen = PyUnicode_GET_SIZE(mapping);
6523 while (s < e) {
6524 unsigned char ch = *s;
6525 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 if (ch < maplen)
6528 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 if (x == 0xfffe) {
6531 /* undefined mapping */
6532 outpos = p-PyUnicode_AS_UNICODE(v);
6533 startinpos = s-starts;
6534 endinpos = startinpos+1;
6535 if (unicode_decode_call_errorhandler(
6536 errors, &errorHandler,
6537 "charmap", "character maps to <undefined>",
6538 &starts, &e, &startinpos, &endinpos, &exc, &s,
6539 &v, &outpos, &p)) {
6540 goto onError;
6541 }
6542 continue;
6543 }
6544 *p++ = x;
6545 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006546 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006547 }
6548 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 while (s < e) {
6550 unsigned char ch = *s;
6551 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6554 w = PyLong_FromLong((long)ch);
6555 if (w == NULL)
6556 goto onError;
6557 x = PyObject_GetItem(mapping, w);
6558 Py_DECREF(w);
6559 if (x == NULL) {
6560 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6561 /* No mapping found means: mapping is undefined. */
6562 PyErr_Clear();
6563 x = Py_None;
6564 Py_INCREF(x);
6565 } else
6566 goto onError;
6567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006568
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 /* Apply mapping */
6570 if (PyLong_Check(x)) {
6571 long value = PyLong_AS_LONG(x);
6572 if (value < 0 || value > 65535) {
6573 PyErr_SetString(PyExc_TypeError,
6574 "character mapping must be in range(65536)");
6575 Py_DECREF(x);
6576 goto onError;
6577 }
6578 *p++ = (Py_UNICODE)value;
6579 }
6580 else if (x == Py_None) {
6581 /* undefined mapping */
6582 outpos = p-PyUnicode_AS_UNICODE(v);
6583 startinpos = s-starts;
6584 endinpos = startinpos+1;
6585 if (unicode_decode_call_errorhandler(
6586 errors, &errorHandler,
6587 "charmap", "character maps to <undefined>",
6588 &starts, &e, &startinpos, &endinpos, &exc, &s,
6589 &v, &outpos, &p)) {
6590 Py_DECREF(x);
6591 goto onError;
6592 }
6593 Py_DECREF(x);
6594 continue;
6595 }
6596 else if (PyUnicode_Check(x)) {
6597 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 if (targetsize == 1)
6600 /* 1-1 mapping */
6601 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006602
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 else if (targetsize > 1) {
6604 /* 1-n mapping */
6605 if (targetsize > extrachars) {
6606 /* resize first */
6607 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6608 Py_ssize_t needed = (targetsize - extrachars) + \
6609 (targetsize << 2);
6610 extrachars += needed;
6611 /* XXX overflow detection missing */
6612 if (_PyUnicode_Resize(&v,
6613 PyUnicode_GET_SIZE(v) + needed) < 0) {
6614 Py_DECREF(x);
6615 goto onError;
6616 }
6617 p = PyUnicode_AS_UNICODE(v) + oldpos;
6618 }
6619 Py_UNICODE_COPY(p,
6620 PyUnicode_AS_UNICODE(x),
6621 targetsize);
6622 p += targetsize;
6623 extrachars -= targetsize;
6624 }
6625 /* 1-0 mapping: skip the character */
6626 }
6627 else {
6628 /* wrong return value */
6629 PyErr_SetString(PyExc_TypeError,
6630 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 Py_DECREF(x);
6632 goto onError;
6633 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 Py_DECREF(x);
6635 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 }
6638 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6640 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 Py_XDECREF(errorHandler);
6642 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643 if (PyUnicode_READY(v) == -1) {
6644 Py_DECREF(v);
6645 return NULL;
6646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 Py_XDECREF(errorHandler);
6651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 Py_XDECREF(v);
6653 return NULL;
6654}
6655
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006656/* Charmap encoding: the lookup table */
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 PyObject_HEAD
6660 unsigned char level1[32];
6661 int count2, count3;
6662 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006663};
6664
6665static PyObject*
6666encoding_map_size(PyObject *obj, PyObject* args)
6667{
6668 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006669 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006671}
6672
6673static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006674 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 PyDoc_STR("Return the size (in bytes) of this object") },
6676 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677};
6678
6679static void
6680encoding_map_dealloc(PyObject* o)
6681{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006682 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006683}
6684
6685static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006686 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 "EncodingMap", /*tp_name*/
6688 sizeof(struct encoding_map), /*tp_basicsize*/
6689 0, /*tp_itemsize*/
6690 /* methods */
6691 encoding_map_dealloc, /*tp_dealloc*/
6692 0, /*tp_print*/
6693 0, /*tp_getattr*/
6694 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006695 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 0, /*tp_repr*/
6697 0, /*tp_as_number*/
6698 0, /*tp_as_sequence*/
6699 0, /*tp_as_mapping*/
6700 0, /*tp_hash*/
6701 0, /*tp_call*/
6702 0, /*tp_str*/
6703 0, /*tp_getattro*/
6704 0, /*tp_setattro*/
6705 0, /*tp_as_buffer*/
6706 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6707 0, /*tp_doc*/
6708 0, /*tp_traverse*/
6709 0, /*tp_clear*/
6710 0, /*tp_richcompare*/
6711 0, /*tp_weaklistoffset*/
6712 0, /*tp_iter*/
6713 0, /*tp_iternext*/
6714 encoding_map_methods, /*tp_methods*/
6715 0, /*tp_members*/
6716 0, /*tp_getset*/
6717 0, /*tp_base*/
6718 0, /*tp_dict*/
6719 0, /*tp_descr_get*/
6720 0, /*tp_descr_set*/
6721 0, /*tp_dictoffset*/
6722 0, /*tp_init*/
6723 0, /*tp_alloc*/
6724 0, /*tp_new*/
6725 0, /*tp_free*/
6726 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006727};
6728
6729PyObject*
6730PyUnicode_BuildEncodingMap(PyObject* string)
6731{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006732 PyObject *result;
6733 struct encoding_map *mresult;
6734 int i;
6735 int need_dict = 0;
6736 unsigned char level1[32];
6737 unsigned char level2[512];
6738 unsigned char *mlevel1, *mlevel2, *mlevel3;
6739 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 int kind;
6741 void *data;
6742 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006745 PyErr_BadArgument();
6746 return NULL;
6747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006748 kind = PyUnicode_KIND(string);
6749 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006750 memset(level1, 0xFF, sizeof level1);
6751 memset(level2, 0xFF, sizeof level2);
6752
6753 /* If there isn't a one-to-one mapping of NULL to \0,
6754 or if there are non-BMP characters, we need to use
6755 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006756 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006757 need_dict = 1;
6758 for (i = 1; i < 256; i++) {
6759 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006760 ch = PyUnicode_READ(kind, data, i);
6761 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006762 need_dict = 1;
6763 break;
6764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006765 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006766 /* unmapped character */
6767 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006768 l1 = ch >> 11;
6769 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006770 if (level1[l1] == 0xFF)
6771 level1[l1] = count2++;
6772 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006773 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006774 }
6775
6776 if (count2 >= 0xFF || count3 >= 0xFF)
6777 need_dict = 1;
6778
6779 if (need_dict) {
6780 PyObject *result = PyDict_New();
6781 PyObject *key, *value;
6782 if (!result)
6783 return NULL;
6784 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006786 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006787 if (!key || !value)
6788 goto failed1;
6789 if (PyDict_SetItem(result, key, value) == -1)
6790 goto failed1;
6791 Py_DECREF(key);
6792 Py_DECREF(value);
6793 }
6794 return result;
6795 failed1:
6796 Py_XDECREF(key);
6797 Py_XDECREF(value);
6798 Py_DECREF(result);
6799 return NULL;
6800 }
6801
6802 /* Create a three-level trie */
6803 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6804 16*count2 + 128*count3 - 1);
6805 if (!result)
6806 return PyErr_NoMemory();
6807 PyObject_Init(result, &EncodingMapType);
6808 mresult = (struct encoding_map*)result;
6809 mresult->count2 = count2;
6810 mresult->count3 = count3;
6811 mlevel1 = mresult->level1;
6812 mlevel2 = mresult->level23;
6813 mlevel3 = mresult->level23 + 16*count2;
6814 memcpy(mlevel1, level1, 32);
6815 memset(mlevel2, 0xFF, 16*count2);
6816 memset(mlevel3, 0, 128*count3);
6817 count3 = 0;
6818 for (i = 1; i < 256; i++) {
6819 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006821 /* unmapped character */
6822 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006823 o1 = PyUnicode_READ(kind, data, i)>>11;
6824 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006825 i2 = 16*mlevel1[o1] + o2;
6826 if (mlevel2[i2] == 0xFF)
6827 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006829 i3 = 128*mlevel2[i2] + o3;
6830 mlevel3[i3] = i;
6831 }
6832 return result;
6833}
6834
6835static int
6836encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6837{
6838 struct encoding_map *map = (struct encoding_map*)mapping;
6839 int l1 = c>>11;
6840 int l2 = (c>>7) & 0xF;
6841 int l3 = c & 0x7F;
6842 int i;
6843
6844#ifdef Py_UNICODE_WIDE
6845 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006847 }
6848#endif
6849 if (c == 0)
6850 return 0;
6851 /* level 1*/
6852 i = map->level1[l1];
6853 if (i == 0xFF) {
6854 return -1;
6855 }
6856 /* level 2*/
6857 i = map->level23[16*i+l2];
6858 if (i == 0xFF) {
6859 return -1;
6860 }
6861 /* level 3 */
6862 i = map->level23[16*map->count2 + 128*i + l3];
6863 if (i == 0) {
6864 return -1;
6865 }
6866 return i;
6867}
6868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869/* Lookup the character ch in the mapping. If the character
6870 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006871 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006872static PyObject *
6873charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Christian Heimes217cfd12007-12-02 14:31:20 +00006875 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 PyObject *x;
6877
6878 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880 x = PyObject_GetItem(mapping, w);
6881 Py_DECREF(w);
6882 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6884 /* No mapping found means: mapping is undefined. */
6885 PyErr_Clear();
6886 x = Py_None;
6887 Py_INCREF(x);
6888 return x;
6889 } else
6890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006892 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006894 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 long value = PyLong_AS_LONG(x);
6896 if (value < 0 || value > 255) {
6897 PyErr_SetString(PyExc_TypeError,
6898 "character mapping must be in range(256)");
6899 Py_DECREF(x);
6900 return NULL;
6901 }
6902 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006904 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 /* wrong return value */
6908 PyErr_Format(PyExc_TypeError,
6909 "character mapping must return integer, bytes or None, not %.400s",
6910 x->ob_type->tp_name);
6911 Py_DECREF(x);
6912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 }
6914}
6915
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006917charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006918{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006919 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6920 /* exponentially overallocate to minimize reallocations */
6921 if (requiredsize < 2*outsize)
6922 requiredsize = 2*outsize;
6923 if (_PyBytes_Resize(outobj, requiredsize))
6924 return -1;
6925 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006926}
6927
/* Result codes of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006932 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933 space is available. Return a new reference to the object that
6934 was put in the output buffer, or Py_None, if the mapping was undefined
6935 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006936 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937static charmapencode_result
6938charmapencode_output(Py_UNICODE c, PyObject *mapping,
6939 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006941 PyObject *rep;
6942 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006943 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944
Christian Heimes90aa7642007-12-19 02:45:37 +00006945 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006946 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006948 if (res == -1)
6949 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 if (outsize<requiredsize)
6951 if (charmapencode_resize(outobj, outpos, requiredsize))
6952 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006953 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 outstart[(*outpos)++] = (char)res;
6955 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006956 }
6957
6958 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006961 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 Py_DECREF(rep);
6963 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006964 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (PyLong_Check(rep)) {
6966 Py_ssize_t requiredsize = *outpos+1;
6967 if (outsize<requiredsize)
6968 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6969 Py_DECREF(rep);
6970 return enc_EXCEPTION;
6971 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006972 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 else {
6976 const char *repchars = PyBytes_AS_STRING(rep);
6977 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6978 Py_ssize_t requiredsize = *outpos+repsize;
6979 if (outsize<requiredsize)
6980 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6981 Py_DECREF(rep);
6982 return enc_EXCEPTION;
6983 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006984 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 memcpy(outstart + *outpos, repchars, repsize);
6986 *outpos += repsize;
6987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989 Py_DECREF(rep);
6990 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991}
6992
6993/* handle an error in PyUnicode_EncodeCharmap
6994 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006995static int
6996charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006999 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007000 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001{
7002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003 Py_ssize_t repsize;
7004 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 Py_UNICODE *uni2;
7006 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007007 Py_ssize_t collstartpos = *inpos;
7008 Py_ssize_t collendpos = *inpos+1;
7009 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 char *encoding = "charmap";
7011 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007012 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 /* find all unencodable characters */
7015 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007016 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007017 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 int res = encoding_map_lookup(p[collendpos], mapping);
7019 if (res != -1)
7020 break;
7021 ++collendpos;
7022 continue;
7023 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 rep = charmapencode_lookup(p[collendpos], mapping);
7026 if (rep==NULL)
7027 return -1;
7028 else if (rep!=Py_None) {
7029 Py_DECREF(rep);
7030 break;
7031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 }
7035 /* cache callback name lookup
7036 * (if not done yet, i.e. it's the first error) */
7037 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 if ((errors==NULL) || (!strcmp(errors, "strict")))
7039 *known_errorHandler = 1;
7040 else if (!strcmp(errors, "replace"))
7041 *known_errorHandler = 2;
7042 else if (!strcmp(errors, "ignore"))
7043 *known_errorHandler = 3;
7044 else if (!strcmp(errors, "xmlcharrefreplace"))
7045 *known_errorHandler = 4;
7046 else
7047 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 }
7049 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007050 case 1: /* strict */
7051 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7052 return -1;
7053 case 2: /* replace */
7054 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 x = charmapencode_output('?', mapping, res, respos);
7056 if (x==enc_EXCEPTION) {
7057 return -1;
7058 }
7059 else if (x==enc_FAILED) {
7060 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7061 return -1;
7062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007063 }
7064 /* fall through */
7065 case 3: /* ignore */
7066 *inpos = collendpos;
7067 break;
7068 case 4: /* xmlcharrefreplace */
7069 /* generate replacement (temporarily (mis)uses p) */
7070 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 char buffer[2+29+1+1];
7072 char *cp;
7073 sprintf(buffer, "&#%d;", (int)p[collpos]);
7074 for (cp = buffer; *cp; ++cp) {
7075 x = charmapencode_output(*cp, mapping, res, respos);
7076 if (x==enc_EXCEPTION)
7077 return -1;
7078 else if (x==enc_FAILED) {
7079 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7080 return -1;
7081 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007082 }
7083 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 *inpos = collendpos;
7085 break;
7086 default:
7087 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 encoding, reason, p, size, exceptionObject,
7089 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007092 if (PyBytes_Check(repunicode)) {
7093 /* Directly copy bytes result to output. */
7094 Py_ssize_t outsize = PyBytes_Size(*res);
7095 Py_ssize_t requiredsize;
7096 repsize = PyBytes_Size(repunicode);
7097 requiredsize = *respos + repsize;
7098 if (requiredsize > outsize)
7099 /* Make room for all additional bytes. */
7100 if (charmapencode_resize(res, respos, requiredsize)) {
7101 Py_DECREF(repunicode);
7102 return -1;
7103 }
7104 memcpy(PyBytes_AsString(*res) + *respos,
7105 PyBytes_AsString(repunicode), repsize);
7106 *respos += repsize;
7107 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007108 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007109 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007111 /* generate replacement */
7112 repsize = PyUnicode_GET_SIZE(repunicode);
7113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 x = charmapencode_output(*uni2, mapping, res, respos);
7115 if (x==enc_EXCEPTION) {
7116 return -1;
7117 }
7118 else if (x==enc_FAILED) {
7119 Py_DECREF(repunicode);
7120 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7121 return -1;
7122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007123 }
7124 *inpos = newpos;
7125 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007126 }
7127 return 0;
7128}
7129
Alexander Belopolsky40018472011-02-26 01:02:56 +00007130PyObject *
7131PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7132 Py_ssize_t size,
7133 PyObject *mapping,
7134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136 /* output object */
7137 PyObject *res = NULL;
7138 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007141 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 PyObject *errorHandler = NULL;
7143 PyObject *exc = NULL;
7144 /* the following variable is used for caching string comparisons
7145 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7146 * 3=ignore, 4=xmlcharrefreplace */
7147 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149 /* Default to Latin-1 */
7150 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 /* allocate enough for a simple encoding without
7154 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007155 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007156 if (res == NULL)
7157 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007158 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 /* try to encode it */
7163 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7164 if (x==enc_EXCEPTION) /* error */
7165 goto onError;
7166 if (x==enc_FAILED) { /* unencodable character */
7167 if (charmap_encoding_error(p, size, &inpos, mapping,
7168 &exc,
7169 &known_errorHandler, &errorHandler, errors,
7170 &res, &respos)) {
7171 goto onError;
7172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 else
7175 /* done with this character => adjust input position */
7176 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007180 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007181 if (_PyBytes_Resize(&res, respos) < 0)
7182 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 Py_XDECREF(exc);
7185 Py_XDECREF(errorHandler);
7186 return res;
7187
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 Py_XDECREF(res);
7190 Py_XDECREF(exc);
7191 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 return NULL;
7193}
7194
Alexander Belopolsky40018472011-02-26 01:02:56 +00007195PyObject *
7196PyUnicode_AsCharmapString(PyObject *unicode,
7197 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
7199 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 PyErr_BadArgument();
7201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 }
7203 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 PyUnicode_GET_SIZE(unicode),
7205 mapping,
7206 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210static void
7211make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007213 Py_ssize_t startpos, Py_ssize_t endpos,
7214 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007217 *exceptionObject = _PyUnicodeTranslateError_Create(
7218 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 }
7220 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7222 goto onError;
7223 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7224 goto onError;
7225 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7226 goto onError;
7227 return;
7228 onError:
7229 Py_DECREF(*exceptionObject);
7230 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
7232}
7233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007235static void
7236raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238 Py_ssize_t startpos, Py_ssize_t endpos,
7239 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240{
7241 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007242 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245}
7246
7247/* error handling callback helper:
7248 build arguments, call the callback and check the arguments,
7249 put the result into newpos and return the replacement string, which
7250 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251static PyObject *
7252unicode_translate_call_errorhandler(const char *errors,
7253 PyObject **errorHandler,
7254 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007255 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256 Py_ssize_t startpos, Py_ssize_t endpos,
7257 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007259 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007261 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 PyObject *restuple;
7263 PyObject *resunicode;
7264
7265 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 }
7270
7271 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275
7276 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007280 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007281 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 Py_DECREF(restuple);
7283 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 }
7285 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 &resunicode, &i_newpos)) {
7287 Py_DECREF(restuple);
7288 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007289 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 else
7293 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007294 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7296 Py_DECREF(restuple);
7297 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007298 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 Py_INCREF(resunicode);
7300 Py_DECREF(restuple);
7301 return resunicode;
7302}
7303
7304/* Lookup the character ch in the mapping and put the result in result,
7305 which must be decrefed by the caller.
7306 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007307static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309{
Christian Heimes217cfd12007-12-02 14:31:20 +00007310 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311 PyObject *x;
7312
7313 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007315 x = PyObject_GetItem(mapping, w);
7316 Py_DECREF(w);
7317 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7319 /* No mapping found means: use 1:1 mapping. */
7320 PyErr_Clear();
7321 *result = NULL;
7322 return 0;
7323 } else
7324 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325 }
7326 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 *result = x;
7328 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007330 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 long value = PyLong_AS_LONG(x);
7332 long max = PyUnicode_GetMax();
7333 if (value < 0 || value > max) {
7334 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007335 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 Py_DECREF(x);
7337 return -1;
7338 }
7339 *result = x;
7340 return 0;
7341 }
7342 else if (PyUnicode_Check(x)) {
7343 *result = x;
7344 return 0;
7345 }
7346 else {
7347 /* wrong return value */
7348 PyErr_SetString(PyExc_TypeError,
7349 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007350 Py_DECREF(x);
7351 return -1;
7352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353}
7354/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 if not reallocate and adjust various state variables.
7356 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007357static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007358charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007362 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 /* exponentially overallocate to minimize reallocations */
7364 if (requiredsize < 2 * oldsize)
7365 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007366 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7367 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 }
7371 return 0;
7372}
7373/* lookup the character, put the result in the output string and adjust
7374 various state variables. Return a new reference to the object that
7375 was put in the output buffer in *result, or Py_None, if the mapping was
7376 undefined (in which case no character was written).
7377 The called must decref result.
7378 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7381 PyObject *mapping, Py_UCS4 **output,
7382 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007385 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7386 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 }
7392 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007394 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007397 }
7398 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399 Py_ssize_t repsize;
7400 if (PyUnicode_READY(*res) == -1)
7401 return -1;
7402 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 if (repsize==1) {
7404 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007405 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 }
7407 else if (repsize!=0) {
7408 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 Py_ssize_t requiredsize = *opos +
7410 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007412 Py_ssize_t i;
7413 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007415 for(i = 0; i < repsize; i++)
7416 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007418 }
7419 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007421 return 0;
7422}
7423
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425_PyUnicode_TranslateCharmap(PyObject *input,
7426 PyObject *mapping,
7427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 /* input object */
7430 char *idata;
7431 Py_ssize_t size, i;
7432 int kind;
7433 /* output buffer */
7434 Py_UCS4 *output = NULL;
7435 Py_ssize_t osize;
7436 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007439 char *reason = "character maps to <undefined>";
7440 PyObject *errorHandler = NULL;
7441 PyObject *exc = NULL;
7442 /* the following variable is used for caching string comparisons
7443 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7444 * 3=ignore, 4=xmlcharrefreplace */
7445 int known_errorHandler = -1;
7446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 PyErr_BadArgument();
7449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 if (PyUnicode_READY(input) == -1)
7453 return NULL;
7454 idata = (char*)PyUnicode_DATA(input);
7455 kind = PyUnicode_KIND(input);
7456 size = PyUnicode_GET_LENGTH(input);
7457 i = 0;
7458
7459 if (size == 0) {
7460 Py_INCREF(input);
7461 return input;
7462 }
7463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 /* allocate enough for a simple 1:1 translation without
7465 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 osize = size;
7467 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7468 opos = 0;
7469 if (output == NULL) {
7470 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* try to encode it */
7476 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 if (charmaptranslate_output(input, i, mapping,
7478 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 Py_XDECREF(x);
7480 goto onError;
7481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 else { /* untranslatable character */
7486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7487 Py_ssize_t repsize;
7488 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491 Py_ssize_t collstart = i;
7492 Py_ssize_t collend = i+1;
7493 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 while (collend < size) {
7497 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 goto onError;
7499 Py_XDECREF(x);
7500 if (x!=Py_None)
7501 break;
7502 ++collend;
7503 }
7504 /* cache callback name lookup
7505 * (if not done yet, i.e. it's the first error) */
7506 if (known_errorHandler==-1) {
7507 if ((errors==NULL) || (!strcmp(errors, "strict")))
7508 known_errorHandler = 1;
7509 else if (!strcmp(errors, "replace"))
7510 known_errorHandler = 2;
7511 else if (!strcmp(errors, "ignore"))
7512 known_errorHandler = 3;
7513 else if (!strcmp(errors, "xmlcharrefreplace"))
7514 known_errorHandler = 4;
7515 else
7516 known_errorHandler = 0;
7517 }
7518 switch (known_errorHandler) {
7519 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 raise_translate_exception(&exc, input, collstart,
7521 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007522 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 case 2: /* replace */
7524 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007525 for (coll = collstart; coll<collend; coll++)
7526 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 /* fall through */
7528 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 break;
7531 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532 /* generate replacement (temporarily (mis)uses i) */
7533 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 char buffer[2+29+1+1];
7535 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7537 if (charmaptranslate_makespace(&output, &osize,
7538 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 goto onError;
7540 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007541 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007543 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 break;
7545 default:
7546 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007547 reason, input, &exc,
7548 collstart, collend, &newpos);
7549 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 goto onError;
7551 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007552 repsize = PyUnicode_GET_LENGTH(repunicode);
7553 if (charmaptranslate_makespace(&output, &osize,
7554 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 Py_DECREF(repunicode);
7556 goto onError;
7557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007558 for (uni2 = 0; repsize-->0; ++uni2)
7559 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7560 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007563 }
7564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7566 if (!res)
7567 goto onError;
7568 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 Py_XDECREF(exc);
7570 Py_XDECREF(errorHandler);
7571 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007574 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575 Py_XDECREF(exc);
7576 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 return NULL;
7578}
7579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007580/* Deprecated. Use PyUnicode_Translate instead. */
7581PyObject *
7582PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7583 Py_ssize_t size,
7584 PyObject *mapping,
7585 const char *errors)
7586{
7587 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7588 if (!unicode)
7589 return NULL;
7590 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7591}
7592
Alexander Belopolsky40018472011-02-26 01:02:56 +00007593PyObject *
7594PyUnicode_Translate(PyObject *str,
7595 PyObject *mapping,
7596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
7598 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007599
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 str = PyUnicode_FromObject(str);
7601 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007603 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 Py_DECREF(str);
7605 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007606
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 Py_XDECREF(str);
7609 return NULL;
7610}
Tim Petersced69f82003-09-16 20:30:58 +00007611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612static Py_UCS4
7613fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7614{
7615 /* No need to call PyUnicode_READY(self) because this function is only
7616 called as a callback from fixup() which does it already. */
7617 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7618 const int kind = PyUnicode_KIND(self);
7619 void *data = PyUnicode_DATA(self);
7620 Py_UCS4 maxchar = 0, ch, fixed;
7621 Py_ssize_t i;
7622
7623 for (i = 0; i < len; ++i) {
7624 ch = PyUnicode_READ(kind, data, i);
7625 fixed = 0;
7626 if (ch > 127) {
7627 if (Py_UNICODE_ISSPACE(ch))
7628 fixed = ' ';
7629 else {
7630 const int decimal = Py_UNICODE_TODECIMAL(ch);
7631 if (decimal >= 0)
7632 fixed = '0' + decimal;
7633 }
7634 if (fixed != 0) {
7635 if (fixed > maxchar)
7636 maxchar = fixed;
7637 PyUnicode_WRITE(kind, data, i, fixed);
7638 }
7639 else if (ch > maxchar)
7640 maxchar = ch;
7641 }
7642 else if (ch > maxchar)
7643 maxchar = ch;
7644 }
7645
7646 return maxchar;
7647}
7648
7649PyObject *
7650_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7651{
7652 if (!PyUnicode_Check(unicode)) {
7653 PyErr_BadInternalCall();
7654 return NULL;
7655 }
7656 if (PyUnicode_READY(unicode) == -1)
7657 return NULL;
7658 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7659 /* If the string is already ASCII, just return the same string */
7660 Py_INCREF(unicode);
7661 return unicode;
7662 }
7663 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7664}
7665
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007666PyObject *
7667PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7668 Py_ssize_t length)
7669{
7670 PyObject *result;
7671 Py_UNICODE *p; /* write pointer into result */
7672 Py_ssize_t i;
7673 /* Copy to a new string */
7674 result = (PyObject *)_PyUnicode_New(length);
7675 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7676 if (result == NULL)
7677 return result;
7678 p = PyUnicode_AS_UNICODE(result);
7679 /* Iterate over code points */
7680 for (i = 0; i < length; i++) {
7681 Py_UNICODE ch =s[i];
7682 if (ch > 127) {
7683 int decimal = Py_UNICODE_TODECIMAL(ch);
7684 if (decimal >= 0)
7685 p[i] = '0' + decimal;
7686 }
7687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7689 Py_DECREF(result);
7690 return NULL;
7691 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007692 return result;
7693}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007694/* --- Decimal Encoder ---------------------------------------------------- */
7695
Alexander Belopolsky40018472011-02-26 01:02:56 +00007696int
7697PyUnicode_EncodeDecimal(Py_UNICODE *s,
7698 Py_ssize_t length,
7699 char *output,
7700 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007701{
7702 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 PyObject *errorHandler = NULL;
7704 PyObject *exc = NULL;
7705 const char *encoding = "decimal";
7706 const char *reason = "invalid decimal Unicode string";
7707 /* the following variable is used for caching string comparisons
7708 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7709 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007710
7711 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 PyErr_BadArgument();
7713 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007714 }
7715
7716 p = s;
7717 end = s + length;
7718 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 register Py_UNICODE ch = *p;
7720 int decimal;
7721 PyObject *repunicode;
7722 Py_ssize_t repsize;
7723 Py_ssize_t newpos;
7724 Py_UNICODE *uni2;
7725 Py_UNICODE *collstart;
7726 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007727
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007729 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 ++p;
7731 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 decimal = Py_UNICODE_TODECIMAL(ch);
7734 if (decimal >= 0) {
7735 *output++ = '0' + decimal;
7736 ++p;
7737 continue;
7738 }
7739 if (0 < ch && ch < 256) {
7740 *output++ = (char)ch;
7741 ++p;
7742 continue;
7743 }
7744 /* All other characters are considered unencodable */
7745 collstart = p;
7746 collend = p+1;
7747 while (collend < end) {
7748 if ((0 < *collend && *collend < 256) ||
7749 !Py_UNICODE_ISSPACE(*collend) ||
7750 Py_UNICODE_TODECIMAL(*collend))
7751 break;
7752 }
7753 /* cache callback name lookup
7754 * (if not done yet, i.e. it's the first error) */
7755 if (known_errorHandler==-1) {
7756 if ((errors==NULL) || (!strcmp(errors, "strict")))
7757 known_errorHandler = 1;
7758 else if (!strcmp(errors, "replace"))
7759 known_errorHandler = 2;
7760 else if (!strcmp(errors, "ignore"))
7761 known_errorHandler = 3;
7762 else if (!strcmp(errors, "xmlcharrefreplace"))
7763 known_errorHandler = 4;
7764 else
7765 known_errorHandler = 0;
7766 }
7767 switch (known_errorHandler) {
7768 case 1: /* strict */
7769 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7770 goto onError;
7771 case 2: /* replace */
7772 for (p = collstart; p < collend; ++p)
7773 *output++ = '?';
7774 /* fall through */
7775 case 3: /* ignore */
7776 p = collend;
7777 break;
7778 case 4: /* xmlcharrefreplace */
7779 /* generate replacement (temporarily (mis)uses p) */
7780 for (p = collstart; p < collend; ++p)
7781 output += sprintf(output, "&#%d;", (int)*p);
7782 p = collend;
7783 break;
7784 default:
7785 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7786 encoding, reason, s, length, &exc,
7787 collstart-s, collend-s, &newpos);
7788 if (repunicode == NULL)
7789 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007790 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007791 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007792 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7793 Py_DECREF(repunicode);
7794 goto onError;
7795 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 /* generate replacement */
7797 repsize = PyUnicode_GET_SIZE(repunicode);
7798 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7799 Py_UNICODE ch = *uni2;
7800 if (Py_UNICODE_ISSPACE(ch))
7801 *output++ = ' ';
7802 else {
7803 decimal = Py_UNICODE_TODECIMAL(ch);
7804 if (decimal >= 0)
7805 *output++ = '0' + decimal;
7806 else if (0 < ch && ch < 256)
7807 *output++ = (char)ch;
7808 else {
7809 Py_DECREF(repunicode);
7810 raise_encode_exception(&exc, encoding,
7811 s, length, collstart-s, collend-s, reason);
7812 goto onError;
7813 }
7814 }
7815 }
7816 p = s + newpos;
7817 Py_DECREF(repunicode);
7818 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007819 }
7820 /* 0-terminate the output string */
7821 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822 Py_XDECREF(exc);
7823 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007824 return 0;
7825
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 Py_XDECREF(exc);
7828 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007829 return -1;
7830}
7831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832/* --- Helpers ------------------------------------------------------------ */
7833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834#include "stringlib/ucs1lib.h"
7835#include "stringlib/fastsearch.h"
7836#include "stringlib/partition.h"
7837#include "stringlib/split.h"
7838#include "stringlib/count.h"
7839#include "stringlib/find.h"
7840#include "stringlib/localeutil.h"
7841#include "stringlib/undef.h"
7842
7843#include "stringlib/ucs2lib.h"
7844#include "stringlib/fastsearch.h"
7845#include "stringlib/partition.h"
7846#include "stringlib/split.h"
7847#include "stringlib/count.h"
7848#include "stringlib/find.h"
7849#include "stringlib/localeutil.h"
7850#include "stringlib/undef.h"
7851
7852#include "stringlib/ucs4lib.h"
7853#include "stringlib/fastsearch.h"
7854#include "stringlib/partition.h"
7855#include "stringlib/split.h"
7856#include "stringlib/count.h"
7857#include "stringlib/find.h"
7858#include "stringlib/localeutil.h"
7859#include "stringlib/undef.h"
7860
7861static Py_ssize_t
7862any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7863 const Py_UCS1*, Py_ssize_t,
7864 Py_ssize_t, Py_ssize_t),
7865 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7866 const Py_UCS2*, Py_ssize_t,
7867 Py_ssize_t, Py_ssize_t),
7868 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7869 const Py_UCS4*, Py_ssize_t,
7870 Py_ssize_t, Py_ssize_t),
7871 PyObject* s1, PyObject* s2,
7872 Py_ssize_t start,
7873 Py_ssize_t end)
7874{
7875 int kind1, kind2, kind;
7876 void *buf1, *buf2;
7877 Py_ssize_t len1, len2, result;
7878
7879 kind1 = PyUnicode_KIND(s1);
7880 kind2 = PyUnicode_KIND(s2);
7881 kind = kind1 > kind2 ? kind1 : kind2;
7882 buf1 = PyUnicode_DATA(s1);
7883 buf2 = PyUnicode_DATA(s2);
7884 if (kind1 != kind)
7885 buf1 = _PyUnicode_AsKind(s1, kind);
7886 if (!buf1)
7887 return -2;
7888 if (kind2 != kind)
7889 buf2 = _PyUnicode_AsKind(s2, kind);
7890 if (!buf2) {
7891 if (kind1 != kind) PyMem_Free(buf1);
7892 return -2;
7893 }
7894 len1 = PyUnicode_GET_LENGTH(s1);
7895 len2 = PyUnicode_GET_LENGTH(s2);
7896
7897 switch(kind) {
7898 case PyUnicode_1BYTE_KIND:
7899 result = ucs1(buf1, len1, buf2, len2, start, end);
7900 break;
7901 case PyUnicode_2BYTE_KIND:
7902 result = ucs2(buf1, len1, buf2, len2, start, end);
7903 break;
7904 case PyUnicode_4BYTE_KIND:
7905 result = ucs4(buf1, len1, buf2, len2, start, end);
7906 break;
7907 default:
7908 assert(0); result = -2;
7909 }
7910
7911 if (kind1 != kind)
7912 PyMem_Free(buf1);
7913 if (kind2 != kind)
7914 PyMem_Free(buf2);
7915
7916 return result;
7917}
7918
7919Py_ssize_t
7920_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7921 Py_ssize_t n_buffer,
7922 void *digits, Py_ssize_t n_digits,
7923 Py_ssize_t min_width,
7924 const char *grouping,
7925 const char *thousands_sep)
7926{
7927 switch(kind) {
7928 case PyUnicode_1BYTE_KIND:
7929 return _PyUnicode_ucs1_InsertThousandsGrouping(
7930 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7931 min_width, grouping, thousands_sep);
7932 case PyUnicode_2BYTE_KIND:
7933 return _PyUnicode_ucs2_InsertThousandsGrouping(
7934 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7935 min_width, grouping, thousands_sep);
7936 case PyUnicode_4BYTE_KIND:
7937 return _PyUnicode_ucs4_InsertThousandsGrouping(
7938 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7939 min_width, grouping, thousands_sep);
7940 }
7941 assert(0);
7942 return -1;
7943}
7944
7945
Eric Smith8c663262007-08-25 02:26:07 +00007946#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007948
Thomas Wouters477c8d52006-05-27 19:21:47 +00007949#include "stringlib/count.h"
7950#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007951
/* helper macro to fixup start/end slice values

   Clamps end to len, and maps negative start/end values to positions
   counted from the end of the sequence (never going below 0).  start and
   end must be modifiable lvalues.

   The arguments are parenthesized so that compound expressions (such as
   a conditional expression for len) are evaluated as a unit.

   Note: this expands to two separate if statements, not a single
   statement; do not use it as the unbraced body of an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007966
Alexander Belopolsky40018472011-02-26 01:02:56 +00007967Py_ssize_t
7968PyUnicode_Count(PyObject *str,
7969 PyObject *substr,
7970 Py_ssize_t start,
7971 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007974 PyUnicodeObject* str_obj;
7975 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 int kind1, kind2, kind;
7977 void *buf1 = NULL, *buf2 = NULL;
7978 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007979
Thomas Wouters477c8d52006-05-27 19:21:47 +00007980 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007983 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007984 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 Py_DECREF(str_obj);
7986 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 }
Tim Petersced69f82003-09-16 20:30:58 +00007988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 kind1 = PyUnicode_KIND(str_obj);
7990 kind2 = PyUnicode_KIND(sub_obj);
7991 kind = kind1 > kind2 ? kind1 : kind2;
7992 buf1 = PyUnicode_DATA(str_obj);
7993 if (kind1 != kind)
7994 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7995 if (!buf1)
7996 goto onError;
7997 buf2 = PyUnicode_DATA(sub_obj);
7998 if (kind2 != kind)
7999 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8000 if (!buf2)
8001 goto onError;
8002 len1 = PyUnicode_GET_LENGTH(str_obj);
8003 len2 = PyUnicode_GET_LENGTH(sub_obj);
8004
8005 ADJUST_INDICES(start, end, len1);
8006 switch(kind) {
8007 case PyUnicode_1BYTE_KIND:
8008 result = ucs1lib_count(
8009 ((Py_UCS1*)buf1) + start, end - start,
8010 buf2, len2, PY_SSIZE_T_MAX
8011 );
8012 break;
8013 case PyUnicode_2BYTE_KIND:
8014 result = ucs2lib_count(
8015 ((Py_UCS2*)buf1) + start, end - start,
8016 buf2, len2, PY_SSIZE_T_MAX
8017 );
8018 break;
8019 case PyUnicode_4BYTE_KIND:
8020 result = ucs4lib_count(
8021 ((Py_UCS4*)buf1) + start, end - start,
8022 buf2, len2, PY_SSIZE_T_MAX
8023 );
8024 break;
8025 default:
8026 assert(0); result = 0;
8027 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008028
8029 Py_DECREF(sub_obj);
8030 Py_DECREF(str_obj);
8031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 if (kind1 != kind)
8033 PyMem_Free(buf1);
8034 if (kind2 != kind)
8035 PyMem_Free(buf2);
8036
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 onError:
8039 Py_DECREF(sub_obj);
8040 Py_DECREF(str_obj);
8041 if (kind1 != kind && buf1)
8042 PyMem_Free(buf1);
8043 if (kind2 != kind && buf2)
8044 PyMem_Free(buf2);
8045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046}
8047
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048Py_ssize_t
8049PyUnicode_Find(PyObject *str,
8050 PyObject *sub,
8051 Py_ssize_t start,
8052 Py_ssize_t end,
8053 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008060 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 Py_DECREF(str);
8063 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
Tim Petersced69f82003-09-16 20:30:58 +00008065
Thomas Wouters477c8d52006-05-27 19:21:47 +00008066 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008067 result = any_find_slice(
8068 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8069 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008070 );
8071 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 result = any_find_slice(
8073 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8074 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075 );
8076
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008078 Py_DECREF(sub);
8079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 return result;
8081}
8082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083Py_ssize_t
8084PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8085 Py_ssize_t start, Py_ssize_t end,
8086 int direction)
8087{
8088 char *result;
8089 int kind;
8090 if (PyUnicode_READY(str) == -1)
8091 return -2;
8092 if (end > PyUnicode_GET_LENGTH(str))
8093 end = PyUnicode_GET_LENGTH(str);
8094 kind = PyUnicode_KIND(str);
8095 result = findchar(PyUnicode_1BYTE_DATA(str)
8096 + PyUnicode_KIND_SIZE(kind, start),
8097 kind,
8098 end-start, ch, direction);
8099 if (!result)
8100 return -1;
8101 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8102}
8103
Alexander Belopolsky40018472011-02-26 01:02:56 +00008104static int
8105tailmatch(PyUnicodeObject *self,
8106 PyUnicodeObject *substring,
8107 Py_ssize_t start,
8108 Py_ssize_t end,
8109 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 int kind_self;
8112 int kind_sub;
8113 void *data_self;
8114 void *data_sub;
8115 Py_ssize_t offset;
8116 Py_ssize_t i;
8117 Py_ssize_t end_sub;
8118
8119 if (PyUnicode_READY(self) == -1 ||
8120 PyUnicode_READY(substring) == -1)
8121 return 0;
8122
8123 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 return 1;
8125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8127 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 kind_self = PyUnicode_KIND(self);
8132 data_self = PyUnicode_DATA(self);
8133 kind_sub = PyUnicode_KIND(substring);
8134 data_sub = PyUnicode_DATA(substring);
8135 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8136
8137 if (direction > 0)
8138 offset = end;
8139 else
8140 offset = start;
8141
8142 if (PyUnicode_READ(kind_self, data_self, offset) ==
8143 PyUnicode_READ(kind_sub, data_sub, 0) &&
8144 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8145 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8146 /* If both are of the same kind, memcmp is sufficient */
8147 if (kind_self == kind_sub) {
8148 return ! memcmp((char *)data_self +
8149 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8150 data_sub,
8151 PyUnicode_GET_LENGTH(substring) *
8152 PyUnicode_CHARACTER_SIZE(substring));
8153 }
8154 /* otherwise we have to compare each character by first accesing it */
8155 else {
8156 /* We do not need to compare 0 and len(substring)-1 because
8157 the if statement above ensured already that they are equal
8158 when we end up here. */
8159 // TODO: honor direction and do a forward or backwards search
8160 for (i = 1; i < end_sub; ++i) {
8161 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8162 PyUnicode_READ(kind_sub, data_sub, i))
8163 return 0;
8164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 }
8168
8169 return 0;
8170}
8171
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172Py_ssize_t
8173PyUnicode_Tailmatch(PyObject *str,
8174 PyObject *substr,
8175 Py_ssize_t start,
8176 Py_ssize_t end,
8177 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008179 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008180
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 str = PyUnicode_FromObject(str);
8182 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 substr = PyUnicode_FromObject(substr);
8185 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 Py_DECREF(str);
8187 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 }
Tim Petersced69f82003-09-16 20:30:58 +00008189
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 (PyUnicodeObject *)substr,
8192 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 Py_DECREF(str);
8194 Py_DECREF(substr);
8195 return result;
8196}
8197
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198/* Apply fixfct filter to the Unicode object self and return a
8199 reference to the modified object */
8200
Alexander Belopolsky40018472011-02-26 01:02:56 +00008201static PyObject *
8202fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 PyObject *u;
8206 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 if (PyUnicode_READY(self) == -1)
8209 return NULL;
8210 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8211 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8212 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8217 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 /* fix functions return the new maximum character in a string,
8220 if the kind of the resulting unicode object does not change,
8221 everything is fine. Otherwise we need to change the string kind
8222 and re-run the fix function. */
8223 maxchar_new = fixfct((PyUnicodeObject*)u);
8224 if (maxchar_new == 0)
8225 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8226 else if (maxchar_new <= 127)
8227 maxchar_new = 127;
8228 else if (maxchar_new <= 255)
8229 maxchar_new = 255;
8230 else if (maxchar_new <= 65535)
8231 maxchar_new = 65535;
8232 else
8233 maxchar_new = 1114111; /* 0x10ffff */
8234
8235 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 /* fixfct should return TRUE if it modified the buffer. If
8237 FALSE, return a reference to the original buffer instead
8238 (to save space, not time) */
8239 Py_INCREF(self);
8240 Py_DECREF(u);
8241 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 else if (maxchar_new == maxchar_old) {
8244 return u;
8245 }
8246 else {
8247 /* In case the maximum character changed, we need to
8248 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008249 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 if (v == NULL) {
8251 Py_DECREF(u);
8252 return NULL;
8253 }
8254 if (maxchar_new > maxchar_old) {
8255 /* If the maxchar increased so that the kind changed, not all
8256 characters are representable anymore and we need to fix the
8257 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008258 if (PyUnicode_CopyCharacters(v, 0,
8259 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008260 PyUnicode_GET_LENGTH(self)) < 0)
8261 {
8262 Py_DECREF(u);
8263 return NULL;
8264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 maxchar_old = fixfct((PyUnicodeObject*)v);
8266 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8267 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008268 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008269 if (PyUnicode_CopyCharacters(v, 0,
8270 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008271 PyUnicode_GET_LENGTH(self)) < 0)
8272 {
8273 Py_DECREF(u);
8274 return NULL;
8275 }
8276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277
8278 Py_DECREF(u);
8279 return v;
8280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281}
8282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008284fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 /* No need to call PyUnicode_READY(self) because this function is only
8287 called as a callback from fixup() which does it already. */
8288 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8289 const int kind = PyUnicode_KIND(self);
8290 void *data = PyUnicode_DATA(self);
8291 int touched = 0;
8292 Py_UCS4 maxchar = 0;
8293 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 for (i = 0; i < len; ++i) {
8296 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8297 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8298 if (up != ch) {
8299 if (up > maxchar)
8300 maxchar = up;
8301 PyUnicode_WRITE(kind, data, i, up);
8302 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 else if (ch > maxchar)
8305 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 }
8307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 if (touched)
8309 return maxchar;
8310 else
8311 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312}
8313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008315fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8318 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8319 const int kind = PyUnicode_KIND(self);
8320 void *data = PyUnicode_DATA(self);
8321 int touched = 0;
8322 Py_UCS4 maxchar = 0;
8323 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 for(i = 0; i < len; ++i) {
8326 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8327 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8328 if (lo != ch) {
8329 if (lo > maxchar)
8330 maxchar = lo;
8331 PyUnicode_WRITE(kind, data, i, lo);
8332 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 else if (ch > maxchar)
8335 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 }
8337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 if (touched)
8339 return maxchar;
8340 else
8341 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342}
8343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8348 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8349 const int kind = PyUnicode_KIND(self);
8350 void *data = PyUnicode_DATA(self);
8351 int touched = 0;
8352 Py_UCS4 maxchar = 0;
8353 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 for(i = 0; i < len; ++i) {
8356 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8357 Py_UCS4 nu = 0;
8358
8359 if (Py_UNICODE_ISUPPER(ch))
8360 nu = Py_UNICODE_TOLOWER(ch);
8361 else if (Py_UNICODE_ISLOWER(ch))
8362 nu = Py_UNICODE_TOUPPER(ch);
8363
8364 if (nu != 0) {
8365 if (nu > maxchar)
8366 maxchar = nu;
8367 PyUnicode_WRITE(kind, data, i, nu);
8368 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 else if (ch > maxchar)
8371 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 }
8373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 if (touched)
8375 return maxchar;
8376 else
8377 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378}
8379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8384 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8385 const int kind = PyUnicode_KIND(self);
8386 void *data = PyUnicode_DATA(self);
8387 int touched = 0;
8388 Py_UCS4 maxchar = 0;
8389 Py_ssize_t i = 0;
8390 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008391
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008392 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394
8395 ch = PyUnicode_READ(kind, data, i);
8396 if (!Py_UNICODE_ISUPPER(ch)) {
8397 maxchar = Py_UNICODE_TOUPPER(ch);
8398 PyUnicode_WRITE(kind, data, i, maxchar);
8399 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 ++i;
8402 for(; i < len; ++i) {
8403 ch = PyUnicode_READ(kind, data, i);
8404 if (!Py_UNICODE_ISLOWER(ch)) {
8405 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8406 if (lo > maxchar)
8407 maxchar = lo;
8408 PyUnicode_WRITE(kind, data, i, lo);
8409 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 else if (ch > maxchar)
8412 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414
8415 if (touched)
8416 return maxchar;
8417 else
8418 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419}
8420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8425 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8426 const int kind = PyUnicode_KIND(self);
8427 void *data = PyUnicode_DATA(self);
8428 Py_UCS4 maxchar = 0;
8429 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 int previous_is_cased;
8431
8432 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 if (len == 1) {
8434 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8435 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8436 if (ti != ch) {
8437 PyUnicode_WRITE(kind, data, i, ti);
8438 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 }
8440 else
8441 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 for(; i < len; ++i) {
8445 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8446 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008447
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 nu = Py_UNICODE_TOTITLE(ch);
8452
8453 if (nu > maxchar)
8454 maxchar = nu;
8455 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008456
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (Py_UNICODE_ISLOWER(ch) ||
8458 Py_UNICODE_ISUPPER(ch) ||
8459 Py_UNICODE_ISTITLE(ch))
8460 previous_is_cased = 1;
8461 else
8462 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465}
8466
Tim Peters8ce9f162004-08-27 01:49:32 +00008467PyObject *
8468PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008471 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008473 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008474 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8475 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008476 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 Py_ssize_t sz, i, res_offset;
8478 Py_UCS4 maxchar = 0;
8479 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480
Tim Peters05eba1f2004-08-27 21:32:02 +00008481 fseq = PySequence_Fast(seq, "");
8482 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008484 }
8485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008486 /* NOTE: the following code can't call back into Python code,
8487 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008488 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008489
Tim Peters05eba1f2004-08-27 21:32:02 +00008490 seqlen = PySequence_Fast_GET_SIZE(fseq);
8491 /* If empty sequence, return u"". */
8492 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008495 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008496 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008497 /* If singleton sequence with an exact Unicode, return that. */
8498 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 item = items[0];
8500 if (PyUnicode_CheckExact(item)) {
8501 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 goto Done;
8504 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008505 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008506 else {
8507 /* Set up sep and seplen */
8508 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 /* fall back to a blank space separator */
8510 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008511 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008513 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008514 else {
8515 if (!PyUnicode_Check(separator)) {
8516 PyErr_Format(PyExc_TypeError,
8517 "separator: expected str instance,"
8518 " %.80s found",
8519 Py_TYPE(separator)->tp_name);
8520 goto onError;
8521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 if (PyUnicode_READY(separator) == -1)
8523 goto onError;
8524 sep = separator;
8525 seplen = PyUnicode_GET_LENGTH(separator);
8526 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8527 /* inc refcount to keep this code path symetric with the
8528 above case of a blank separator */
8529 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008530 }
8531 }
8532
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008533 /* There are at least two things to join, or else we have a subclass
8534 * of str in the sequence.
8535 * Do a pre-pass to figure out the total amount of space we'll
8536 * need (sz), and see whether all argument are strings.
8537 */
8538 sz = 0;
8539 for (i = 0; i < seqlen; i++) {
8540 const Py_ssize_t old_sz = sz;
8541 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 if (!PyUnicode_Check(item)) {
8543 PyErr_Format(PyExc_TypeError,
8544 "sequence item %zd: expected str instance,"
8545 " %.80s found",
8546 i, Py_TYPE(item)->tp_name);
8547 goto onError;
8548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 if (PyUnicode_READY(item) == -1)
8550 goto onError;
8551 sz += PyUnicode_GET_LENGTH(item);
8552 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8553 if (item_maxchar > maxchar)
8554 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008555 if (i != 0)
8556 sz += seplen;
8557 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8558 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008560 goto onError;
8561 }
8562 }
Tim Petersced69f82003-09-16 20:30:58 +00008563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008565 if (res == NULL)
8566 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008567
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008568 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008570 Py_ssize_t itemlen;
8571 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 /* Copy item, and maybe the separator. */
8574 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008575 if (PyUnicode_CopyCharacters(res, res_offset,
8576 sep, 0, seplen) < 0)
8577 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008580 if (PyUnicode_CopyCharacters(res, res_offset,
8581 item, 0, itemlen) < 0)
8582 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008586
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008588 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 Py_XDECREF(sep);
8590 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008593 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008595 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 return NULL;
8597}
8598
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width; it must not be
   PyUnicode_WCHAR_KIND.  Any kind other than the 1-byte and 2-byte kinds
   is treated as 4-byte.  All arguments are parenthesized in the
   expansion so that arbitrary expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8621
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622static PyUnicodeObject *
8623pad(PyUnicodeObject *self,
8624 Py_ssize_t left,
8625 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 PyObject *u;
8629 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008630 int kind;
8631 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632
8633 if (left < 0)
8634 left = 0;
8635 if (right < 0)
8636 right = 0;
8637
Tim Peters7a29bd52001-09-12 03:03:31 +00008638 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 Py_INCREF(self);
8640 return self;
8641 }
8642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8644 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008645 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8646 return NULL;
8647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8649 if (fill > maxchar)
8650 maxchar = fill;
8651 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008652 if (!u)
8653 return NULL;
8654
8655 kind = PyUnicode_KIND(u);
8656 data = PyUnicode_DATA(u);
8657 if (left)
8658 FILL(kind, data, fill, 0, left);
8659 if (right)
8660 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008661 if (PyUnicode_CopyCharacters(u, left,
8662 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008663 _PyUnicode_LENGTH(self)) < 0)
8664 {
8665 Py_DECREF(u);
8666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 }
8668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672
Alexander Belopolsky40018472011-02-26 01:02:56 +00008673PyObject *
8674PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677
8678 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 switch(PyUnicode_KIND(string)) {
8683 case PyUnicode_1BYTE_KIND:
8684 list = ucs1lib_splitlines(
8685 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8686 PyUnicode_GET_LENGTH(string), keepends);
8687 break;
8688 case PyUnicode_2BYTE_KIND:
8689 list = ucs2lib_splitlines(
8690 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8691 PyUnicode_GET_LENGTH(string), keepends);
8692 break;
8693 case PyUnicode_4BYTE_KIND:
8694 list = ucs4lib_splitlines(
8695 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8696 PyUnicode_GET_LENGTH(string), keepends);
8697 break;
8698 default:
8699 assert(0);
8700 list = 0;
8701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 Py_DECREF(string);
8703 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704}
8705
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706static PyObject *
8707split(PyUnicodeObject *self,
8708 PyUnicodeObject *substring,
8709 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 int kind1, kind2, kind;
8712 void *buf1, *buf2;
8713 Py_ssize_t len1, len2;
8714 PyObject* out;
8715
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008717 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 if (PyUnicode_READY(self) == -1)
8720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 if (substring == NULL)
8723 switch(PyUnicode_KIND(self)) {
8724 case PyUnicode_1BYTE_KIND:
8725 return ucs1lib_split_whitespace(
8726 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8727 PyUnicode_GET_LENGTH(self), maxcount
8728 );
8729 case PyUnicode_2BYTE_KIND:
8730 return ucs2lib_split_whitespace(
8731 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8732 PyUnicode_GET_LENGTH(self), maxcount
8733 );
8734 case PyUnicode_4BYTE_KIND:
8735 return ucs4lib_split_whitespace(
8736 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8737 PyUnicode_GET_LENGTH(self), maxcount
8738 );
8739 default:
8740 assert(0);
8741 return NULL;
8742 }
8743
8744 if (PyUnicode_READY(substring) == -1)
8745 return NULL;
8746
8747 kind1 = PyUnicode_KIND(self);
8748 kind2 = PyUnicode_KIND(substring);
8749 kind = kind1 > kind2 ? kind1 : kind2;
8750 buf1 = PyUnicode_DATA(self);
8751 buf2 = PyUnicode_DATA(substring);
8752 if (kind1 != kind)
8753 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8754 if (!buf1)
8755 return NULL;
8756 if (kind2 != kind)
8757 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8758 if (!buf2) {
8759 if (kind1 != kind) PyMem_Free(buf1);
8760 return NULL;
8761 }
8762 len1 = PyUnicode_GET_LENGTH(self);
8763 len2 = PyUnicode_GET_LENGTH(substring);
8764
8765 switch(kind) {
8766 case PyUnicode_1BYTE_KIND:
8767 out = ucs1lib_split(
8768 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8769 break;
8770 case PyUnicode_2BYTE_KIND:
8771 out = ucs2lib_split(
8772 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8773 break;
8774 case PyUnicode_4BYTE_KIND:
8775 out = ucs4lib_split(
8776 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8777 break;
8778 default:
8779 out = NULL;
8780 }
8781 if (kind1 != kind)
8782 PyMem_Free(buf1);
8783 if (kind2 != kind)
8784 PyMem_Free(buf2);
8785 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786}
8787
Alexander Belopolsky40018472011-02-26 01:02:56 +00008788static PyObject *
8789rsplit(PyUnicodeObject *self,
8790 PyUnicodeObject *substring,
8791 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 int kind1, kind2, kind;
8794 void *buf1, *buf2;
8795 Py_ssize_t len1, len2;
8796 PyObject* out;
8797
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008799 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 if (PyUnicode_READY(self) == -1)
8802 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 if (substring == NULL)
8805 switch(PyUnicode_KIND(self)) {
8806 case PyUnicode_1BYTE_KIND:
8807 return ucs1lib_rsplit_whitespace(
8808 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8809 PyUnicode_GET_LENGTH(self), maxcount
8810 );
8811 case PyUnicode_2BYTE_KIND:
8812 return ucs2lib_rsplit_whitespace(
8813 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8814 PyUnicode_GET_LENGTH(self), maxcount
8815 );
8816 case PyUnicode_4BYTE_KIND:
8817 return ucs4lib_rsplit_whitespace(
8818 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8819 PyUnicode_GET_LENGTH(self), maxcount
8820 );
8821 default:
8822 assert(0);
8823 return NULL;
8824 }
8825
8826 if (PyUnicode_READY(substring) == -1)
8827 return NULL;
8828
8829 kind1 = PyUnicode_KIND(self);
8830 kind2 = PyUnicode_KIND(substring);
8831 kind = kind1 > kind2 ? kind1 : kind2;
8832 buf1 = PyUnicode_DATA(self);
8833 buf2 = PyUnicode_DATA(substring);
8834 if (kind1 != kind)
8835 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8836 if (!buf1)
8837 return NULL;
8838 if (kind2 != kind)
8839 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8840 if (!buf2) {
8841 if (kind1 != kind) PyMem_Free(buf1);
8842 return NULL;
8843 }
8844 len1 = PyUnicode_GET_LENGTH(self);
8845 len2 = PyUnicode_GET_LENGTH(substring);
8846
8847 switch(kind) {
8848 case PyUnicode_1BYTE_KIND:
8849 out = ucs1lib_rsplit(
8850 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8851 break;
8852 case PyUnicode_2BYTE_KIND:
8853 out = ucs2lib_rsplit(
8854 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8855 break;
8856 case PyUnicode_4BYTE_KIND:
8857 out = ucs4lib_rsplit(
8858 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8859 break;
8860 default:
8861 out = NULL;
8862 }
8863 if (kind1 != kind)
8864 PyMem_Free(buf1);
8865 if (kind2 != kind)
8866 PyMem_Free(buf2);
8867 return out;
8868}
8869
8870static Py_ssize_t
8871anylib_find(int kind, void *buf1, Py_ssize_t len1,
8872 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8873{
8874 switch(kind) {
8875 case PyUnicode_1BYTE_KIND:
8876 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8877 case PyUnicode_2BYTE_KIND:
8878 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8879 case PyUnicode_4BYTE_KIND:
8880 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8881 }
8882 assert(0);
8883 return -1;
8884}
8885
8886static Py_ssize_t
8887anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8888 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8889{
8890 switch(kind) {
8891 case PyUnicode_1BYTE_KIND:
8892 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8893 case PyUnicode_2BYTE_KIND:
8894 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8895 case PyUnicode_4BYTE_KIND:
8896 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8897 }
8898 assert(0);
8899 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008900}
8901
Alexander Belopolsky40018472011-02-26 01:02:56 +00008902static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903replace(PyObject *self, PyObject *str1,
8904 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 PyObject *u;
8907 char *sbuf = PyUnicode_DATA(self);
8908 char *buf1 = PyUnicode_DATA(str1);
8909 char *buf2 = PyUnicode_DATA(str2);
8910 int srelease = 0, release1 = 0, release2 = 0;
8911 int skind = PyUnicode_KIND(self);
8912 int kind1 = PyUnicode_KIND(str1);
8913 int kind2 = PyUnicode_KIND(str2);
8914 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8915 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8916 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917
8918 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008921 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 if (skind < kind1)
8924 /* substring too wide to be present */
8925 goto nothing;
8926
8927 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008928 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008931 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 Py_UCS4 u1, u2, maxchar;
8935 int mayshrink, rkind;
8936 u1 = PyUnicode_READ_CHAR(str1, 0);
8937 if (!findchar(sbuf, PyUnicode_KIND(self),
8938 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008939 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 u2 = PyUnicode_READ_CHAR(str2, 0);
8941 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8942 /* Replacing u1 with u2 may cause a maxchar reduction in the
8943 result string. */
8944 mayshrink = maxchar > 127;
8945 if (u2 > maxchar) {
8946 maxchar = u2;
8947 mayshrink = 0;
8948 }
8949 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008950 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008952 if (PyUnicode_CopyCharacters(u, 0,
8953 (PyObject*)self, 0, slen) < 0)
8954 {
8955 Py_DECREF(u);
8956 return NULL;
8957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 rkind = PyUnicode_KIND(u);
8959 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8960 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008961 if (--maxcount < 0)
8962 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (mayshrink) {
8966 PyObject *tmp = u;
8967 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8968 PyUnicode_GET_LENGTH(tmp));
8969 Py_DECREF(tmp);
8970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 int rkind = skind;
8973 char *res;
8974 if (kind1 < rkind) {
8975 /* widen substring */
8976 buf1 = _PyUnicode_AsKind(str1, rkind);
8977 if (!buf1) goto error;
8978 release1 = 1;
8979 }
8980 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008981 if (i < 0)
8982 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 if (rkind > kind2) {
8984 /* widen replacement */
8985 buf2 = _PyUnicode_AsKind(str2, rkind);
8986 if (!buf2) goto error;
8987 release2 = 1;
8988 }
8989 else if (rkind < kind2) {
8990 /* widen self and buf1 */
8991 rkind = kind2;
8992 if (release1) PyMem_Free(buf1);
8993 sbuf = _PyUnicode_AsKind(self, rkind);
8994 if (!sbuf) goto error;
8995 srelease = 1;
8996 buf1 = _PyUnicode_AsKind(str1, rkind);
8997 if (!buf1) goto error;
8998 release1 = 1;
8999 }
9000 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9001 if (!res) {
9002 PyErr_NoMemory();
9003 goto error;
9004 }
9005 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009006 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9008 buf2,
9009 PyUnicode_KIND_SIZE(rkind, len2));
9010 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009011
9012 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9014 slen-i,
9015 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009016 if (i == -1)
9017 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9019 buf2,
9020 PyUnicode_KIND_SIZE(rkind, len2));
9021 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023
9024 u = PyUnicode_FromKindAndData(rkind, res, slen);
9025 PyMem_Free(res);
9026 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 Py_ssize_t n, i, j, ires;
9031 Py_ssize_t product, new_size;
9032 int rkind = skind;
9033 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 if (kind1 < rkind) {
9036 buf1 = _PyUnicode_AsKind(str1, rkind);
9037 if (!buf1) goto error;
9038 release1 = 1;
9039 }
9040 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009041 if (n == 0)
9042 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 if (kind2 < rkind) {
9044 buf2 = _PyUnicode_AsKind(str2, rkind);
9045 if (!buf2) goto error;
9046 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 else if (kind2 > rkind) {
9049 rkind = kind2;
9050 sbuf = _PyUnicode_AsKind(self, rkind);
9051 if (!sbuf) goto error;
9052 srelease = 1;
9053 if (release1) PyMem_Free(buf1);
9054 buf1 = _PyUnicode_AsKind(str1, rkind);
9055 if (!buf1) goto error;
9056 release1 = 1;
9057 }
9058 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9059 PyUnicode_GET_LENGTH(str1))); */
9060 product = n * (len2-len1);
9061 if ((product / (len2-len1)) != n) {
9062 PyErr_SetString(PyExc_OverflowError,
9063 "replace string is too long");
9064 goto error;
9065 }
9066 new_size = slen + product;
9067 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9068 PyErr_SetString(PyExc_OverflowError,
9069 "replace string is too long");
9070 goto error;
9071 }
9072 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9073 if (!res)
9074 goto error;
9075 ires = i = 0;
9076 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009077 while (n-- > 0) {
9078 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 j = anylib_find(rkind,
9080 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9081 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009082 if (j == -1)
9083 break;
9084 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009085 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9087 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9088 PyUnicode_KIND_SIZE(rkind, j-i));
9089 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009090 }
9091 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 if (len2 > 0) {
9093 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9094 buf2,
9095 PyUnicode_KIND_SIZE(rkind, len2));
9096 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009101 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9103 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9104 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009105 } else {
9106 /* interleave */
9107 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9109 buf2,
9110 PyUnicode_KIND_SIZE(rkind, len2));
9111 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009112 if (--n <= 0)
9113 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9115 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9116 PyUnicode_KIND_SIZE(rkind, 1));
9117 ires++;
9118 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9121 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9122 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009125 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 if (srelease)
9128 PyMem_FREE(sbuf);
9129 if (release1)
9130 PyMem_FREE(buf1);
9131 if (release2)
9132 PyMem_FREE(buf2);
9133 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009134
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009136 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 if (srelease)
9138 PyMem_FREE(sbuf);
9139 if (release1)
9140 PyMem_FREE(buf1);
9141 if (release2)
9142 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009143 if (PyUnicode_CheckExact(self)) {
9144 Py_INCREF(self);
9145 return (PyObject *) self;
9146 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009147 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 error:
9149 if (srelease && sbuf)
9150 PyMem_FREE(sbuf);
9151 if (release1 && buf1)
9152 PyMem_FREE(buf1);
9153 if (release2 && buf2)
9154 PyMem_FREE(buf2);
9155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156}
9157
9158/* --- Unicode Object Methods --------------------------------------------- */
9159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009160PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162\n\
9163Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009164characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165
9166static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009167unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 return fixup(self, fixtitle);
9170}
9171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009172PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174\n\
9175Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009176have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177
9178static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009179unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 return fixup(self, fixcapitalize);
9182}
9183
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into words,
   capitalize every word in place inside the list, then join the list. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        /* swap the capitalized word in for the original list item */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9221
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009222/* Argument converter. Coerces to a single unicode character */
9223
9224static int
9225convert_uc(PyObject *obj, void *addr)
9226{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009228 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009229
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 uniobj = PyUnicode_FromObject(obj);
9231 if (uniobj == NULL) {
9232 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009234 return 0;
9235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009237 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009239 Py_DECREF(uniobj);
9240 return 0;
9241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009243 Py_DECREF(uniobj);
9244 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009245}
9246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009247PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009248 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009250Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009251done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252
9253static PyObject *
9254unicode_center(PyUnicodeObject *self, PyObject *args)
9255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009256 Py_ssize_t marg, left;
9257 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 Py_UCS4 fillchar = ' ';
9259
Victor Stinnere9a29352011-10-01 02:14:59 +02009260 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262
Victor Stinnere9a29352011-10-01 02:14:59 +02009263 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 return NULL;
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267 Py_INCREF(self);
9268 return (PyObject*) self;
9269 }
9270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272 left = marg / 2 + (marg & width & 1);
9273
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009274 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275}
9276
Marc-André Lemburge5034372000-08-08 08:04:29 +00009277#if 0
9278
9279/* This code should go into some future Unicode collation support
9280 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009281 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009282
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009283/* speedy UTF-16 code point order comparison */
9284/* gleaned from: */
9285/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9286
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009287static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009288{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009289 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009290 0, 0, 0, 0, 0, 0, 0, 0,
9291 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009292 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009293};
9294
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295static int
9296unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009298 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009299
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 Py_UNICODE *s1 = str1->str;
9301 Py_UNICODE *s2 = str2->str;
9302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 len1 = str1->_base._base.length;
9304 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009305
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009307 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009308
9309 c1 = *s1++;
9310 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009311
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 if (c1 > (1<<11) * 26)
9313 c1 += utf16Fixup[c1>>11];
9314 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009315 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009316 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009317
9318 if (c1 != c2)
9319 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009320
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009321 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 }
9323
9324 return (len1 < len2) ? -1 : (len1 != len2);
9325}
9326
Marc-André Lemburge5034372000-08-08 08:04:29 +00009327#else
9328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329/* This function assumes that str1 and str2 are readied by the caller. */
9330
Marc-André Lemburge5034372000-08-08 08:04:29 +00009331static int
9332unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 int kind1, kind2;
9335 void *data1, *data2;
9336 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 kind1 = PyUnicode_KIND(str1);
9339 kind2 = PyUnicode_KIND(str2);
9340 data1 = PyUnicode_DATA(str1);
9341 data2 = PyUnicode_DATA(str2);
9342 len1 = PyUnicode_GET_LENGTH(str1);
9343 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 for (i = 0; i < len1 && i < len2; ++i) {
9346 Py_UCS4 c1, c2;
9347 c1 = PyUnicode_READ(kind1, data1, i);
9348 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009349
9350 if (c1 != c2)
9351 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009352 }
9353
9354 return (len1 < len2) ? -1 : (len1 != len2);
9355}
9356
9357#endif
9358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359int
9360PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9363 if (PyUnicode_READY(left) == -1 ||
9364 PyUnicode_READY(right) == -1)
9365 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009366 return unicode_compare((PyUnicodeObject *)left,
9367 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009369 PyErr_Format(PyExc_TypeError,
9370 "Can't compare %.100s and %.100s",
9371 left->ob_type->tp_name,
9372 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 return -1;
9374}
9375
Martin v. Löwis5b222132007-06-10 09:51:05 +00009376int
9377PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 Py_ssize_t i;
9380 int kind;
9381 void *data;
9382 Py_UCS4 chr;
9383
Martin v. Löwis5b222132007-06-10 09:51:05 +00009384 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 if (PyUnicode_READY(uni) == -1)
9386 return -1;
9387 kind = PyUnicode_KIND(uni);
9388 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009389 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9391 if (chr != str[i])
9392 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009393 /* This check keeps Python strings that end in '\0' from comparing equal
9394 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009397 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009399 return 0;
9400}
9401
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009402
Benjamin Peterson29060642009-01-31 22:14:21 +00009403#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406PyObject *
9407PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009408{
9409 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009410
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009411 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9412 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 if (PyUnicode_READY(left) == -1 ||
9414 PyUnicode_READY(right) == -1)
9415 return NULL;
9416 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9417 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009418 if (op == Py_EQ) {
9419 Py_INCREF(Py_False);
9420 return Py_False;
9421 }
9422 if (op == Py_NE) {
9423 Py_INCREF(Py_True);
9424 return Py_True;
9425 }
9426 }
9427 if (left == right)
9428 result = 0;
9429 else
9430 result = unicode_compare((PyUnicodeObject *)left,
9431 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009432
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009433 /* Convert the return value to a Boolean */
9434 switch (op) {
9435 case Py_EQ:
9436 v = TEST_COND(result == 0);
9437 break;
9438 case Py_NE:
9439 v = TEST_COND(result != 0);
9440 break;
9441 case Py_LE:
9442 v = TEST_COND(result <= 0);
9443 break;
9444 case Py_GE:
9445 v = TEST_COND(result >= 0);
9446 break;
9447 case Py_LT:
9448 v = TEST_COND(result == -1);
9449 break;
9450 case Py_GT:
9451 v = TEST_COND(result == 1);
9452 break;
9453 default:
9454 PyErr_BadArgument();
9455 return NULL;
9456 }
9457 Py_INCREF(v);
9458 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009460
Brian Curtindfc80e32011-08-10 20:28:54 -05009461 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009462}
9463
Alexander Belopolsky40018472011-02-26 01:02:56 +00009464int
9465PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009466{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009467 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 int kind1, kind2, kind;
9469 void *buf1, *buf2;
9470 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009471 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009472
9473 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009474 sub = PyUnicode_FromObject(element);
9475 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 PyErr_Format(PyExc_TypeError,
9477 "'in <string>' requires string as left operand, not %s",
9478 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009479 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 if (PyUnicode_READY(sub) == -1)
9482 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009483
Thomas Wouters477c8d52006-05-27 19:21:47 +00009484 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009485 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009486 Py_DECREF(sub);
9487 return -1;
9488 }
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 kind1 = PyUnicode_KIND(str);
9491 kind2 = PyUnicode_KIND(sub);
9492 kind = kind1 > kind2 ? kind1 : kind2;
9493 buf1 = PyUnicode_DATA(str);
9494 buf2 = PyUnicode_DATA(sub);
9495 if (kind1 != kind)
9496 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9497 if (!buf1) {
9498 Py_DECREF(sub);
9499 return -1;
9500 }
9501 if (kind2 != kind)
9502 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9503 if (!buf2) {
9504 Py_DECREF(sub);
9505 if (kind1 != kind) PyMem_Free(buf1);
9506 return -1;
9507 }
9508 len1 = PyUnicode_GET_LENGTH(str);
9509 len2 = PyUnicode_GET_LENGTH(sub);
9510
9511 switch(kind) {
9512 case PyUnicode_1BYTE_KIND:
9513 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9514 break;
9515 case PyUnicode_2BYTE_KIND:
9516 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9517 break;
9518 case PyUnicode_4BYTE_KIND:
9519 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9520 break;
9521 default:
9522 result = -1;
9523 assert(0);
9524 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009525
9526 Py_DECREF(str);
9527 Py_DECREF(sub);
9528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 if (kind1 != kind)
9530 PyMem_Free(buf1);
9531 if (kind2 != kind)
9532 PyMem_Free(buf2);
9533
Guido van Rossum403d68b2000-03-13 15:55:09 +00009534 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009535}
9536
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537/* Concat to string or Unicode object giving a new Unicode object. */
9538
Alexander Belopolsky40018472011-02-26 01:02:56 +00009539PyObject *
9540PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 PyObject *u = NULL, *v = NULL, *w;
9543 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544
9545 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552
9553 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009559 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 }
9562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009564 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 w = PyUnicode_New(
9568 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9569 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009572 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9573 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009574 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009575 v, 0,
9576 PyUnicode_GET_LENGTH(v)) < 0)
9577 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 Py_DECREF(u);
9579 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 Py_XDECREF(u);
9584 Py_XDECREF(v);
9585 return NULL;
9586}
9587
Walter Dörwald1ab83302007-05-18 17:15:44 +00009588void
9589PyUnicode_Append(PyObject **pleft, PyObject *right)
9590{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009591 PyObject *new;
9592 if (*pleft == NULL)
9593 return;
9594 if (right == NULL || !PyUnicode_Check(*pleft)) {
9595 Py_DECREF(*pleft);
9596 *pleft = NULL;
9597 return;
9598 }
9599 new = PyUnicode_Concat(*pleft, right);
9600 Py_DECREF(*pleft);
9601 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009602}
9603
9604void
9605PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9606{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009607 PyUnicode_Append(pleft, right);
9608 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009609}
9610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009611PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009614Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009615string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009616interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617
9618static PyObject *
9619unicode_count(PyUnicodeObject *self, PyObject *args)
9620{
9621 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009622 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009623 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 int kind1, kind2, kind;
9626 void *buf1, *buf2;
9627 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Jesus Ceaac451502011-04-20 17:09:23 +02009629 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9630 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009631 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 kind1 = PyUnicode_KIND(self);
9634 kind2 = PyUnicode_KIND(substring);
9635 kind = kind1 > kind2 ? kind1 : kind2;
9636 buf1 = PyUnicode_DATA(self);
9637 buf2 = PyUnicode_DATA(substring);
9638 if (kind1 != kind)
9639 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9640 if (!buf1) {
9641 Py_DECREF(substring);
9642 return NULL;
9643 }
9644 if (kind2 != kind)
9645 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9646 if (!buf2) {
9647 Py_DECREF(substring);
9648 if (kind1 != kind) PyMem_Free(buf1);
9649 return NULL;
9650 }
9651 len1 = PyUnicode_GET_LENGTH(self);
9652 len2 = PyUnicode_GET_LENGTH(substring);
9653
9654 ADJUST_INDICES(start, end, len1);
9655 switch(kind) {
9656 case PyUnicode_1BYTE_KIND:
9657 iresult = ucs1lib_count(
9658 ((Py_UCS1*)buf1) + start, end - start,
9659 buf2, len2, PY_SSIZE_T_MAX
9660 );
9661 break;
9662 case PyUnicode_2BYTE_KIND:
9663 iresult = ucs2lib_count(
9664 ((Py_UCS2*)buf1) + start, end - start,
9665 buf2, len2, PY_SSIZE_T_MAX
9666 );
9667 break;
9668 case PyUnicode_4BYTE_KIND:
9669 iresult = ucs4lib_count(
9670 ((Py_UCS4*)buf1) + start, end - start,
9671 buf2, len2, PY_SSIZE_T_MAX
9672 );
9673 break;
9674 default:
9675 assert(0); iresult = 0;
9676 }
9677
9678 result = PyLong_FromSsize_t(iresult);
9679
9680 if (kind1 != kind)
9681 PyMem_Free(buf1);
9682 if (kind2 != kind)
9683 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684
9685 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009686
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687 return result;
9688}
9689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009690PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009691 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009693Encode S using the codec registered for encoding. Default encoding\n\
9694is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009695handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009696a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9697'xmlcharrefreplace' as well as any other name registered with\n\
9698codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699
9700static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009701unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009703 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 char *encoding = NULL;
9705 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009706
Benjamin Peterson308d6372009-09-18 21:42:35 +00009707 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9708 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009710 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009711}
9712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009713PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715\n\
9716Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009717If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718
9719static PyObject*
9720unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9721{
9722 Py_UNICODE *e;
9723 Py_UNICODE *p;
9724 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009725 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 PyUnicodeObject *u;
9728 int tabsize = 8;
9729
9730 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9734 return NULL;
9735
Thomas Wouters7e474022000-07-16 12:04:32 +00009736 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009737 i = 0; /* chars up to and including most recent \n or \r */
9738 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9740 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009742 if (tabsize > 0) {
9743 incr = tabsize - (j % tabsize); /* cannot overflow */
9744 if (j > PY_SSIZE_T_MAX - incr)
9745 goto overflow1;
9746 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009747 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009750 if (j > PY_SSIZE_T_MAX - 1)
9751 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752 j++;
9753 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 if (i > PY_SSIZE_T_MAX - j)
9755 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009757 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 }
9759 }
9760
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009761 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009763
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 /* Second pass: create output string and fill it */
9765 u = _PyUnicode_New(i + j);
9766 if (!u)
9767 return NULL;
9768
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009769 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 q = _PyUnicode_WSTR(u); /* next output char */
9771 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 if (tabsize > 0) {
9776 i = tabsize - (j % tabsize);
9777 j += i;
9778 while (i--) {
9779 if (q >= qe)
9780 goto overflow2;
9781 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009785 else {
9786 if (q >= qe)
9787 goto overflow2;
9788 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009789 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 if (*p == '\n' || *p == '\r')
9791 j = 0;
9792 }
9793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (PyUnicode_READY(u) == -1) {
9795 Py_DECREF(u);
9796 return NULL;
9797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009799
9800 overflow2:
9801 Py_DECREF(u);
9802 overflow1:
9803 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805}
9806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009807PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009808 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809\n\
9810Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009811such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812arguments start and end are interpreted as in slice notation.\n\
9813\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009814Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815
9816static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Jesus Ceaac451502011-04-20 17:09:23 +02009819 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009820 Py_ssize_t start;
9821 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009822 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823
Jesus Ceaac451502011-04-20 17:09:23 +02009824 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9825 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (PyUnicode_READY(self) == -1)
9829 return NULL;
9830 if (PyUnicode_READY(substring) == -1)
9831 return NULL;
9832
9833 result = any_find_slice(
9834 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9835 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009836 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837
9838 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 if (result == -2)
9841 return NULL;
9842
Christian Heimes217cfd12007-12-02 14:31:20 +00009843 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844}
9845
9846static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009847unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009849 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9850 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853}
9854
Guido van Rossumc2504932007-09-18 19:42:40 +00009855/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009856 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009857static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009858unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859{
Guido van Rossumc2504932007-09-18 19:42:40 +00009860 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009861 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (_PyUnicode_HASH(self) != -1)
9864 return _PyUnicode_HASH(self);
9865 if (PyUnicode_READY(self) == -1)
9866 return -1;
9867 len = PyUnicode_GET_LENGTH(self);
9868
9869 /* The hash function as a macro, gets expanded three times below. */
9870#define HASH(P) \
9871 x = (Py_uhash_t)*P << 7; \
9872 while (--len >= 0) \
9873 x = (1000003*x) ^ (Py_uhash_t)*P++;
9874
9875 switch (PyUnicode_KIND(self)) {
9876 case PyUnicode_1BYTE_KIND: {
9877 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9878 HASH(c);
9879 break;
9880 }
9881 case PyUnicode_2BYTE_KIND: {
9882 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9883 HASH(s);
9884 break;
9885 }
9886 default: {
9887 Py_UCS4 *l;
9888 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9889 "Impossible switch case in unicode_hash");
9890 l = PyUnicode_4BYTE_DATA(self);
9891 HASH(l);
9892 break;
9893 }
9894 }
9895 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9896
Guido van Rossumc2504932007-09-18 19:42:40 +00009897 if (x == -1)
9898 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009900 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009907Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908
9909static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009912 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009913 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009914 Py_ssize_t start;
9915 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916
Jesus Ceaac451502011-04-20 17:09:23 +02009917 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9918 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (PyUnicode_READY(self) == -1)
9922 return NULL;
9923 if (PyUnicode_READY(substring) == -1)
9924 return NULL;
9925
9926 result = any_find_slice(
9927 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9928 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009929 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930
9931 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 if (result == -2)
9934 return NULL;
9935
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936 if (result < 0) {
9937 PyErr_SetString(PyExc_ValueError, "substring not found");
9938 return NULL;
9939 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009940
Christian Heimes217cfd12007-12-02 14:31:20 +00009941 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942}
9943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009944PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009947Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009948at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949
9950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009951unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 Py_ssize_t i, length;
9954 int kind;
9955 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956 int cased;
9957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 if (PyUnicode_READY(self) == -1)
9959 return NULL;
9960 length = PyUnicode_GET_LENGTH(self);
9961 kind = PyUnicode_KIND(self);
9962 data = PyUnicode_DATA(self);
9963
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 if (length == 1)
9966 return PyBool_FromLong(
9967 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009969 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009972
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 for (i = 0; i < length; i++) {
9975 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009976
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9978 return PyBool_FromLong(0);
9979 else if (!cased && Py_UNICODE_ISLOWER(ch))
9980 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009982 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983}
9984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009985PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009988Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009989at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
9991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009992unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 Py_ssize_t i, length;
9995 int kind;
9996 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997 int cased;
9998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_READY(self) == -1)
10000 return NULL;
10001 length = PyUnicode_GET_LENGTH(self);
10002 kind = PyUnicode_KIND(self);
10003 data = PyUnicode_DATA(self);
10004
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (length == 1)
10007 return PyBool_FromLong(
10008 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010010 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010013
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 for (i = 0; i < length; i++) {
10016 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010017
Benjamin Peterson29060642009-01-31 22:14:21 +000010018 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10019 return PyBool_FromLong(0);
10020 else if (!cased && Py_UNICODE_ISUPPER(ch))
10021 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010023 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024}
10025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010026PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010029Return True if S is a titlecased string and there is at least one\n\
10030character in S, i.e. upper- and titlecase characters may only\n\
10031follow uncased characters and lowercase characters only cased ones.\n\
10032Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
10034static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010035unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 Py_ssize_t i, length;
10038 int kind;
10039 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 int cased, previous_is_cased;
10041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (PyUnicode_READY(self) == -1)
10043 return NULL;
10044 length = PyUnicode_GET_LENGTH(self);
10045 kind = PyUnicode_KIND(self);
10046 data = PyUnicode_DATA(self);
10047
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 if (length == 1) {
10050 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10051 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10052 (Py_UNICODE_ISUPPER(ch) != 0));
10053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010055 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010058
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059 cased = 0;
10060 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 for (i = 0; i < length; i++) {
10062 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010063
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10065 if (previous_is_cased)
10066 return PyBool_FromLong(0);
10067 previous_is_cased = 1;
10068 cased = 1;
10069 }
10070 else if (Py_UNICODE_ISLOWER(ch)) {
10071 if (!previous_is_cased)
10072 return PyBool_FromLong(0);
10073 previous_is_cased = 1;
10074 cased = 1;
10075 }
10076 else
10077 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010079 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080}
10081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010082PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010085Return True if all characters in S are whitespace\n\
10086and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087
10088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010089unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 Py_ssize_t i, length;
10092 int kind;
10093 void *data;
10094
10095 if (PyUnicode_READY(self) == -1)
10096 return NULL;
10097 length = PyUnicode_GET_LENGTH(self);
10098 kind = PyUnicode_KIND(self);
10099 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (length == 1)
10103 return PyBool_FromLong(
10104 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010106 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 for (i = 0; i < length; i++) {
10111 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010112 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010115 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116}
10117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010118PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010119 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010120\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010121Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010122and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123
10124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010125unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 Py_ssize_t i, length;
10128 int kind;
10129 void *data;
10130
10131 if (PyUnicode_READY(self) == -1)
10132 return NULL;
10133 length = PyUnicode_GET_LENGTH(self);
10134 kind = PyUnicode_KIND(self);
10135 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010136
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010137 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 if (length == 1)
10139 return PyBool_FromLong(
10140 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010141
10142 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 for (i = 0; i < length; i++) {
10147 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010148 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010150 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010151}
10152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010153PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010154 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010155\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010156Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010157and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010158
10159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010160unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 int kind;
10163 void *data;
10164 Py_ssize_t len, i;
10165
10166 if (PyUnicode_READY(self) == -1)
10167 return NULL;
10168
10169 kind = PyUnicode_KIND(self);
10170 data = PyUnicode_DATA(self);
10171 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010172
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010173 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (len == 1) {
10175 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10176 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10177 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010178
10179 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 for (i = 0; i < len; i++) {
10184 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010185 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010187 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010188 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010189}
10190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010191PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010194Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010195False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196
10197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010198unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 Py_ssize_t i, length;
10201 int kind;
10202 void *data;
10203
10204 if (PyUnicode_READY(self) == -1)
10205 return NULL;
10206 length = PyUnicode_GET_LENGTH(self);
10207 kind = PyUnicode_KIND(self);
10208 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (length == 1)
10212 return PyBool_FromLong(
10213 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010215 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 for (i = 0; i < length; i++) {
10220 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010223 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224}
10225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010226PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010227 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010229Return True if all characters in S are digits\n\
10230and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
10232static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010233unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 Py_ssize_t i, length;
10236 int kind;
10237 void *data;
10238
10239 if (PyUnicode_READY(self) == -1)
10240 return NULL;
10241 length = PyUnicode_GET_LENGTH(self);
10242 kind = PyUnicode_KIND(self);
10243 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (length == 1) {
10247 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10248 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010251 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 for (i = 0; i < length; i++) {
10256 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010259 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260}
10261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010262PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010265Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010266False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
10268static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010269unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 Py_ssize_t i, length;
10272 int kind;
10273 void *data;
10274
10275 if (PyUnicode_READY(self) == -1)
10276 return NULL;
10277 length = PyUnicode_GET_LENGTH(self);
10278 kind = PyUnicode_KIND(self);
10279 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (length == 1)
10283 return PyBool_FromLong(
10284 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010286 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 for (i = 0; i < length; i++) {
10291 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010292 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010294 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295}
10296
Martin v. Löwis47383402007-08-15 07:32:56 +000010297int
10298PyUnicode_IsIdentifier(PyObject *self)
10299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 int kind;
10301 void *data;
10302 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010303 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (PyUnicode_READY(self) == -1) {
10306 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 }
10309
10310 /* Special case for empty strings */
10311 if (PyUnicode_GET_LENGTH(self) == 0)
10312 return 0;
10313 kind = PyUnicode_KIND(self);
10314 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010315
10316 /* PEP 3131 says that the first character must be in
10317 XID_Start and subsequent characters in XID_Continue,
10318 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010319 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010320 letters, digits, underscore). However, given the current
10321 definition of XID_Start and XID_Continue, it is sufficient
10322 to check just for these, except that _ must be allowed
10323 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010325 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010326 return 0;
10327
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010328 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010330 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010331 return 1;
10332}
10333
10334PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010336\n\
10337Return True if S is a valid identifier according\n\
10338to the language definition.");
10339
10340static PyObject*
10341unicode_isidentifier(PyObject *self)
10342{
10343 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10344}
10345
Georg Brandl559e5d72008-06-11 18:37:52 +000010346PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010348\n\
10349Return True if all characters in S are considered\n\
10350printable in repr() or S is empty, False otherwise.");
10351
10352static PyObject*
10353unicode_isprintable(PyObject *self)
10354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 Py_ssize_t i, length;
10356 int kind;
10357 void *data;
10358
10359 if (PyUnicode_READY(self) == -1)
10360 return NULL;
10361 length = PyUnicode_GET_LENGTH(self);
10362 kind = PyUnicode_KIND(self);
10363 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010364
10365 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (length == 1)
10367 return PyBool_FromLong(
10368 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 for (i = 0; i < length; i++) {
10371 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010372 Py_RETURN_FALSE;
10373 }
10374 }
10375 Py_RETURN_TRUE;
10376}
10377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010378PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010379 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380\n\
10381Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010382iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383
10384static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010385unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010387 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388}
10389
Martin v. Löwis18e16552006-02-15 17:27:45 +000010390static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391unicode_length(PyUnicodeObject *self)
10392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 if (PyUnicode_READY(self) == -1)
10394 return -1;
10395 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396}
10397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010401Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010402done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403
10404static PyObject *
10405unicode_ljust(PyUnicodeObject *self, PyObject *args)
10406{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010407 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 Py_UCS4 fillchar = ' ';
10409
10410 if (PyUnicode_READY(self) == -1)
10411 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010412
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010413 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 return NULL;
10415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 Py_INCREF(self);
10418 return (PyObject*) self;
10419 }
10420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422}
10423
PyDoc_STRVAR(lower__doc__,
             "S.lower() -> str\n\
\n\
Return a copy of the string S converted to lowercase.");

/* Implementation of str.lower(): hands self to fixup() together with the
   fixlower callback and returns whatever fixup() produces. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
10434
/* Strip modes passed around as the striptype argument.  The values are
   also used as indexes into stripformat[] below, so the two must stay
   in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10443
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010444/* externally visible for str.strip(unicode) */
10445PyObject *
10446_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 void *data;
10449 int kind;
10450 Py_ssize_t i, j, len;
10451 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10454 return NULL;
10455
10456 kind = PyUnicode_KIND(self);
10457 data = PyUnicode_DATA(self);
10458 len = PyUnicode_GET_LENGTH(self);
10459 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10460 PyUnicode_DATA(sepobj),
10461 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462
Benjamin Peterson14339b62009-01-31 16:36:08 +000010463 i = 0;
10464 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 while (i < len &&
10466 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 i++;
10468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010470
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 j = len;
10472 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 do {
10474 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 } while (j >= i &&
10476 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010479
Victor Stinner12bab6d2011-10-01 01:53:49 +020010480 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481}
10482
10483PyObject*
10484PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10485{
10486 unsigned char *data;
10487 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010488 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489
Victor Stinnerde636f32011-10-01 03:55:54 +020010490 if (PyUnicode_READY(self) == -1)
10491 return NULL;
10492
10493 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10494
Victor Stinner12bab6d2011-10-01 01:53:49 +020010495 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010497 if (PyUnicode_CheckExact(self)) {
10498 Py_INCREF(self);
10499 return self;
10500 }
10501 else
10502 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 }
10504
Victor Stinner12bab6d2011-10-01 01:53:49 +020010505 length = end - start;
10506 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010507 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508
Victor Stinnerde636f32011-10-01 03:55:54 +020010509 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010510 PyErr_SetString(PyExc_IndexError, "string index out of range");
10511 return NULL;
10512 }
10513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 kind = PyUnicode_KIND(self);
10515 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010516 return PyUnicode_FromKindAndData(kind,
10517 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010518 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010522do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 int kind;
10525 void *data;
10526 Py_ssize_t len, i, j;
10527
10528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
10530
10531 kind = PyUnicode_KIND(self);
10532 data = PyUnicode_DATA(self);
10533 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010534
Benjamin Peterson14339b62009-01-31 16:36:08 +000010535 i = 0;
10536 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010538 i++;
10539 }
10540 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010541
Benjamin Peterson14339b62009-01-31 16:36:08 +000010542 j = len;
10543 if (striptype != LEFTSTRIP) {
10544 do {
10545 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010547 j++;
10548 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010549
Victor Stinner12bab6d2011-10-01 01:53:49 +020010550 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551}
10552
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553
10554static PyObject *
10555do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10556{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010557 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010558
Benjamin Peterson14339b62009-01-31 16:36:08 +000010559 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10560 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010561
Benjamin Peterson14339b62009-01-31 16:36:08 +000010562 if (sep != NULL && sep != Py_None) {
10563 if (PyUnicode_Check(sep))
10564 return _PyUnicode_XStrip(self, striptype, sep);
10565 else {
10566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 "%s arg must be None or str",
10568 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 return NULL;
10570 }
10571 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010572
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010574}
10575
10576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010577PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010579\n\
10580Return a copy of the string S with leading and trailing\n\
10581whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010582If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583
10584static PyObject *
10585unicode_strip(PyUnicodeObject *self, PyObject *args)
10586{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 if (PyTuple_GET_SIZE(args) == 0)
10588 return do_strip(self, BOTHSTRIP); /* Common case */
10589 else
10590 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010591}
10592
10593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010594PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010595 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010596\n\
10597Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010598If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010599
10600static PyObject *
10601unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10602{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010603 if (PyTuple_GET_SIZE(args) == 0)
10604 return do_strip(self, LEFTSTRIP); /* Common case */
10605 else
10606 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010607}
10608
10609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010610PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010612\n\
10613Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010614If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010615
10616static PyObject *
10617unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10618{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010619 if (PyTuple_GET_SIZE(args) == 0)
10620 return do_strip(self, RIGHTSTRIP); /* Common case */
10621 else
10622 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010623}
10624
10625
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010627unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628{
10629 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631
Georg Brandl222de0f2009-04-12 12:01:50 +000010632 if (len < 1) {
10633 Py_INCREF(unicode_empty);
10634 return (PyObject *)unicode_empty;
10635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636
Tim Peters7a29bd52001-09-12 03:03:31 +000010637 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 /* no repeat, return original string */
10639 Py_INCREF(str);
10640 return (PyObject*) str;
10641 }
Tim Peters8f422462000-09-09 06:13:41 +000010642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (PyUnicode_READY(str) == -1)
10644 return NULL;
10645
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010646 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010647 PyErr_SetString(PyExc_OverflowError,
10648 "repeated string is too long");
10649 return NULL;
10650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 if (!u)
10655 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010656 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (PyUnicode_GET_LENGTH(str) == 1) {
10659 const int kind = PyUnicode_KIND(str);
10660 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10661 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010662 if (kind == PyUnicode_1BYTE_KIND)
10663 memset(to, (unsigned char)fill_char, len);
10664 else {
10665 for (n = 0; n < len; ++n)
10666 PyUnicode_WRITE(kind, to, n, fill_char);
10667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 }
10669 else {
10670 /* number of characters copied this far */
10671 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10672 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10673 char *to = (char *) PyUnicode_DATA(u);
10674 Py_MEMCPY(to, PyUnicode_DATA(str),
10675 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 n = (done <= nchars-done) ? done : nchars-done;
10678 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 }
10682
10683 return (PyObject*) u;
10684}
10685
Alexander Belopolsky40018472011-02-26 01:02:56 +000010686PyObject *
10687PyUnicode_Replace(PyObject *obj,
10688 PyObject *subobj,
10689 PyObject *replobj,
10690 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691{
10692 PyObject *self;
10693 PyObject *str1;
10694 PyObject *str2;
10695 PyObject *result;
10696
10697 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010698 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010701 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 Py_DECREF(self);
10703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
10705 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010706 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 Py_DECREF(self);
10708 Py_DECREF(str1);
10709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 Py_DECREF(self);
10713 Py_DECREF(str1);
10714 Py_DECREF(str2);
10715 return result;
10716}
10717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010718PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010719 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720\n\
10721Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010722old replaced by new. If the optional argument count is\n\
10723given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724
10725static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 PyObject *str1;
10729 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010730 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 PyObject *result;
10732
Martin v. Löwis18e16552006-02-15 17:27:45 +000010733 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 str1 = PyUnicode_FromObject(str1);
10738 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10739 return NULL;
10740 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010741 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 Py_DECREF(str1);
10743 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745
10746 result = replace(self, str1, str2, maxcount);
10747
10748 Py_DECREF(str1);
10749 Py_DECREF(str2);
10750 return result;
10751}
10752
Alexander Belopolsky40018472011-02-26 01:02:56 +000010753static PyObject *
10754unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010756 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_ssize_t isize;
10758 Py_ssize_t osize, squote, dquote, i, o;
10759 Py_UCS4 max, quote;
10760 int ikind, okind;
10761 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010764 return NULL;
10765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 isize = PyUnicode_GET_LENGTH(unicode);
10767 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 /* Compute length of output, quote characters, and
10770 maximum character */
10771 osize = 2; /* quotes */
10772 max = 127;
10773 squote = dquote = 0;
10774 ikind = PyUnicode_KIND(unicode);
10775 for (i = 0; i < isize; i++) {
10776 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10777 switch (ch) {
10778 case '\'': squote++; osize++; break;
10779 case '"': dquote++; osize++; break;
10780 case '\\': case '\t': case '\r': case '\n':
10781 osize += 2; break;
10782 default:
10783 /* Fast-path ASCII */
10784 if (ch < ' ' || ch == 0x7f)
10785 osize += 4; /* \xHH */
10786 else if (ch < 0x7f)
10787 osize++;
10788 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10789 osize++;
10790 max = ch > max ? ch : max;
10791 }
10792 else if (ch < 0x100)
10793 osize += 4; /* \xHH */
10794 else if (ch < 0x10000)
10795 osize += 6; /* \uHHHH */
10796 else
10797 osize += 10; /* \uHHHHHHHH */
10798 }
10799 }
10800
10801 quote = '\'';
10802 if (squote) {
10803 if (dquote)
10804 /* Both squote and dquote present. Use squote,
10805 and escape them */
10806 osize += squote;
10807 else
10808 quote = '"';
10809 }
10810
10811 repr = PyUnicode_New(osize, max);
10812 if (repr == NULL)
10813 return NULL;
10814 okind = PyUnicode_KIND(repr);
10815 odata = PyUnicode_DATA(repr);
10816
10817 PyUnicode_WRITE(okind, odata, 0, quote);
10818 PyUnicode_WRITE(okind, odata, osize-1, quote);
10819
10820 for (i = 0, o = 1; i < isize; i++) {
10821 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010822
10823 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 if ((ch == quote) || (ch == '\\')) {
10825 PyUnicode_WRITE(okind, odata, o++, '\\');
10826 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010827 continue;
10828 }
10829
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 PyUnicode_WRITE(okind, odata, o++, '\\');
10833 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010834 }
10835 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 PyUnicode_WRITE(okind, odata, o++, '\\');
10837 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010838 }
10839 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 PyUnicode_WRITE(okind, odata, o++, '\\');
10841 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010842 }
10843
10844 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010845 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 PyUnicode_WRITE(okind, odata, o++, '\\');
10847 PyUnicode_WRITE(okind, odata, o++, 'x');
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010850 }
10851
Georg Brandl559e5d72008-06-11 18:37:52 +000010852 /* Copy ASCII characters as-is */
10853 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010855 }
10856
Benjamin Peterson29060642009-01-31 22:14:21 +000010857 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010858 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010859 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010860 (categories Z* and C* except ASCII space)
10861 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010863 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (ch <= 0xff) {
10865 PyUnicode_WRITE(okind, odata, o++, '\\');
10866 PyUnicode_WRITE(okind, odata, o++, 'x');
10867 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10868 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010869 }
10870 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 else if (ch >= 0x10000) {
10872 PyUnicode_WRITE(okind, odata, o++, '\\');
10873 PyUnicode_WRITE(okind, odata, o++, 'U');
10874 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10875 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10876 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10877 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10878 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10879 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10880 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10881 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010882 }
10883 /* Map 16-bit characters to '\uxxxx' */
10884 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 PyUnicode_WRITE(okind, odata, o++, '\\');
10886 PyUnicode_WRITE(okind, odata, o++, 'u');
10887 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10888 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10889 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10890 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010891 }
10892 }
10893 /* Copy characters as-is */
10894 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010896 }
10897 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010900 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901}
10902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010903PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905\n\
10906Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010907such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908arguments start and end are interpreted as in slice notation.\n\
10909\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
10912static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914{
Jesus Ceaac451502011-04-20 17:09:23 +020010915 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010916 Py_ssize_t start;
10917 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010918 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
Jesus Ceaac451502011-04-20 17:09:23 +020010920 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10921 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 if (PyUnicode_READY(self) == -1)
10925 return NULL;
10926 if (PyUnicode_READY(substring) == -1)
10927 return NULL;
10928
10929 result = any_find_slice(
10930 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10931 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
10934 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (result == -2)
10937 return NULL;
10938
Christian Heimes217cfd12007-12-02 14:31:20 +000010939 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940}
10941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Jesus Ceaac451502011-04-20 17:09:23 +020010950 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010951 Py_ssize_t start;
10952 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010953 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Jesus Ceaac451502011-04-20 17:09:23 +020010955 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10956 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (PyUnicode_READY(self) == -1)
10960 return NULL;
10961 if (PyUnicode_READY(substring) == -1)
10962 return NULL;
10963
10964 result = any_find_slice(
10965 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10966 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010967 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
10969 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (result == -2)
10972 return NULL;
10973
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 if (result < 0) {
10975 PyErr_SetString(PyExc_ValueError, "substring not found");
10976 return NULL;
10977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978
Christian Heimes217cfd12007-12-02 14:31:20 +000010979 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980}
10981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010985Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010986done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988static PyObject *
10989unicode_rjust(PyUnicodeObject *self, PyObject *args)
10990{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010991 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 Py_UCS4 fillchar = ' ';
10993
Victor Stinnere9a29352011-10-01 02:14:59 +020010994 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010996
Victor Stinnere9a29352011-10-01 02:14:59 +020010997 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 return NULL;
10999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 Py_INCREF(self);
11002 return (PyObject*) self;
11003 }
11004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006}
11007
Alexander Belopolsky40018472011-02-26 01:02:56 +000011008PyObject *
11009PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010{
11011 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011012
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 s = PyUnicode_FromObject(s);
11014 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011015 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 if (sep != NULL) {
11017 sep = PyUnicode_FromObject(sep);
11018 if (sep == NULL) {
11019 Py_DECREF(s);
11020 return NULL;
11021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 }
11023
11024 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11025
11026 Py_DECREF(s);
11027 Py_XDECREF(sep);
11028 return result;
11029}
11030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011031PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033\n\
11034Return a list of the words in S, using sep as the\n\
11035delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011036splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011037whitespace string is a separator and empty strings are\n\
11038removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
11040static PyObject*
11041unicode_split(PyUnicodeObject *self, PyObject *args)
11042{
11043 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011044 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Martin v. Löwis18e16552006-02-15 17:27:45 +000011046 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 return NULL;
11048
11049 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
11056
Thomas Wouters477c8d52006-05-27 19:21:47 +000011057PyObject *
11058PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11059{
11060 PyObject* str_obj;
11061 PyObject* sep_obj;
11062 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 int kind1, kind2, kind;
11064 void *buf1 = NULL, *buf2 = NULL;
11065 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066
11067 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011068 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011069 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072 Py_DECREF(str_obj);
11073 return NULL;
11074 }
11075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 kind1 = PyUnicode_KIND(str_in);
11077 kind2 = PyUnicode_KIND(sep_obj);
11078 kind = kind1 > kind2 ? kind1 : kind2;
11079 buf1 = PyUnicode_DATA(str_in);
11080 if (kind1 != kind)
11081 buf1 = _PyUnicode_AsKind(str_in, kind);
11082 if (!buf1)
11083 goto onError;
11084 buf2 = PyUnicode_DATA(sep_obj);
11085 if (kind2 != kind)
11086 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11087 if (!buf2)
11088 goto onError;
11089 len1 = PyUnicode_GET_LENGTH(str_obj);
11090 len2 = PyUnicode_GET_LENGTH(sep_obj);
11091
11092 switch(PyUnicode_KIND(str_in)) {
11093 case PyUnicode_1BYTE_KIND:
11094 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11095 break;
11096 case PyUnicode_2BYTE_KIND:
11097 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11098 break;
11099 case PyUnicode_4BYTE_KIND:
11100 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11101 break;
11102 default:
11103 assert(0);
11104 out = 0;
11105 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011106
11107 Py_DECREF(sep_obj);
11108 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 if (kind1 != kind)
11110 PyMem_Free(buf1);
11111 if (kind2 != kind)
11112 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011113
11114 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 onError:
11116 Py_DECREF(sep_obj);
11117 Py_DECREF(str_obj);
11118 if (kind1 != kind && buf1)
11119 PyMem_Free(buf1);
11120 if (kind2 != kind && buf2)
11121 PyMem_Free(buf2);
11122 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123}
11124
11125
11126PyObject *
11127PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11128{
11129 PyObject* str_obj;
11130 PyObject* sep_obj;
11131 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 int kind1, kind2, kind;
11133 void *buf1 = NULL, *buf2 = NULL;
11134 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135
11136 str_obj = PyUnicode_FromObject(str_in);
11137 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139 sep_obj = PyUnicode_FromObject(sep_in);
11140 if (!sep_obj) {
11141 Py_DECREF(str_obj);
11142 return NULL;
11143 }
11144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 kind1 = PyUnicode_KIND(str_in);
11146 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011147 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 buf1 = PyUnicode_DATA(str_in);
11149 if (kind1 != kind)
11150 buf1 = _PyUnicode_AsKind(str_in, kind);
11151 if (!buf1)
11152 goto onError;
11153 buf2 = PyUnicode_DATA(sep_obj);
11154 if (kind2 != kind)
11155 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11156 if (!buf2)
11157 goto onError;
11158 len1 = PyUnicode_GET_LENGTH(str_obj);
11159 len2 = PyUnicode_GET_LENGTH(sep_obj);
11160
11161 switch(PyUnicode_KIND(str_in)) {
11162 case PyUnicode_1BYTE_KIND:
11163 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11164 break;
11165 case PyUnicode_2BYTE_KIND:
11166 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11167 break;
11168 case PyUnicode_4BYTE_KIND:
11169 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11170 break;
11171 default:
11172 assert(0);
11173 out = 0;
11174 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175
11176 Py_DECREF(sep_obj);
11177 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (kind1 != kind)
11179 PyMem_Free(buf1);
11180 if (kind2 != kind)
11181 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
11183 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 onError:
11185 Py_DECREF(sep_obj);
11186 Py_DECREF(str_obj);
11187 if (kind1 != kind && buf1)
11188 PyMem_Free(buf1);
11189 if (kind2 != kind && buf2)
11190 PyMem_Free(buf2);
11191 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192}
11193
11194PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011196\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011197Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011198the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011199found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011200
11201static PyObject*
11202unicode_partition(PyUnicodeObject *self, PyObject *separator)
11203{
11204 return PyUnicode_Partition((PyObject *)self, separator);
11205}
11206
11207PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011208 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011210Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011211the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011212separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011213
11214static PyObject*
11215unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11216{
11217 return PyUnicode_RPartition((PyObject *)self, separator);
11218}
11219
Alexander Belopolsky40018472011-02-26 01:02:56 +000011220PyObject *
11221PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011222{
11223 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011224
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011225 s = PyUnicode_FromObject(s);
11226 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011227 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 if (sep != NULL) {
11229 sep = PyUnicode_FromObject(sep);
11230 if (sep == NULL) {
11231 Py_DECREF(s);
11232 return NULL;
11233 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011234 }
11235
11236 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11237
11238 Py_DECREF(s);
11239 Py_XDECREF(sep);
11240 return result;
11241}
11242
11243PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011245\n\
11246Return a list of the words in S, using sep as the\n\
11247delimiter string, starting at the end of the string and\n\
11248working to the front. If maxsplit is given, at most maxsplit\n\
11249splits are done. If sep is not specified, any whitespace string\n\
11250is a separator.");
11251
11252static PyObject*
11253unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11254{
11255 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011256 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011257
Martin v. Löwis18e16552006-02-15 17:27:45 +000011258 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011259 return NULL;
11260
11261 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011263 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011265 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011267}
11268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011269PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271\n\
11272Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011273Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
11276static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011277unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011279 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011280 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011282 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11283 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 return NULL;
11285
Guido van Rossum86662912000-04-11 15:38:46 +000011286 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287}
11288
11289static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011290PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291{
Walter Dörwald346737f2007-05-31 10:44:43 +000011292 if (PyUnicode_CheckExact(self)) {
11293 Py_INCREF(self);
11294 return self;
11295 } else
11296 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011297 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298}
11299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302\n\
11303Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
11306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011307unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 return fixup(self, fixswapcase);
11310}
11311
Georg Brandlceee0772007-11-27 23:48:05 +000011312PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011314\n\
11315Return a translation table usable for str.translate().\n\
11316If there is only one argument, it must be a dictionary mapping Unicode\n\
11317ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011318Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011319If there are two arguments, they must be strings of equal length, and\n\
11320in the resulting dictionary, each character in x will be mapped to the\n\
11321character at the same position in y. If there is a third argument, it\n\
11322must be a string, whose characters will be mapped to None in the result.");
11323
11324static PyObject*
11325unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11326{
11327 PyObject *x, *y = NULL, *z = NULL;
11328 PyObject *new = NULL, *key, *value;
11329 Py_ssize_t i = 0;
11330 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011331
Georg Brandlceee0772007-11-27 23:48:05 +000011332 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11333 return NULL;
11334 new = PyDict_New();
11335 if (!new)
11336 return NULL;
11337 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 int x_kind, y_kind, z_kind;
11339 void *x_data, *y_data, *z_data;
11340
Georg Brandlceee0772007-11-27 23:48:05 +000011341 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011342 if (!PyUnicode_Check(x)) {
11343 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11344 "be a string if there is a second argument");
11345 goto err;
11346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011348 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11349 "arguments must have equal length");
11350 goto err;
11351 }
11352 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 x_kind = PyUnicode_KIND(x);
11354 y_kind = PyUnicode_KIND(y);
11355 x_data = PyUnicode_DATA(x);
11356 y_data = PyUnicode_DATA(y);
11357 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11358 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11359 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011360 if (!key || !value)
11361 goto err;
11362 res = PyDict_SetItem(new, key, value);
11363 Py_DECREF(key);
11364 Py_DECREF(value);
11365 if (res < 0)
11366 goto err;
11367 }
11368 /* create entries for deleting chars in z */
11369 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 z_kind = PyUnicode_KIND(z);
11371 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011372 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011374 if (!key)
11375 goto err;
11376 res = PyDict_SetItem(new, key, Py_None);
11377 Py_DECREF(key);
11378 if (res < 0)
11379 goto err;
11380 }
11381 }
11382 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 int kind;
11384 void *data;
11385
Georg Brandlceee0772007-11-27 23:48:05 +000011386 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011387 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011388 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11389 "to maketrans it must be a dict");
11390 goto err;
11391 }
11392 /* copy entries into the new dict, converting string keys to int keys */
11393 while (PyDict_Next(x, &i, &key, &value)) {
11394 if (PyUnicode_Check(key)) {
11395 /* convert string keys to integer keys */
11396 PyObject *newkey;
11397 if (PyUnicode_GET_SIZE(key) != 1) {
11398 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11399 "table must be of length 1");
11400 goto err;
11401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 kind = PyUnicode_KIND(key);
11403 data = PyUnicode_DATA(key);
11404 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011405 if (!newkey)
11406 goto err;
11407 res = PyDict_SetItem(new, newkey, value);
11408 Py_DECREF(newkey);
11409 if (res < 0)
11410 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011411 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011412 /* just keep integer keys */
11413 if (PyDict_SetItem(new, key, value) < 0)
11414 goto err;
11415 } else {
11416 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11417 "be strings or integers");
11418 goto err;
11419 }
11420 }
11421 }
11422 return new;
11423 err:
11424 Py_DECREF(new);
11425 return NULL;
11426}
11427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430\n\
11431Return a copy of the string S, where all characters have been mapped\n\
11432through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011433Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011434Unmapped characters are left untouched. Characters mapped to None\n\
11435are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011446Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011449unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 return fixup(self, fixupper);
11452}
11453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011454PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011457Pad a numeric string S with zeros on the left, to fill a field\n\
11458of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
11460static PyObject *
11461unicode_zfill(PyUnicodeObject *self, PyObject *args)
11462{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011463 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011465 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 int kind;
11467 void *data;
11468 Py_UCS4 chr;
11469
11470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472
Martin v. Löwis18e16552006-02-15 17:27:45 +000011473 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 return NULL;
11475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011477 if (PyUnicode_CheckExact(self)) {
11478 Py_INCREF(self);
11479 return (PyObject*) self;
11480 }
11481 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011482 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 }
11484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
11487 u = pad(self, fill, 0, '0');
11488
Walter Dörwald068325e2002-04-15 13:36:47 +000011489 if (u == NULL)
11490 return NULL;
11491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 kind = PyUnicode_KIND(u);
11493 data = PyUnicode_DATA(u);
11494 chr = PyUnicode_READ(kind, data, fill);
11495
11496 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 PyUnicode_WRITE(kind, data, 0, chr);
11499 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 }
11501
11502 return (PyObject*) u;
11503}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011516Return True if S starts with the specified prefix, False otherwise.\n\
11517With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011518With optional end, stop comparing S at that position.\n\
11519prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
11521static PyObject *
11522unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011525 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011527 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011528 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011529 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
Jesus Ceaac451502011-04-20 17:09:23 +020011531 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011533 if (PyTuple_Check(subobj)) {
11534 Py_ssize_t i;
11535 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11536 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011538 if (substring == NULL)
11539 return NULL;
11540 result = tailmatch(self, substring, start, end, -1);
11541 Py_DECREF(substring);
11542 if (result) {
11543 Py_RETURN_TRUE;
11544 }
11545 }
11546 /* nothing matched */
11547 Py_RETURN_FALSE;
11548 }
11549 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011550 if (substring == NULL) {
11551 if (PyErr_ExceptionMatches(PyExc_TypeError))
11552 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11553 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011555 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011556 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011558 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559}
11560
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011565Return True if S ends with the specified suffix, False otherwise.\n\
11566With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011567With optional end, stop comparing S at that position.\n\
11568suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
11570static PyObject *
11571unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011574 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011576 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011577 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011578 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
Jesus Ceaac451502011-04-20 17:09:23 +020011580 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011582 if (PyTuple_Check(subobj)) {
11583 Py_ssize_t i;
11584 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011587 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011589 result = tailmatch(self, substring, start, end, +1);
11590 Py_DECREF(substring);
11591 if (result) {
11592 Py_RETURN_TRUE;
11593 }
11594 }
11595 Py_RETURN_FALSE;
11596 }
11597 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011598 if (substring == NULL) {
11599 if (PyErr_ExceptionMatches(PyExc_TypeError))
11600 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11601 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011603 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011604 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011606 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607}
11608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011610
11611PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011613\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011614Return a formatted version of S, using substitutions from args and kwargs.\n\
11615The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011616
Eric Smith27bbca62010-11-04 17:06:58 +000011617PyDoc_STRVAR(format_map__doc__,
11618 "S.format_map(mapping) -> str\n\
11619\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011620Return a formatted version of S, using substitutions from mapping.\n\
11621The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011622
Eric Smith4a7d76d2008-05-30 18:10:19 +000011623static PyObject *
11624unicode__format__(PyObject* self, PyObject* args)
11625{
11626 PyObject *format_spec;
11627
11628 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11629 return NULL;
11630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11632 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011633}
11634
Eric Smith8c663262007-08-25 02:26:07 +000011635PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011637\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011638Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011639
11640static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011641unicode__sizeof__(PyUnicodeObject *v)
11642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 Py_ssize_t size;
11644
11645 /* If it's a compact object, account for base structure +
11646 character data. */
11647 if (PyUnicode_IS_COMPACT_ASCII(v))
11648 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11649 else if (PyUnicode_IS_COMPACT(v))
11650 size = sizeof(PyCompactUnicodeObject) +
11651 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11652 else {
11653 /* If it is a two-block object, account for base object, and
11654 for character block if present. */
11655 size = sizeof(PyUnicodeObject);
11656 if (v->data.any)
11657 size += (PyUnicode_GET_LENGTH(v) + 1) *
11658 PyUnicode_CHARACTER_SIZE(v);
11659 }
11660 /* If the wstr pointer is present, account for it unless it is shared
11661 with the data pointer. Since PyUnicode_DATA will crash if the object
11662 is not ready, check whether it's either not ready (in which case the
11663 data is entirely in wstr) or if the data is not shared. */
11664 if (_PyUnicode_WSTR(v) &&
11665 (!PyUnicode_IS_READY(v) ||
11666 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11667 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011668 if (!PyUnicode_IS_COMPACT_ASCII(v)
11669 && _PyUnicode_UTF8(v)
11670 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11671 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672
11673 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011674}
11675
11676PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011678
11679static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011680unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011681{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011682 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 if (!copy)
11684 return NULL;
11685 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011686}
11687
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688static PyMethodDef unicode_methods[] = {
11689
11690 /* Order is according to common usage: often used methods should
11691 appear first, since lookup is done sequentially. */
11692
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011693 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011694 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11695 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011696 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011697 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11698 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11699 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11700 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11701 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11702 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11703 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011704 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011705 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11706 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11707 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011709 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11710 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11711 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011713 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011714 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011716 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11717 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11718 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11719 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11720 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11721 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11722 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11723 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11724 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11725 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11726 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11727 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11728 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11729 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011730 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011731 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011732 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011733 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011734 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011735 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011736 {"maketrans", (PyCFunction) unicode_maketrans,
11737 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011738 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011739#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011740 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741#endif
11742
11743#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011744 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011745 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746#endif
11747
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 {NULL, NULL}
11750};
11751
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011752static PyObject *
11753unicode_mod(PyObject *v, PyObject *w)
11754{
Brian Curtindfc80e32011-08-10 20:28:54 -050011755 if (!PyUnicode_Check(v))
11756 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011758}
11759
11760static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 0, /*nb_add*/
11762 0, /*nb_subtract*/
11763 0, /*nb_multiply*/
11764 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011765};
11766
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 (lenfunc) unicode_length, /* sq_length */
11769 PyUnicode_Concat, /* sq_concat */
11770 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11771 (ssizeargfunc) unicode_getitem, /* sq_item */
11772 0, /* sq_slice */
11773 0, /* sq_ass_item */
11774 0, /* sq_ass_slice */
11775 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776};
11777
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011778static PyObject*
11779unicode_subscript(PyUnicodeObject* self, PyObject* item)
11780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011784 if (PyIndex_Check(item)) {
11785 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011786 if (i == -1 && PyErr_Occurred())
11787 return NULL;
11788 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011790 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011791 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011792 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011794 Py_UNICODE* result_buf;
11795 PyObject* result;
11796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011799 return NULL;
11800 }
11801
11802 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 return PyUnicode_New(0, 0);
11804 } else if (start == 0 && step == 1 &&
11805 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011806 PyUnicode_CheckExact(self)) {
11807 Py_INCREF(self);
11808 return (PyObject *)self;
11809 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011810 return PyUnicode_Substring((PyObject*)self,
11811 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011812 } else {
11813 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011814 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11815 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 if (result_buf == NULL)
11818 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011819
11820 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11821 result_buf[i] = source_buf[cur];
11822 }
Tim Petersced69f82003-09-16 20:30:58 +000011823
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011824 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011825 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011826 return result;
11827 }
11828 } else {
11829 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11830 return NULL;
11831 }
11832}
11833
11834static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011835 (lenfunc)unicode_length, /* mp_length */
11836 (binaryfunc)unicode_subscript, /* mp_subscript */
11837 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011838};
11839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841/* Helpers for PyUnicode_Format() */
11842
11843static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011844getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011846 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 (*p_argidx)++;
11849 if (arglen < 0)
11850 return args;
11851 else
11852 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 }
11854 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856 return NULL;
11857}
11858
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011859/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011861static PyObject *
11862formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011864 char *p;
11865 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011867
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 x = PyFloat_AsDouble(v);
11869 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011870 return NULL;
11871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011874
Eric Smith0923d1d2009-04-16 20:16:10 +000011875 p = PyOS_double_to_string(x, type, prec,
11876 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011877 if (p == NULL)
11878 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011880 PyMem_Free(p);
11881 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882}
11883
Tim Peters38fd5b62000-09-21 05:43:11 +000011884static PyObject*
11885formatlong(PyObject *val, int flags, int prec, int type)
11886{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011887 char *buf;
11888 int len;
11889 PyObject *str; /* temporary string object. */
11890 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011891
Benjamin Peterson14339b62009-01-31 16:36:08 +000011892 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11893 if (!str)
11894 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011896 Py_DECREF(str);
11897 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011898}
11899
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011902 size_t buflen,
11903 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011905 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011906 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (PyUnicode_GET_LENGTH(v) == 1) {
11908 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 buf[1] = '\0';
11910 return 1;
11911 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 goto onError;
11913 }
11914 else {
11915 /* Integer input truncated to a character */
11916 long x;
11917 x = PyLong_AsLong(v);
11918 if (x == -1 && PyErr_Occurred())
11919 goto onError;
11920
11921 if (x < 0 || x > 0x10ffff) {
11922 PyErr_SetString(PyExc_OverflowError,
11923 "%c arg not in range(0x110000)");
11924 return -1;
11925 }
11926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011928 buf[1] = '\0';
11929 return 1;
11930 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011931
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011933 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011935 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011938/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011939 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011940*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011941#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011942
Alexander Belopolsky40018472011-02-26 01:02:56 +000011943PyObject *
11944PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 void *fmt;
11947 int fmtkind;
11948 PyObject *result;
11949 Py_UCS4 *res, *res0;
11950 Py_UCS4 max;
11951 int kind;
11952 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011956
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 PyErr_BadInternalCall();
11959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11962 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 fmt = PyUnicode_DATA(uformat);
11965 fmtkind = PyUnicode_KIND(uformat);
11966 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11967 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
11969 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11971 if (res0 == NULL) {
11972 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
11976 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 arglen = PyTuple_Size(args);
11978 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 }
11980 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 arglen = -1;
11982 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011984 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011985 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
11988 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 if (--rescnt < 0) {
11991 rescnt = fmtcnt + 100;
11992 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11994 if (res0 == NULL){
11995 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 }
11998 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012002 }
12003 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 /* Got a format specifier */
12005 int flags = 0;
12006 Py_ssize_t width = -1;
12007 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 Py_UCS4 c = '\0';
12009 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 int isnumok;
12011 PyObject *v = NULL;
12012 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 void *pbuf;
12014 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 Py_ssize_t len, len1;
12017 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 fmtpos++;
12020 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12021 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 Py_ssize_t keylen;
12023 PyObject *key;
12024 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012025
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 if (dict == NULL) {
12027 PyErr_SetString(PyExc_TypeError,
12028 "format requires a mapping");
12029 goto onError;
12030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 /* Skip over balanced parentheses */
12035 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 if (fmtcnt < 0 || pcount > 0) {
12044 PyErr_SetString(PyExc_ValueError,
12045 "incomplete format key");
12046 goto onError;
12047 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012048 key = PyUnicode_Substring((PyObject*)uformat,
12049 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 if (key == NULL)
12051 goto onError;
12052 if (args_owned) {
12053 Py_DECREF(args);
12054 args_owned = 0;
12055 }
12056 args = PyObject_GetItem(dict, key);
12057 Py_DECREF(key);
12058 if (args == NULL) {
12059 goto onError;
12060 }
12061 args_owned = 1;
12062 arglen = -1;
12063 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012064 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 case '-': flags |= F_LJUST; continue;
12068 case '+': flags |= F_SIGN; continue;
12069 case ' ': flags |= F_BLANK; continue;
12070 case '#': flags |= F_ALT; continue;
12071 case '0': flags |= F_ZERO; continue;
12072 }
12073 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 if (c == '*') {
12076 v = getnextarg(args, arglen, &argidx);
12077 if (v == NULL)
12078 goto onError;
12079 if (!PyLong_Check(v)) {
12080 PyErr_SetString(PyExc_TypeError,
12081 "* wants int");
12082 goto onError;
12083 }
12084 width = PyLong_AsLong(v);
12085 if (width == -1 && PyErr_Occurred())
12086 goto onError;
12087 if (width < 0) {
12088 flags |= F_LJUST;
12089 width = -width;
12090 }
12091 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 }
12094 else if (c >= '0' && c <= '9') {
12095 width = c - '0';
12096 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 if (c < '0' || c > '9')
12099 break;
12100 if ((width*10) / 10 != width) {
12101 PyErr_SetString(PyExc_ValueError,
12102 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012103 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 }
12105 width = width*10 + (c - '0');
12106 }
12107 }
12108 if (c == '.') {
12109 prec = 0;
12110 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 if (c == '*') {
12113 v = getnextarg(args, arglen, &argidx);
12114 if (v == NULL)
12115 goto onError;
12116 if (!PyLong_Check(v)) {
12117 PyErr_SetString(PyExc_TypeError,
12118 "* wants int");
12119 goto onError;
12120 }
12121 prec = PyLong_AsLong(v);
12122 if (prec == -1 && PyErr_Occurred())
12123 goto onError;
12124 if (prec < 0)
12125 prec = 0;
12126 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 }
12129 else if (c >= '0' && c <= '9') {
12130 prec = c - '0';
12131 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 if (c < '0' || c > '9')
12134 break;
12135 if ((prec*10) / 10 != prec) {
12136 PyErr_SetString(PyExc_ValueError,
12137 "prec too big");
12138 goto onError;
12139 }
12140 prec = prec*10 + (c - '0');
12141 }
12142 }
12143 } /* prec */
12144 if (fmtcnt >= 0) {
12145 if (c == 'h' || c == 'l' || c == 'L') {
12146 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 }
12149 }
12150 if (fmtcnt < 0) {
12151 PyErr_SetString(PyExc_ValueError,
12152 "incomplete format");
12153 goto onError;
12154 }
12155 if (c != '%') {
12156 v = getnextarg(args, arglen, &argidx);
12157 if (v == NULL)
12158 goto onError;
12159 }
12160 sign = 0;
12161 fill = ' ';
12162 switch (c) {
12163
12164 case '%':
12165 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 len = 1;
12170 break;
12171
12172 case 's':
12173 case 'r':
12174 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012175 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 temp = v;
12177 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 }
12179 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 if (c == 's')
12181 temp = PyObject_Str(v);
12182 else if (c == 'r')
12183 temp = PyObject_Repr(v);
12184 else
12185 temp = PyObject_ASCII(v);
12186 if (temp == NULL)
12187 goto onError;
12188 if (PyUnicode_Check(temp))
12189 /* nothing to do */;
12190 else {
12191 Py_DECREF(temp);
12192 PyErr_SetString(PyExc_TypeError,
12193 "%s argument has non-string str()");
12194 goto onError;
12195 }
12196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (PyUnicode_READY(temp) == -1) {
12198 Py_CLEAR(temp);
12199 goto onError;
12200 }
12201 pbuf = PyUnicode_DATA(temp);
12202 kind = PyUnicode_KIND(temp);
12203 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 if (prec >= 0 && len > prec)
12205 len = prec;
12206 break;
12207
12208 case 'i':
12209 case 'd':
12210 case 'u':
12211 case 'o':
12212 case 'x':
12213 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 isnumok = 0;
12215 if (PyNumber_Check(v)) {
12216 PyObject *iobj=NULL;
12217
12218 if (PyLong_Check(v)) {
12219 iobj = v;
12220 Py_INCREF(iobj);
12221 }
12222 else {
12223 iobj = PyNumber_Long(v);
12224 }
12225 if (iobj!=NULL) {
12226 if (PyLong_Check(iobj)) {
12227 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012228 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 Py_DECREF(iobj);
12230 if (!temp)
12231 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 if (PyUnicode_READY(temp) == -1) {
12233 Py_CLEAR(temp);
12234 goto onError;
12235 }
12236 pbuf = PyUnicode_DATA(temp);
12237 kind = PyUnicode_KIND(temp);
12238 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 sign = 1;
12240 }
12241 else {
12242 Py_DECREF(iobj);
12243 }
12244 }
12245 }
12246 if (!isnumok) {
12247 PyErr_Format(PyExc_TypeError,
12248 "%%%c format: a number is required, "
12249 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12250 goto onError;
12251 }
12252 if (flags & F_ZERO)
12253 fill = '0';
12254 break;
12255
12256 case 'e':
12257 case 'E':
12258 case 'f':
12259 case 'F':
12260 case 'g':
12261 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012262 temp = formatfloat(v, flags, prec, c);
12263 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (PyUnicode_READY(temp) == -1) {
12266 Py_CLEAR(temp);
12267 goto onError;
12268 }
12269 pbuf = PyUnicode_DATA(temp);
12270 kind = PyUnicode_KIND(temp);
12271 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 sign = 1;
12273 if (flags & F_ZERO)
12274 fill = '0';
12275 break;
12276
12277 case 'c':
12278 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012280 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 if (len < 0)
12282 goto onError;
12283 break;
12284
12285 default:
12286 PyErr_Format(PyExc_ValueError,
12287 "unsupported format character '%c' (0x%x) "
12288 "at index %zd",
12289 (31<=c && c<=126) ? (char)c : '?',
12290 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 goto onError;
12293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 /* pbuf is initialized here. */
12295 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12298 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12299 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 len--;
12301 }
12302 else if (flags & F_SIGN)
12303 sign = '+';
12304 else if (flags & F_BLANK)
12305 sign = ' ';
12306 else
12307 sign = 0;
12308 }
12309 if (width < len)
12310 width = len;
12311 if (rescnt - (sign != 0) < width) {
12312 reslen -= rescnt;
12313 rescnt = width + fmtcnt + 100;
12314 reslen += rescnt;
12315 if (reslen < 0) {
12316 Py_XDECREF(temp);
12317 PyErr_NoMemory();
12318 goto onError;
12319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12321 if (res0 == 0) {
12322 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 Py_XDECREF(temp);
12324 goto onError;
12325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 }
12328 if (sign) {
12329 if (fill != ' ')
12330 *res++ = sign;
12331 rescnt--;
12332 if (width > len)
12333 width--;
12334 }
12335 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12337 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12340 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 }
12342 rescnt -= 2;
12343 width -= 2;
12344 if (width < 0)
12345 width = 0;
12346 len -= 2;
12347 }
12348 if (width > len && !(flags & F_LJUST)) {
12349 do {
12350 --rescnt;
12351 *res++ = fill;
12352 } while (--width > len);
12353 }
12354 if (fill == ' ') {
12355 if (sign)
12356 *res++ = sign;
12357 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12359 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12360 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12361 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012362 }
12363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 /* Copy all characters, preserving len */
12365 len1 = len;
12366 while (len1--) {
12367 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12368 rescnt--;
12369 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 while (--width >= len) {
12371 --rescnt;
12372 *res++ = ' ';
12373 }
12374 if (dict && (argidx < arglen) && c != '%') {
12375 PyErr_SetString(PyExc_TypeError,
12376 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012377 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 goto onError;
12379 }
12380 Py_XDECREF(temp);
12381 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 } /* until end */
12383 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 PyErr_SetString(PyExc_TypeError,
12385 "not all arguments converted during string formatting");
12386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387 }
12388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389
12390 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12391 if (*res > max)
12392 max = *res;
12393 result = PyUnicode_New(reslen - rescnt, max);
12394 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 kind = PyUnicode_KIND(result);
12397 for (res = res0; res < res0+reslen-rescnt; res++)
12398 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12399 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402 }
12403 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404 return (PyObject *)result;
12405
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408 Py_DECREF(uformat);
12409 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411 }
12412 return NULL;
12413}
12414
Jeremy Hylton938ace62002-07-17 16:30:39 +000012415static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012416unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12417
Tim Peters6d6c1a32001-08-02 04:15:00 +000012418static PyObject *
12419unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12420{
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 static char *kwlist[] = {"object", "encoding", "errors", 0};
12423 char *encoding = NULL;
12424 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012425
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 if (type != &PyUnicode_Type)
12427 return unicode_subtype_new(type, args, kwds);
12428 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 return NULL;
12431 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012433 if (encoding == NULL && errors == NULL)
12434 return PyObject_Str(x);
12435 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012437}
12438
Guido van Rossume023fe02001-08-30 03:12:59 +000012439static PyObject *
12440unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12441{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012442 PyUnicodeObject *unicode, *self;
12443 Py_ssize_t length, char_size;
12444 int share_wstr, share_utf8;
12445 unsigned int kind;
12446 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012447
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012449
12450 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12451 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012453 assert(PyUnicode_Check(unicode));
12454 if (PyUnicode_READY(unicode))
12455 return NULL;
12456
12457 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12458 if (self == NULL) {
12459 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 return NULL;
12461 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012462 kind = PyUnicode_KIND(unicode);
12463 length = PyUnicode_GET_LENGTH(unicode);
12464
12465 _PyUnicode_LENGTH(self) = length;
12466 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12467 _PyUnicode_STATE(self).interned = 0;
12468 _PyUnicode_STATE(self).kind = kind;
12469 _PyUnicode_STATE(self).compact = 0;
12470 _PyUnicode_STATE(self).ascii = 0;
12471 _PyUnicode_STATE(self).ready = 1;
12472 _PyUnicode_WSTR(self) = NULL;
12473 _PyUnicode_UTF8_LENGTH(self) = 0;
12474 _PyUnicode_UTF8(self) = NULL;
12475 _PyUnicode_WSTR_LENGTH(self) = 0;
12476 self->data.any = NULL;
12477
12478 share_utf8 = 0;
12479 share_wstr = 0;
12480 if (kind == PyUnicode_1BYTE_KIND) {
12481 char_size = 1;
12482 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12483 share_utf8 = 1;
12484 }
12485 else if (kind == PyUnicode_2BYTE_KIND) {
12486 char_size = 2;
12487 if (sizeof(wchar_t) == 2)
12488 share_wstr = 1;
12489 }
12490 else {
12491 assert(kind == PyUnicode_4BYTE_KIND);
12492 char_size = 4;
12493 if (sizeof(wchar_t) == 4)
12494 share_wstr = 1;
12495 }
12496
12497 /* Ensure we won't overflow the length. */
12498 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12499 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012502 data = PyObject_MALLOC((length + 1) * char_size);
12503 if (data == NULL) {
12504 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 goto onError;
12506 }
12507
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012508 self->data.any = data;
12509 if (share_utf8) {
12510 _PyUnicode_UTF8_LENGTH(self) = length;
12511 _PyUnicode_UTF8(self) = data;
12512 }
12513 if (share_wstr) {
12514 _PyUnicode_WSTR_LENGTH(self) = length;
12515 _PyUnicode_WSTR(self) = (wchar_t *)data;
12516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012518 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12519 PyUnicode_KIND_SIZE(kind, length + 1));
12520 Py_DECREF(unicode);
12521 return (PyObject *)self;
12522
12523onError:
12524 Py_DECREF(unicode);
12525 Py_DECREF(self);
12526 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012527}
12528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012531\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012532Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012533encoding defaults to the current default string encoding.\n\
12534errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012535
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Slots are filled positionally; each entry is
   labelled with the slot it initializes.  The type is subclassable
   (Py_TPFLAGS_BASETYPE) and instances are created through unicode_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12581
12582/* Initialize the Unicode implementation */
12583
Thomas Wouters78890102000-07-22 19:25:51 +000012584void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012586 int i;
12587
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012590 0x000A, /* LINE FEED */
12591 0x000D, /* CARRIAGE RETURN */
12592 0x001C, /* FILE SEPARATOR */
12593 0x001D, /* GROUP SEPARATOR */
12594 0x001E, /* RECORD SEPARATOR */
12595 0x0085, /* NEXT LINE */
12596 0x2028, /* LINE SEPARATOR */
12597 0x2029, /* PARAGRAPH SEPARATOR */
12598 };
12599
Fred Drakee4315f52000-05-09 19:53:39 +000012600 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012602 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012604
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012605 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012607 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012609
12610 /* initialize the linebreak bloom filter */
12611 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012613 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012614
12615 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616}
12617
12618/* Finalize the Unicode implementation */
12619
/* Unconditionally reports 0; this function performs no other work.
   NOTE(review): presumably the value is a count of released objects, with
   no free list left to clear -- confirm against callers. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
12625
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626void
Thomas Wouters78890102000-07-22 19:25:51 +000012627_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012629 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012631 Py_XDECREF(unicode_empty);
12632 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012633
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012634 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 if (unicode_latin1[i]) {
12636 Py_DECREF(unicode_latin1[i]);
12637 unicode_latin1[i] = NULL;
12638 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012639 }
Christian Heimesa156e092008-02-16 07:38:31 +000012640 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012642
/* Intern the string *p in place.

   If an equal string is already a key of the `interned` dict, *p is
   replaced by that string: a reference to the existing string is taken and
   the reference previously held in *p is released.  Otherwise *p itself is
   inserted into the dict (as both key and value) and flagged
   SSTATE_INTERNED_MORTAL.

   Calls Py_FatalError() when *p is NULL or not a str.  Returns without
   changing *p for str subclasses, for strings that are already interned,
   and when readying the string, creating the dict, or inserting into it
   fails. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* NOTE(review): unlike the dict failures below, this path does not call
       PyErr_Clear(); confirm whether an exception can be left pending. */
    if (PyUnicode_READY(s) == -1) {
        assert(0 && "ready fail in intern...");
        return;
    }
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it out instead.
           Take the new reference before dropping the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is raised only around the insertion and lowered
       again on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
12694
12695void
12696PyUnicode_InternImmortal(PyObject **p)
12697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12699
Benjamin Peterson14339b62009-01-31 16:36:08 +000012700 PyUnicode_InternInPlace(p);
12701 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 Py_INCREF(*p);
12704 }
Walter Dörwald16807132007-05-25 13:52:07 +000012705}
12706
12707PyObject *
12708PyUnicode_InternFromString(const char *cp)
12709{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012710 PyObject *s = PyUnicode_FromString(cp);
12711 if (s == NULL)
12712 return NULL;
12713 PyUnicode_InternInPlace(&s);
12714 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012715}
12716
Alexander Belopolsky40018472011-02-26 01:02:56 +000012717void
12718_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 PyObject *keys;
12721 PyUnicodeObject *s;
12722 Py_ssize_t i, n;
12723 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012724
Benjamin Peterson14339b62009-01-31 16:36:08 +000012725 if (interned == NULL || !PyDict_Check(interned))
12726 return;
12727 keys = PyDict_Keys(interned);
12728 if (keys == NULL || !PyList_Check(keys)) {
12729 PyErr_Clear();
12730 return;
12731 }
Walter Dörwald16807132007-05-25 13:52:07 +000012732
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12734 detector, interned unicode strings are not forcibly deallocated;
12735 rather, we give them their stolen references back, and then clear
12736 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012737
Benjamin Peterson14339b62009-01-31 16:36:08 +000012738 n = PyList_GET_SIZE(keys);
12739 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 for (i = 0; i < n; i++) {
12742 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 if (PyUnicode_READY(s) == -1)
12744 fprintf(stderr, "could not ready string\n");
12745 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 case SSTATE_NOT_INTERNED:
12747 /* XXX Shouldn't happen */
12748 break;
12749 case SSTATE_INTERNED_IMMORTAL:
12750 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 break;
12753 case SSTATE_INTERNED_MORTAL:
12754 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 break;
12757 default:
12758 Py_FatalError("Inconsistent interned string state.");
12759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 }
12762 fprintf(stderr, "total size of all interned strings: "
12763 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12764 "mortal/immortal\n", mortal_size, immortal_size);
12765 Py_DECREF(keys);
12766 PyDict_Clear(interned);
12767 Py_DECREF(interned);
12768 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012769}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012770
12771
12772/********************* Unicode Iterator **************************/
12773
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
12779
12780static void
12781unicodeiter_dealloc(unicodeiterobject *it)
12782{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 _PyObject_GC_UNTRACK(it);
12784 Py_XDECREF(it->it_seq);
12785 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012786}
12787
/* GC traversal for the str iterator: the only object reference it holds is
   it_seq.  The parameter names `visit` and `arg` are used by the Py_VISIT
   macro and must not be renamed. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
12794
12795static PyObject *
12796unicodeiter_next(unicodeiterobject *it)
12797{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012798 PyUnicodeObject *seq;
12799 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012800
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 assert(it != NULL);
12802 seq = it->it_seq;
12803 if (seq == NULL)
12804 return NULL;
12805 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12808 int kind = PyUnicode_KIND(seq);
12809 void *data = PyUnicode_DATA(seq);
12810 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12811 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 if (item != NULL)
12813 ++it->it_index;
12814 return item;
12815 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012816
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 Py_DECREF(seq);
12818 it->it_seq = NULL;
12819 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012820}
12821
12822static PyObject *
12823unicodeiter_len(unicodeiterobject *it)
12824{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825 Py_ssize_t len = 0;
12826 if (it->it_seq)
12827 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12828 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012829}
12830
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; it exposes only __length_hint__,
   implemented by unicodeiter_len and taking no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
12838
/* Type object of the iterator returned by unicode_iter().  Slots are filled
   positionally; the type participates in garbage collection
   (Py_TPFLAGS_HAVE_GC) and is its own iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
12871
12872static PyObject *
12873unicode_iter(PyObject *seq)
12874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012875 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012876
Benjamin Peterson14339b62009-01-31 16:36:08 +000012877 if (!PyUnicode_Check(seq)) {
12878 PyErr_BadInternalCall();
12879 return NULL;
12880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 if (PyUnicode_READY(seq) == -1)
12882 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12884 if (it == NULL)
12885 return NULL;
12886 it->it_index = 0;
12887 Py_INCREF(seq);
12888 it->it_seq = (PyUnicodeObject *)seq;
12889 _PyObject_GC_TRACK(it);
12890 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012891}
12892
/* Include uniops.h twice with different parameter macros: first with
   UNIOP(x) expanding to Py_UNICODE_x and UNIOP_t to Py_UNICODE, then with
   UNIOP(x) expanding to Py_UCS4_x and UNIOP_t to Py_UCS4.  The macros are
   undefined again after each inclusion.
   NOTE(review): presumably uniops.h defines one set of helpers per
   character type; its contents are not visible from here. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012903
Victor Stinner71133ff2010-09-01 23:43:53 +000012904Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012905PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012906{
12907 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12908 Py_UNICODE *copy;
12909 Py_ssize_t size;
12910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 if (!PyUnicode_Check(unicode)) {
12912 PyErr_BadArgument();
12913 return NULL;
12914 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012915 /* Ensure we won't overflow the size. */
12916 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12917 PyErr_NoMemory();
12918 return NULL;
12919 }
12920 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12921 size *= sizeof(Py_UNICODE);
12922 copy = PyMem_Malloc(size);
12923 if (copy == NULL) {
12924 PyErr_NoMemory();
12925 return NULL;
12926 }
12927 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12928 return copy;
12929}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012930
Georg Brandl66c221e2010-10-14 07:04:07 +000012931/* A _string module, to export formatter_parser and formatter_field_name_split
12932 to the string.Formatter class implemented in Python. */
12933
/* Functions exported by the _string module.  Both are registered with
   METH_O, i.e. they take exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
12941
/* Module definition passed to PyModule_Create() by PyInit__string().
   Fields are filled positionally; the trailing NULL entries leave the
   remaining optional slots of PyModuleDef unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};
12953
12954PyMODINIT_FUNC
12955PyInit__string(void)
12956{
12957 return PyModule_Create(&_string_module);
12958}
12959
12960
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012961#ifdef __cplusplus
12962}
12963#endif