blob: 2c48c82225a680915739837e9682ee064a14be3b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092/* Generic helper macro to convert characters of different types.
93 from_type and to_type have to be valid type names, begin and end
94 are pointers to the source characters which should be of type
95 "from_type *". to is a pointer of type "to_type *" and points to the
96 buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        /* Macro-local cursors; "end" is re-evaluated   \
           on every iteration. */                       \
        const from_type *iter_; to_type *to_;           \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
106
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject.
   Expands to a member access, so it can also be assigned to. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: asserts that op is a ready unicode object.  For compact
   ASCII strings it yields the bytes stored directly after the PyASCIIObject
   header; otherwise it yields the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: for compact ASCII strings the length field of the
   PyASCIIObject is used instead of utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors; each expands to a plain member access. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Like the accessors above, but assert first that op is a unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* The Unicode string has been modified: reset the hash.  The stored hash
   is set to -1, the same value the constructors in this file store for a
   newly created string. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200150static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200154static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000155
Christian Heimes190d79e2008-01-30 11:58:22 +0000156/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point for the ASCII range (128 entries,
   eight per row).  An entry is 1 for the characters named in the comments
   below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no entries set */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
Christian Heimes190d79e2008-01-30 11:58:22 +0000200/* Same for linebreaks */
/* Lookup table indexed by code point for the ASCII range (128 entries,
   eight per row); 1 marks a line break character.  BLOOM_LINEBREAK() reads
   this table for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no entries set */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
244/* stuff to implement simple "bloom filters" for Unicode characters.
245 to keep things simple, we use a single bitmask, using the least 5
246 bits from each unicode characters as the bit index. */
247
248/* the linebreak mask is set up by Unicode_Init below */
249
/* Number of bits used by the bloom mask: the largest of 128/64/32 that
   fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch.
   "mask" is parenthesized so that an expression argument such as
   BLOOM(a | b, ch) tests the combined mask instead of parsing as
   a | (b & bit).  BLOOM_ADD requires "mask" to be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   all others must pass the bloom_linebreak pre-filter before the full
   Py_UNICODE_ISLINEBREAK() check runs.  "ch" is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* Membership test: true when the bloom mask admits chr and a forward
   PyUnicode_FindChar() search over the whole of str finds it.  The mask
   check short-circuits the search when the bit is clear.  Note that chr
   and str are each evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
/* We allocate one more character (Py_UNICODE) than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382
383 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385
386*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200401 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200444 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
#ifdef Py_DEBUG
/* Debug-build counter; incremented once per PyUnicode_New() call that
   gets past the empty-string shortcut. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print the object address, its compact / compact-ASCII flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    /* Address just past a PyASCIIObject header. */
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    /* Address just past a PyCompactUnicodeObject header. */
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
479
480PyObject *
481PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
482{
483 PyObject *obj;
484 PyCompactUnicodeObject *unicode;
485 void *data;
486 int kind_state;
487 int is_sharing = 0, is_ascii = 0;
488 Py_ssize_t char_size;
489 Py_ssize_t struct_size;
490
491 /* Optimization for empty strings */
492 if (size == 0 && unicode_empty != NULL) {
493 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200494 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200495 }
496
497#ifdef Py_DEBUG
498 ++unicode_new_new_calls;
499#endif
500
501 struct_size = sizeof(PyCompactUnicodeObject);
502 if (maxchar < 128) {
503 kind_state = PyUnicode_1BYTE_KIND;
504 char_size = 1;
505 is_ascii = 1;
506 struct_size = sizeof(PyASCIIObject);
507 }
508 else if (maxchar < 256) {
509 kind_state = PyUnicode_1BYTE_KIND;
510 char_size = 1;
511 }
512 else if (maxchar < 65536) {
513 kind_state = PyUnicode_2BYTE_KIND;
514 char_size = 2;
515 if (sizeof(wchar_t) == 2)
516 is_sharing = 1;
517 }
518 else {
519 kind_state = PyUnicode_4BYTE_KIND;
520 char_size = 4;
521 if (sizeof(wchar_t) == 4)
522 is_sharing = 1;
523 }
524
525 /* Ensure we won't overflow the size. */
526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
528 "Negative size passed to PyUnicode_New");
529 return NULL;
530 }
531 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
532 return PyErr_NoMemory();
533
534 /* Duplicated allocation code from _PyObject_New() instead of a call to
535 * PyObject_New() so we are able to allocate space for the object and
536 * it's data buffer.
537 */
538 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
539 if (obj == NULL)
540 return PyErr_NoMemory();
541 obj = PyObject_INIT(obj, &PyUnicode_Type);
542 if (obj == NULL)
543 return NULL;
544
545 unicode = (PyCompactUnicodeObject *)obj;
546 if (is_ascii)
547 data = ((PyASCIIObject*)obj) + 1;
548 else
549 data = unicode + 1;
550 _PyUnicode_LENGTH(unicode) = size;
551 _PyUnicode_HASH(unicode) = -1;
552 _PyUnicode_STATE(unicode).interned = 0;
553 _PyUnicode_STATE(unicode).kind = kind_state;
554 _PyUnicode_STATE(unicode).compact = 1;
555 _PyUnicode_STATE(unicode).ready = 1;
556 _PyUnicode_STATE(unicode).ascii = is_ascii;
557 if (is_ascii) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 }
561 else if (kind_state == PyUnicode_1BYTE_KIND) {
562 ((char*)data)[size] = 0;
563 _PyUnicode_WSTR(unicode) = NULL;
564 _PyUnicode_WSTR_LENGTH(unicode) = 0;
565 unicode->utf8_length = 0;
566 unicode->utf8 = NULL;
567 }
568 else {
569 unicode->utf8 = NULL;
570 if (kind_state == PyUnicode_2BYTE_KIND)
571 ((Py_UCS2*)data)[size] = 0;
572 else /* kind_state == PyUnicode_4BYTE_KIND */
573 ((Py_UCS4*)data)[size] = 0;
574 if (is_sharing) {
575 _PyUnicode_WSTR_LENGTH(unicode) = size;
576 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
577 }
578 else {
579 _PyUnicode_WSTR_LENGTH(unicode) = 0;
580 _PyUnicode_WSTR(unicode) = NULL;
581 }
582 }
583 return obj;
584}
585
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   "unicode", combining each valid surrogate pair into one code point.
   Other conversions are implemented as macros for efficiency.

   A high surrogate not followed by a low surrogate, like any other code
   unit, is copied through unchanged.  The caller must have sized
   "unicode" for exactly the resulting number of code points (plus the
   terminating null character).  Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            /* 10 payload bits from each half, offset by 0x10000. */
            *dst = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst = src[0];
            src += 1;
        }
        dst++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
624
Victor Stinnercd9950f2011-10-02 00:34:53 +0200625static int
626_PyUnicode_Dirty(PyObject *unicode)
627{
628 assert(PyUnicode_Check(unicode));
629 if (Py_REFCNT(unicode) != 1) {
630 PyErr_SetString(PyExc_ValueError,
631 "Cannot modify a string having more than 1 reference");
632 return -1;
633 }
634 _PyUnicode_DIRTY(unicode);
635 return 0;
636}
637
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
640 PyObject *from, Py_ssize_t from_start,
641 Py_ssize_t how_many)
642{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200643 unsigned int from_kind, to_kind;
644 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Victor Stinnerb1536152011-09-30 02:26:10 +0200646 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
647 PyErr_BadInternalCall();
648 return -1;
649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200650
651 if (PyUnicode_READY(from))
652 return -1;
653 if (PyUnicode_READY(to))
654 return -1;
655
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200656 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200657 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
658 PyErr_Format(PyExc_ValueError,
659 "Cannot write %zi characters at %zi "
660 "in a string of %zi characters",
661 how_many, to_start, PyUnicode_GET_LENGTH(to));
662 return -1;
663 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200664 if (how_many == 0)
665 return 0;
666
Victor Stinnercd9950f2011-10-02 00:34:53 +0200667 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200668 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200671 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200673 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200674
675 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200676 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200677 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200678 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200679 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200680 + PyUnicode_KIND_SIZE(from_kind, from_start),
681 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200682 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200683 else if (from_kind == PyUnicode_1BYTE_KIND
684 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 {
686 _PyUnicode_CONVERT_BYTES(
687 Py_UCS1, Py_UCS2,
688 PyUnicode_1BYTE_DATA(from) + from_start,
689 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
690 PyUnicode_2BYTE_DATA(to) + to_start
691 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200692 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200693 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200694 && to_kind == PyUnicode_4BYTE_KIND)
695 {
696 _PyUnicode_CONVERT_BYTES(
697 Py_UCS1, Py_UCS4,
698 PyUnicode_1BYTE_DATA(from) + from_start,
699 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
700 PyUnicode_4BYTE_DATA(to) + to_start
701 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200702 }
703 else if (from_kind == PyUnicode_2BYTE_KIND
704 && to_kind == PyUnicode_4BYTE_KIND)
705 {
706 _PyUnicode_CONVERT_BYTES(
707 Py_UCS2, Py_UCS4,
708 PyUnicode_2BYTE_DATA(from) + from_start,
709 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
710 PyUnicode_4BYTE_DATA(to) + to_start
711 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200712 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200713 else {
714 int invalid_kinds;
715 if (from_kind > to_kind) {
716 /* slow path to check for character overflow */
717 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
718 Py_UCS4 ch, maxchar;
719 Py_ssize_t i;
720
721 maxchar = 0;
722 invalid_kinds = 0;
723 for (i=0; i < how_many; i++) {
724 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
725 if (ch > maxchar) {
726 maxchar = ch;
727 if (maxchar > to_maxchar) {
728 invalid_kinds = 1;
729 break;
730 }
731 }
732 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
733 }
734 }
735 else
736 invalid_kinds = 1;
737 if (invalid_kinds) {
738 PyErr_Format(PyExc_ValueError,
739 "Cannot copy UCS%u characters "
740 "into a string of UCS%u characters",
741 1 << (from_kind - 1),
742 1 << (to_kind -1));
743 return -1;
744 }
745 }
746 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747}
748
Victor Stinner17222162011-09-28 22:15:37 +0200749/* Find the maximum code point and count the number of surrogate pairs so a
750 correct string length can be computed before converting a string to UCS4.
751 This function counts single surrogates as a character and not as a pair.
752
753 Return 0 on success, or -1 on error. */
754static int
755find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
756 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200757{
758 const wchar_t *iter;
759
760 if (num_surrogates == NULL || maxchar == NULL) {
761 PyErr_SetString(PyExc_SystemError,
762 "unexpected NULL arguments to "
763 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
764 return -1;
765 }
766
767 *num_surrogates = 0;
768 *maxchar = 0;
769
770 for (iter = begin; iter < end; ) {
771 if (*iter > *maxchar)
772 *maxchar = *iter;
773#if SIZEOF_WCHAR_T == 2
774 if (*iter >= 0xD800 && *iter <= 0xDBFF
775 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
776 {
777 Py_UCS4 surrogate_val;
778 surrogate_val = (((iter[0] & 0x3FF)<<10)
779 | (iter[1] & 0x3FF)) + 0x10000;
780 ++(*num_surrogates);
781 if (surrogate_val > *maxchar)
782 *maxchar = surrogate_val;
783 iter += 2;
784 }
785 else
786 iter++;
787#else
788 iter++;
789#endif
790 }
791 return 0;
792}
793
794#ifdef Py_DEBUG
795int unicode_ready_calls = 0;
796#endif
797
798int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200799_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 wchar_t *end;
803 Py_UCS4 maxchar = 0;
804 Py_ssize_t num_surrogates;
805#if SIZEOF_WCHAR_T == 2
806 Py_ssize_t length_wo_surrogates;
807#endif
808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200810 strings were created using _PyObject_New() and where no canonical
811 representation (the str field) has been set yet aka strings
812 which are not yet ready. */
813 assert(PyUnicode_Check(obj));
814 assert(!PyUnicode_IS_READY(obj));
815 assert(!PyUnicode_IS_COMPACT(obj));
816 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +0200818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200820 /* Actually, it should neither be interned nor be anything else: */
821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
823#ifdef Py_DEBUG
824 ++unicode_ready_calls;
825#endif
826
827 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200828 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200829 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831
832 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200833 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
834 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835 PyErr_NoMemory();
836 return -1;
837 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200838 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_WSTR(unicode), end,
840 PyUnicode_1BYTE_DATA(unicode));
841 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
843 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
844 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200845 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200846 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 }
848 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200849 _PyUnicode_UTF8(unicode) = NULL;
850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 }
852 PyObject_FREE(_PyUnicode_WSTR(unicode));
853 _PyUnicode_WSTR(unicode) = NULL;
854 _PyUnicode_WSTR_LENGTH(unicode) = 0;
855 }
856 /* In this case we might have to convert down from 4-byte native
857 wchar_t to 2-byte unicode. */
858 else if (maxchar < 65536) {
859 assert(num_surrogates == 0 &&
860 "FindMaxCharAndNumSurrogatePairs() messed up");
861
Victor Stinner506f5922011-09-28 22:34:18 +0200862#if SIZEOF_WCHAR_T == 2
863 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200864 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +0200865 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
866 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
867 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200868 _PyUnicode_UTF8(unicode) = NULL;
869 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200870#else
871 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200872 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +0200873 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +0200874 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +0200875 PyErr_NoMemory();
876 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
Victor Stinner506f5922011-09-28 22:34:18 +0200878 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
879 _PyUnicode_WSTR(unicode), end,
880 PyUnicode_2BYTE_DATA(unicode));
881 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
882 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
883 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200884 _PyUnicode_UTF8(unicode) = NULL;
885 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200886 PyObject_FREE(_PyUnicode_WSTR(unicode));
887 _PyUnicode_WSTR(unicode) = NULL;
888 _PyUnicode_WSTR_LENGTH(unicode) = 0;
889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 }
891 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
892 else {
893#if SIZEOF_WCHAR_T == 2
894 /* in case the native representation is 2-bytes, we need to allocate a
895 new normalized 4-byte version. */
896 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200897 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
898 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 PyErr_NoMemory();
900 return -1;
901 }
902 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
903 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 _PyUnicode_UTF8(unicode) = NULL;
905 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
907 unicode) < 0) {
908 assert(0 && "ConvertWideCharToUCS4 failed");
909 return -1;
910 }
911 PyObject_FREE(_PyUnicode_WSTR(unicode));
912 _PyUnicode_WSTR(unicode) = NULL;
913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
914#else
915 assert(num_surrogates == 0);
916
Victor Stinnerc3c74152011-10-02 20:39:55 +0200917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
922#endif
923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
924 }
925 _PyUnicode_STATE(unicode).ready = 1;
926 return 0;
927}
928
Alexander Belopolsky40018472011-02-26 01:02:56 +0000929static void
930unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931{
Walter Dörwald16807132007-05-25 13:52:07 +0000932 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 case SSTATE_NOT_INTERNED:
934 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000935
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 case SSTATE_INTERNED_MORTAL:
937 /* revive dead object temporarily for DelItem */
938 Py_REFCNT(unicode) = 3;
939 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
940 Py_FatalError(
941 "deletion of interned string failed");
942 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000943
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 case SSTATE_INTERNED_IMMORTAL:
945 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000946
Benjamin Peterson29060642009-01-31 22:14:21 +0000947 default:
948 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000949 }
950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 if (_PyUnicode_WSTR(unicode) &&
952 (!PyUnicode_IS_READY(unicode) ||
953 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
954 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200955 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
956 && _PyUnicode_UTF8(unicode)
957 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
958 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959
960 if (PyUnicode_IS_COMPACT(unicode)) {
961 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 }
963 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200964 if (_PyUnicode_DATA_ANY(unicode))
965 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +0000966 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967 }
968}
969
Alexander Belopolsky40018472011-02-26 01:02:56 +0000970static int
971_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000972{
973 register PyUnicodeObject *v;
974
975 /* Argument checks */
976 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000977 PyErr_BadInternalCall();
978 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000979 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000980 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
982 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000983 PyErr_BadInternalCall();
984 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000985 }
986
987 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 possible since these are being shared.
989 The same goes for new-representation unicode objects or objects which
990 have already been readied.
991 For these, we simply return a fresh copy with the same Unicode content.
992 */
993 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
994 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
995 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000996 PyUnicodeObject *w = _PyUnicode_New(length);
997 if (w == NULL)
998 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1000 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001001 Py_DECREF(*unicode);
1002 *unicode = w;
1003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001004 }
1005
1006 /* Note that we don't have to modify *unicode for unshared Unicode
1007 objects, since we can modify them in-place. */
1008 return unicode_resize(v, length);
1009}
1010
Alexander Belopolsky40018472011-02-26 01:02:56 +00001011int
1012PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001013{
1014 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1015}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017static PyObject*
1018get_latin1_char(unsigned char ch)
1019{
Victor Stinnera464fc12011-10-02 20:39:30 +02001020 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001022 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023 if (!unicode)
1024 return NULL;
1025 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1026 unicode_latin1[ch] = unicode;
1027 }
1028 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001029 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030}
1031
Alexander Belopolsky40018472011-02-26 01:02:56 +00001032PyObject *
1033PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034{
1035 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 Py_UCS4 maxchar = 0;
1037 Py_ssize_t num_surrogates;
1038
1039 if (u == NULL)
1040 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001042 /* If the Unicode data is known at construction time, we can apply
1043 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 /* Optimization for empty strings */
1046 if (size == 0 && unicode_empty != NULL) {
1047 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001048 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001049 }
Tim Petersced69f82003-09-16 20:30:58 +00001050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 /* Single character Unicode objects in the Latin-1 range are
1052 shared when using this constructor */
1053 if (size == 1 && *u < 256)
1054 return get_latin1_char((unsigned char)*u);
1055
1056 /* If not empty and not single character, copy the Unicode data
1057 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001058 if (find_maxchar_surrogates(u, u + size,
1059 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return NULL;
1061
1062 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1063 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 if (!unicode)
1065 return NULL;
1066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 switch (PyUnicode_KIND(unicode)) {
1068 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001069 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1071 break;
1072 case PyUnicode_2BYTE_KIND:
1073#if Py_UNICODE_SIZE == 2
1074 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1075#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001076 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1078#endif
1079 break;
1080 case PyUnicode_4BYTE_KIND:
1081#if SIZEOF_WCHAR_T == 2
1082 /* This is the only case which has to process surrogates, thus
1083 a simple copy loop is not enough and we need a function. */
1084 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1085 Py_DECREF(unicode);
1086 return NULL;
1087 }
1088#else
1089 assert(num_surrogates == 0);
1090 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1091#endif
1092 break;
1093 default:
1094 assert(0 && "Impossible state");
1095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096
1097 return (PyObject *)unicode;
1098}
1099
Alexander Belopolsky40018472011-02-26 01:02:56 +00001100PyObject *
1101PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102{
1103 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001104
Benjamin Peterson14339b62009-01-31 16:36:08 +00001105 if (size < 0) {
1106 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 return NULL;
1109 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001110
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001111 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112 some optimizations which share commonly used objects.
1113 Also, this means the input must be UTF-8, so fall back to the
1114 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001115 if (u != NULL) {
1116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 /* Optimization for empty strings */
1118 if (size == 0 && unicode_empty != NULL) {
1119 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001120 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001122
1123 /* Single characters are shared when using this constructor.
1124 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 if (size == 1 && Py_CHARMASK(*u) < 128)
1126 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001127
1128 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001129 }
1130
Walter Dörwald55507312007-05-18 13:12:10 +00001131 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001132 if (!unicode)
1133 return NULL;
1134
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001135 return (PyObject *)unicode;
1136}
1137
Alexander Belopolsky40018472011-02-26 01:02:56 +00001138PyObject *
1139PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001140{
1141 size_t size = strlen(u);
1142 if (size > PY_SSIZE_T_MAX) {
1143 PyErr_SetString(PyExc_OverflowError, "input too long");
1144 return NULL;
1145 }
1146
1147 return PyUnicode_FromStringAndSize(u, size);
1148}
1149
Victor Stinnere57b1c02011-09-28 22:20:48 +02001150static PyObject*
1151_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 PyObject *res;
1154 unsigned char max = 127;
1155 Py_ssize_t i;
1156 for (i = 0; i < size; i++) {
1157 if (u[i] & 0x80) {
1158 max = 255;
1159 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001160 }
1161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 res = PyUnicode_New(size, max);
1163 if (!res)
1164 return NULL;
1165 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1166 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001167}
1168
Victor Stinnere57b1c02011-09-28 22:20:48 +02001169static PyObject*
1170_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171{
1172 PyObject *res;
1173 Py_UCS2 max = 0;
1174 Py_ssize_t i;
1175 for (i = 0; i < size; i++)
1176 if (u[i] > max)
1177 max = u[i];
1178 res = PyUnicode_New(size, max);
1179 if (!res)
1180 return NULL;
1181 if (max >= 256)
1182 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1183 else
1184 for (i = 0; i < size; i++)
1185 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1186 return res;
1187}
1188
Victor Stinnere57b1c02011-09-28 22:20:48 +02001189static PyObject*
1190_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191{
1192 PyObject *res;
1193 Py_UCS4 max = 0;
1194 Py_ssize_t i;
1195 for (i = 0; i < size; i++)
1196 if (u[i] > max)
1197 max = u[i];
1198 res = PyUnicode_New(size, max);
1199 if (!res)
1200 return NULL;
1201 if (max >= 0x10000)
1202 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1203 else {
1204 int kind = PyUnicode_KIND(res);
1205 void *data = PyUnicode_DATA(res);
1206 for (i = 0; i < size; i++)
1207 PyUnicode_WRITE(kind, data, i, u[i]);
1208 }
1209 return res;
1210}
1211
1212PyObject*
1213PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1214{
1215 switch(kind) {
1216 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001217 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001219 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001220 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001221 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001223 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 return NULL;
1225}
1226
Victor Stinner034f6cf2011-09-30 02:26:44 +02001227PyObject*
1228PyUnicode_Copy(PyObject *unicode)
1229{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001230 Py_ssize_t size;
1231 PyObject *copy;
1232 void *data;
1233
Victor Stinner034f6cf2011-09-30 02:26:44 +02001234 if (!PyUnicode_Check(unicode)) {
1235 PyErr_BadInternalCall();
1236 return NULL;
1237 }
1238 if (PyUnicode_READY(unicode))
1239 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001240
1241 size = PyUnicode_GET_LENGTH(unicode);
1242 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1243 if (!copy)
1244 return NULL;
1245 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1246
1247 data = PyUnicode_DATA(unicode);
1248 switch (PyUnicode_KIND(unicode))
1249 {
1250 case PyUnicode_1BYTE_KIND:
1251 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1252 break;
1253 case PyUnicode_2BYTE_KIND:
1254 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1255 break;
1256 case PyUnicode_4BYTE_KIND:
1257 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1258 break;
1259 default:
1260 assert(0);
1261 break;
1262 }
1263 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001264}
1265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266
Victor Stinnerbc603d12011-10-02 01:00:40 +02001267/* Widen Unicode objects to larger buffers. Don't write terminating null
1268 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269
1270void*
1271_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1272{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001273 Py_ssize_t len;
1274 void *result;
1275 unsigned int skind;
1276
1277 if (PyUnicode_READY(s))
1278 return NULL;
1279
1280 len = PyUnicode_GET_LENGTH(s);
1281 skind = PyUnicode_KIND(s);
1282 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1284 return NULL;
1285 }
1286 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001287 case PyUnicode_2BYTE_KIND:
1288 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1289 if (!result)
1290 return PyErr_NoMemory();
1291 assert(skind == PyUnicode_1BYTE_KIND);
1292 _PyUnicode_CONVERT_BYTES(
1293 Py_UCS1, Py_UCS2,
1294 PyUnicode_1BYTE_DATA(s),
1295 PyUnicode_1BYTE_DATA(s) + len,
1296 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001298 case PyUnicode_4BYTE_KIND:
1299 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1300 if (!result)
1301 return PyErr_NoMemory();
1302 if (skind == PyUnicode_2BYTE_KIND) {
1303 _PyUnicode_CONVERT_BYTES(
1304 Py_UCS2, Py_UCS4,
1305 PyUnicode_2BYTE_DATA(s),
1306 PyUnicode_2BYTE_DATA(s) + len,
1307 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001309 else {
1310 assert(skind == PyUnicode_1BYTE_KIND);
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS1, Py_UCS4,
1313 PyUnicode_1BYTE_DATA(s),
1314 PyUnicode_1BYTE_DATA(s) + len,
1315 result);
1316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001318 default:
1319 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001321 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 return NULL;
1323}
1324
1325static Py_UCS4*
1326as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1327 int copy_null)
1328{
1329 int kind;
1330 void *data;
1331 Py_ssize_t len, targetlen;
1332 if (PyUnicode_READY(string) == -1)
1333 return NULL;
1334 kind = PyUnicode_KIND(string);
1335 data = PyUnicode_DATA(string);
1336 len = PyUnicode_GET_LENGTH(string);
1337 targetlen = len;
1338 if (copy_null)
1339 targetlen++;
1340 if (!target) {
1341 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1342 PyErr_NoMemory();
1343 return NULL;
1344 }
1345 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1346 if (!target) {
1347 PyErr_NoMemory();
1348 return NULL;
1349 }
1350 }
1351 else {
1352 if (targetsize < targetlen) {
1353 PyErr_Format(PyExc_SystemError,
1354 "string is longer than the buffer");
1355 if (copy_null && 0 < targetsize)
1356 target[0] = 0;
1357 return NULL;
1358 }
1359 }
1360 if (kind != PyUnicode_4BYTE_KIND) {
1361 Py_ssize_t i;
1362 for (i = 0; i < len; i++)
1363 target[i] = PyUnicode_READ(kind, data, i);
1364 }
1365 else
1366 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1367 if (copy_null)
1368 target[len] = 0;
1369 return target;
1370}
1371
1372Py_UCS4*
1373PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1374 int copy_null)
1375{
1376 if (target == NULL || targetsize < 1) {
1377 PyErr_BadInternalCall();
1378 return NULL;
1379 }
1380 return as_ucs4(string, target, targetsize, copy_null);
1381}
1382
1383Py_UCS4*
1384PyUnicode_AsUCS4Copy(PyObject *string)
1385{
1386 return as_ucs4(string, NULL, 0, 1);
1387}
1388
1389#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001390
Alexander Belopolsky40018472011-02-26 01:02:56 +00001391PyObject *
1392PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001395 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_BadInternalCall();
1398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 }
1400
Martin v. Löwis790465f2008-04-05 20:41:37 +00001401 if (size == -1) {
1402 size = wcslen(w);
1403 }
1404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406}
1407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001409
Walter Dörwald346737f2007-05-31 10:44:43 +00001410static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001411makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1412 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001413{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001414 *fmt++ = '%';
1415 if (width) {
1416 if (zeropad)
1417 *fmt++ = '0';
1418 fmt += sprintf(fmt, "%d", width);
1419 }
1420 if (precision)
1421 fmt += sprintf(fmt, ".%d", precision);
1422 if (longflag)
1423 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001424 else if (longlongflag) {
1425 /* longlongflag should only ever be nonzero on machines with
1426 HAVE_LONG_LONG defined */
1427#ifdef HAVE_LONG_LONG
1428 char *f = PY_FORMAT_LONG_LONG;
1429 while (*f)
1430 *fmt++ = *f++;
1431#else
1432 /* we shouldn't ever get here */
1433 assert(0);
1434 *fmt++ = 'l';
1435#endif
1436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001437 else if (size_tflag) {
1438 char *f = PY_FORMAT_SIZE_T;
1439 while (*f)
1440 *fmt++ = *f++;
1441 }
1442 *fmt++ = c;
1443 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001444}
1445
/* helper for PyUnicode_FromFormatV() */

/* Parse the width, precision and length modifier of one format
   specification.

   f must point at the '%' that starts the specification.  Each non-NULL
   output pointer receives the parsed value: width and precision default to
   0, and at most one of the long, long long and size_t flags is set to 1.
   A length modifier is only recognized when it is directly followed by
   'd', 'u' or 'i'.

   Return a pointer to the last character consumed: the conversion character
   for a well-formed specification, or the last digit for the bogus inputs
   described in the comments below. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f)) {
        width = (width*10) + *f - '0';
        f++;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            precision = (precision*10) + *f - '0';
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1510
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001511/* maximum number of characters required for output of %ld. 21 characters
1512 allows for 64-bit integers (in decimal) and an optional sign. */
1513#define MAX_LONG_CHARS 21
1514/* maximum number of characters required for output of %lld.
1515 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1516 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1517#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1518
Walter Dörwaldd2034312007-05-18 16:29:38 +00001519PyObject *
1520PyUnicode_FromFormatV(const char *format, va_list vargs)
1521{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001522 va_list count;
1523 Py_ssize_t callcount = 0;
1524 PyObject **callresults = NULL;
1525 PyObject **callresult = NULL;
1526 Py_ssize_t n = 0;
1527 int width = 0;
1528 int precision = 0;
1529 int zeropad;
1530 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001532 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001533 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1535 Py_UCS4 argmaxchar;
1536 Py_ssize_t numbersize = 0;
1537 char *numberresults = NULL;
1538 char *numberresult = NULL;
1539 Py_ssize_t i;
1540 int kind;
1541 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001542
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001543 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001544 /* step 1: count the number of %S/%R/%A/%s format specifications
1545 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1546 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 * result in an array)
1548 * also esimate a upper bound for all the number formats in the string,
1549 * numbers will be formated in step 3 and be keept in a '\0'-separated
1550 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001551 for (f = format; *f; f++) {
1552 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001553 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1555 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1556 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1557 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001560#ifdef HAVE_LONG_LONG
1561 if (longlongflag) {
1562 if (width < MAX_LONG_LONG_CHARS)
1563 width = MAX_LONG_LONG_CHARS;
1564 }
1565 else
1566#endif
1567 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1568 including sign. Decimal takes the most space. This
1569 isn't enough for octal. If a width is specified we
1570 need more (which we allocate later). */
1571 if (width < MAX_LONG_CHARS)
1572 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
1574 /* account for the size + '\0' to separate numbers
1575 inside of the numberresults buffer */
1576 numbersize += (width + 1);
1577 }
1578 }
1579 else if ((unsigned char)*f > 127) {
1580 PyErr_Format(PyExc_ValueError,
1581 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1582 "string, got a non-ASCII byte: 0x%02x",
1583 (unsigned char)*f);
1584 return NULL;
1585 }
1586 }
1587 /* step 2: allocate memory for the results of
1588 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1589 if (callcount) {
1590 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1591 if (!callresults) {
1592 PyErr_NoMemory();
1593 return NULL;
1594 }
1595 callresult = callresults;
1596 }
1597 /* step 2.5: allocate memory for the results of formating numbers */
1598 if (numbersize) {
1599 numberresults = PyObject_Malloc(numbersize);
1600 if (!numberresults) {
1601 PyErr_NoMemory();
1602 goto fail;
1603 }
1604 numberresult = numberresults;
1605 }
1606
1607 /* step 3: format numbers and figure out how large a buffer we need */
1608 for (f = format; *f; f++) {
1609 if (*f == '%') {
1610 const char* p;
1611 int longflag;
1612 int longlongflag;
1613 int size_tflag;
1614 int numprinted;
1615
1616 p = f;
1617 zeropad = (f[1] == '0');
1618 f = parse_format_flags(f, &width, &precision,
1619 &longflag, &longlongflag, &size_tflag);
1620 switch (*f) {
1621 case 'c':
1622 {
1623 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001624 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625 n++;
1626 break;
1627 }
1628 case '%':
1629 n++;
1630 break;
1631 case 'i':
1632 case 'd':
1633 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1634 width, precision, *f);
1635 if (longflag)
1636 numprinted = sprintf(numberresult, fmt,
1637 va_arg(count, long));
1638#ifdef HAVE_LONG_LONG
1639 else if (longlongflag)
1640 numprinted = sprintf(numberresult, fmt,
1641 va_arg(count, PY_LONG_LONG));
1642#endif
1643 else if (size_tflag)
1644 numprinted = sprintf(numberresult, fmt,
1645 va_arg(count, Py_ssize_t));
1646 else
1647 numprinted = sprintf(numberresult, fmt,
1648 va_arg(count, int));
1649 n += numprinted;
1650 /* advance by +1 to skip over the '\0' */
1651 numberresult += (numprinted + 1);
1652 assert(*(numberresult - 1) == '\0');
1653 assert(*(numberresult - 2) != '\0');
1654 assert(numprinted >= 0);
1655 assert(numberresult <= numberresults + numbersize);
1656 break;
1657 case 'u':
1658 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1659 width, precision, 'u');
1660 if (longflag)
1661 numprinted = sprintf(numberresult, fmt,
1662 va_arg(count, unsigned long));
1663#ifdef HAVE_LONG_LONG
1664 else if (longlongflag)
1665 numprinted = sprintf(numberresult, fmt,
1666 va_arg(count, unsigned PY_LONG_LONG));
1667#endif
1668 else if (size_tflag)
1669 numprinted = sprintf(numberresult, fmt,
1670 va_arg(count, size_t));
1671 else
1672 numprinted = sprintf(numberresult, fmt,
1673 va_arg(count, unsigned int));
1674 n += numprinted;
1675 numberresult += (numprinted + 1);
1676 assert(*(numberresult - 1) == '\0');
1677 assert(*(numberresult - 2) != '\0');
1678 assert(numprinted >= 0);
1679 assert(numberresult <= numberresults + numbersize);
1680 break;
1681 case 'x':
1682 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1683 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1684 n += numprinted;
1685 numberresult += (numprinted + 1);
1686 assert(*(numberresult - 1) == '\0');
1687 assert(*(numberresult - 2) != '\0');
1688 assert(numprinted >= 0);
1689 assert(numberresult <= numberresults + numbersize);
1690 break;
1691 case 'p':
1692 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1693 /* %p is ill-defined: ensure leading 0x. */
1694 if (numberresult[1] == 'X')
1695 numberresult[1] = 'x';
1696 else if (numberresult[1] != 'x') {
1697 memmove(numberresult + 2, numberresult,
1698 strlen(numberresult) + 1);
1699 numberresult[0] = '0';
1700 numberresult[1] = 'x';
1701 numprinted += 2;
1702 }
1703 n += numprinted;
1704 numberresult += (numprinted + 1);
1705 assert(*(numberresult - 1) == '\0');
1706 assert(*(numberresult - 2) != '\0');
1707 assert(numprinted >= 0);
1708 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001709 break;
1710 case 's':
1711 {
1712 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001713 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001714 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1715 if (!str)
1716 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717 /* since PyUnicode_DecodeUTF8 returns already flexible
1718 unicode objects, there is no need to call ready on them */
1719 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001720 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001722 /* Remember the str and switch to the next slot */
1723 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 break;
1725 }
1726 case 'U':
1727 {
1728 PyObject *obj = va_arg(count, PyObject *);
1729 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 if (PyUnicode_READY(obj) == -1)
1731 goto fail;
1732 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001733 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001735 break;
1736 }
1737 case 'V':
1738 {
1739 PyObject *obj = va_arg(count, PyObject *);
1740 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001741 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 assert(obj || str);
1743 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001744 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 if (PyUnicode_READY(obj) == -1)
1746 goto fail;
1747 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001748 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001750 *callresult++ = NULL;
1751 }
1752 else {
1753 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1754 if (!str_obj)
1755 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001757 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001759 *callresult++ = str_obj;
1760 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 break;
1762 }
1763 case 'S':
1764 {
1765 PyObject *obj = va_arg(count, PyObject *);
1766 PyObject *str;
1767 assert(obj);
1768 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001770 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001772 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001774 /* Remember the str and switch to the next slot */
1775 *callresult++ = str;
1776 break;
1777 }
1778 case 'R':
1779 {
1780 PyObject *obj = va_arg(count, PyObject *);
1781 PyObject *repr;
1782 assert(obj);
1783 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001785 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001787 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 /* Remember the repr and switch to the next slot */
1790 *callresult++ = repr;
1791 break;
1792 }
1793 case 'A':
1794 {
1795 PyObject *obj = va_arg(count, PyObject *);
1796 PyObject *ascii;
1797 assert(obj);
1798 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001802 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001804 /* Remember the repr and switch to the next slot */
1805 *callresult++ = ascii;
1806 break;
1807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 default:
1809 /* if we stumble upon an unknown
1810 formatting code, copy the rest of
1811 the format string to the output
1812 string. (we cannot just skip the
1813 code, since there's no way to know
1814 what's in the argument list) */
1815 n += strlen(p);
1816 goto expand;
1817 }
1818 } else
1819 n++;
1820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001821 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001822 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001824 we don't have to resize the string.
1825 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 if (!string)
1828 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 kind = PyUnicode_KIND(string);
1830 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001831 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001836 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001837
1838 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1840 /* checking for == because the last argument could be a empty
1841 string, which causes i to point to end, the assert at the end of
1842 the loop */
1843 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001844
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 switch (*f) {
1846 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001847 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 const int ordinal = va_arg(vargs, int);
1849 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001851 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001852 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001854 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001855 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 case 'p':
1857 /* unused, since we already have the result */
1858 if (*f == 'p')
1859 (void) va_arg(vargs, void *);
1860 else
1861 (void) va_arg(vargs, int);
1862 /* extract the result from numberresults and append. */
1863 for (; *numberresult; ++i, ++numberresult)
1864 PyUnicode_WRITE(kind, data, i, *numberresult);
1865 /* skip over the separating '\0' */
1866 assert(*numberresult == '\0');
1867 numberresult++;
1868 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001869 break;
1870 case 's':
1871 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001872 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001874 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 size = PyUnicode_GET_LENGTH(*callresult);
1876 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001877 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1878 *callresult, 0,
1879 size) < 0)
1880 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001882 /* We're done with the unicode()/repr() => forget it */
1883 Py_DECREF(*callresult);
1884 /* switch to next unicode()/repr() result */
1885 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 break;
1887 }
1888 case 'U':
1889 {
1890 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 Py_ssize_t size;
1892 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1893 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001894 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1895 obj, 0,
1896 size) < 0)
1897 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 break;
1900 }
1901 case 'V':
1902 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001905 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 size = PyUnicode_GET_LENGTH(obj);
1908 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001909 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1910 obj, 0,
1911 size) < 0)
1912 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001914 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 size = PyUnicode_GET_LENGTH(*callresult);
1916 assert(PyUnicode_KIND(*callresult) <=
1917 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001918 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1919 *callresult,
1920 0, size) < 0)
1921 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001923 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001924 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001925 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 break;
1927 }
1928 case 'S':
1929 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001930 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001931 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001932 /* unused, since we already have the result */
1933 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001935 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1936 *callresult, 0,
1937 PyUnicode_GET_LENGTH(*callresult)) < 0)
1938 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001940 /* We're done with the unicode()/repr() => forget it */
1941 Py_DECREF(*callresult);
1942 /* switch to next unicode()/repr() result */
1943 ++callresult;
1944 break;
1945 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001946 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001948 break;
1949 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 for (; *p; ++p, ++i)
1951 PyUnicode_WRITE(kind, data, i, *p);
1952 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 goto end;
1954 }
Victor Stinner1205f272010-09-11 00:54:47 +00001955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001956 else {
1957 assert(i < PyUnicode_GET_LENGTH(string));
1958 PyUnicode_WRITE(kind, data, i++, *f);
1959 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001964 if (callresults)
1965 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 if (numberresults)
1967 PyObject_Free(numberresults);
1968 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001969 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001970 if (callresults) {
1971 PyObject **callresult2 = callresults;
1972 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001973 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001974 ++callresult2;
1975 }
1976 PyObject_Free(callresults);
1977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (numberresults)
1979 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001980 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001981}
1982
Walter Dörwaldd2034312007-05-18 16:29:38 +00001983PyObject *
1984PyUnicode_FromFormat(const char *format, ...)
1985{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001986 PyObject* ret;
1987 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001988
1989#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001991#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001992 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001993#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001994 ret = PyUnicode_FromFormatV(format, vargs);
1995 va_end(vargs);
1996 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001997}
1998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999#ifdef HAVE_WCHAR_H
2000
Victor Stinner5593d8a2010-10-02 11:11:27 +00002001/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2002 convert a Unicode object to a wide character string.
2003
Victor Stinnerd88d9832011-09-06 02:00:05 +02002004 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002005 character) required to convert the unicode object. Ignore size argument.
2006
Victor Stinnerd88d9832011-09-06 02:00:05 +02002007 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002008 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002009 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002010static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002011unicode_aswidechar(PyUnicodeObject *unicode,
2012 wchar_t *w,
2013 Py_ssize_t size)
2014{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002015 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 const wchar_t *wstr;
2017
2018 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2019 if (wstr == NULL)
2020 return -1;
2021
Victor Stinner5593d8a2010-10-02 11:11:27 +00002022 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002023 if (size > res)
2024 size = res + 1;
2025 else
2026 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002028 return res;
2029 }
2030 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002032}
2033
2034Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002035PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002036 wchar_t *w,
2037 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038{
2039 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 PyErr_BadInternalCall();
2041 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002043 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044}
2045
Victor Stinner137c34c2010-09-29 10:25:54 +00002046wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002047PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002048 Py_ssize_t *size)
2049{
2050 wchar_t* buffer;
2051 Py_ssize_t buflen;
2052
2053 if (unicode == NULL) {
2054 PyErr_BadInternalCall();
2055 return NULL;
2056 }
2057
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002058 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 if (buflen == -1)
2060 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002061 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002062 PyErr_NoMemory();
2063 return NULL;
2064 }
2065
Victor Stinner137c34c2010-09-29 10:25:54 +00002066 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2067 if (buffer == NULL) {
2068 PyErr_NoMemory();
2069 return NULL;
2070 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002071 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 if (buflen == -1)
2073 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002074 if (size != NULL)
2075 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002076 return buffer;
2077}
2078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080
Alexander Belopolsky40018472011-02-26 01:02:56 +00002081PyObject *
2082PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002085 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002086 PyErr_SetString(PyExc_ValueError,
2087 "chr() arg not in range(0x110000)");
2088 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002089 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (ordinal < 256)
2092 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 v = PyUnicode_New(1, ordinal);
2095 if (v == NULL)
2096 return NULL;
2097 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2098 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002099}
2100
Alexander Belopolsky40018472011-02-26 01:02:56 +00002101PyObject *
2102PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002104 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002106 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002107 if (PyUnicode_READY(obj))
2108 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 Py_INCREF(obj);
2110 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002111 }
2112 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 /* For a Unicode subtype that's not a Unicode object,
2114 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002115 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002116 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002117 PyErr_Format(PyExc_TypeError,
2118 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002119 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002121}
2122
Alexander Belopolsky40018472011-02-26 01:02:56 +00002123PyObject *
2124PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002125 const char *encoding,
2126 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002127{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002128 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002129 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002130
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002132 PyErr_BadInternalCall();
2133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002135
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002136 /* Decoding bytes objects is the most common case and should be fast */
2137 if (PyBytes_Check(obj)) {
2138 if (PyBytes_GET_SIZE(obj) == 0) {
2139 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002140 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002141 }
2142 else {
2143 v = PyUnicode_Decode(
2144 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2145 encoding, errors);
2146 }
2147 return v;
2148 }
2149
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002150 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002151 PyErr_SetString(PyExc_TypeError,
2152 "decoding str is not supported");
2153 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002155
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002156 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2157 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2158 PyErr_Format(PyExc_TypeError,
2159 "coercing to str: need bytes, bytearray "
2160 "or buffer-like object, %.80s found",
2161 Py_TYPE(obj)->tp_name);
2162 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002163 }
Tim Petersced69f82003-09-16 20:30:58 +00002164
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002165 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002166 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002167 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 }
Tim Petersced69f82003-09-16 20:30:58 +00002169 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002170 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002171
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002172 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002173 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174}
2175
/* Write a normalized copy of encoding into lower: upper case letters become
   lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   lower_len is the size of the lower buffer, terminator included.
   Return 1 on success, 0 when encoding has more than lower_len-1
   characters (lower is then left without a terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* last usable position, reserved for the terminating '\0' */
    char *const limit = &lower[lower_len - 1];

    for (in = encoding; *in; in++) {
        char ch;

        if (out == limit)
            return 0;

        if (Py_ISUPPER(*in))
            ch = Py_TOLOWER(*in);
        else if (*in == '_')
            ch = '-';
        else
            ch = *in;
        *out++ = ch;
    }
    *out = '\0';
    return 1;
}
2208
Alexander Belopolsky40018472011-02-26 01:02:56 +00002209PyObject *
2210PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002211 Py_ssize_t size,
2212 const char *encoding,
2213 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002214{
2215 PyObject *buffer = NULL, *unicode;
2216 Py_buffer info;
2217 char lower[11]; /* Enough for any encoding shortcut */
2218
2219 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002220 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002221
2222 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002223 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002224 if ((strcmp(lower, "utf-8") == 0) ||
2225 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002226 return PyUnicode_DecodeUTF8(s, size, errors);
2227 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002228 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002229 (strcmp(lower, "iso-8859-1") == 0))
2230 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002231#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002232 else if (strcmp(lower, "mbcs") == 0)
2233 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002234#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002235 else if (strcmp(lower, "ascii") == 0)
2236 return PyUnicode_DecodeASCII(s, size, errors);
2237 else if (strcmp(lower, "utf-16") == 0)
2238 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2239 else if (strcmp(lower, "utf-32") == 0)
2240 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242
2243 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002244 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002245 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002246 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002247 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (buffer == NULL)
2249 goto onError;
2250 unicode = PyCodec_Decode(buffer, encoding, errors);
2251 if (unicode == NULL)
2252 goto onError;
2253 if (!PyUnicode_Check(unicode)) {
2254 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002255 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002256 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 Py_DECREF(unicode);
2258 goto onError;
2259 }
2260 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 if (PyUnicode_READY(unicode)) {
2262 Py_DECREF(unicode);
2263 return NULL;
2264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002266
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 Py_XDECREF(buffer);
2269 return NULL;
2270}
2271
Alexander Belopolsky40018472011-02-26 01:02:56 +00002272PyObject *
2273PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002274 const char *encoding,
2275 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002276{
2277 PyObject *v;
2278
2279 if (!PyUnicode_Check(unicode)) {
2280 PyErr_BadArgument();
2281 goto onError;
2282 }
2283
2284 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002286
2287 /* Decode via the codec registry */
2288 v = PyCodec_Decode(unicode, encoding, errors);
2289 if (v == NULL)
2290 goto onError;
2291 return v;
2292
Benjamin Peterson29060642009-01-31 22:14:21 +00002293 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002294 return NULL;
2295}
2296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
2298PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002299 const char *encoding,
2300 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002301{
2302 PyObject *v;
2303
2304 if (!PyUnicode_Check(unicode)) {
2305 PyErr_BadArgument();
2306 goto onError;
2307 }
2308
2309 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002310 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002311
2312 /* Decode via the codec registry */
2313 v = PyCodec_Decode(unicode, encoding, errors);
2314 if (v == NULL)
2315 goto onError;
2316 if (!PyUnicode_Check(v)) {
2317 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002318 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002319 Py_TYPE(v)->tp_name);
2320 Py_DECREF(v);
2321 goto onError;
2322 }
2323 return v;
2324
Benjamin Peterson29060642009-01-31 22:14:21 +00002325 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002326 return NULL;
2327}
2328
Alexander Belopolsky40018472011-02-26 01:02:56 +00002329PyObject *
2330PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002331 Py_ssize_t size,
2332 const char *encoding,
2333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334{
2335 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002336
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 unicode = PyUnicode_FromUnicode(s, size);
2338 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2341 Py_DECREF(unicode);
2342 return v;
2343}
2344
Alexander Belopolsky40018472011-02-26 01:02:56 +00002345PyObject *
2346PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002347 const char *encoding,
2348 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002349{
2350 PyObject *v;
2351
2352 if (!PyUnicode_Check(unicode)) {
2353 PyErr_BadArgument();
2354 goto onError;
2355 }
2356
2357 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002358 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002359
2360 /* Encode via the codec registry */
2361 v = PyCodec_Encode(unicode, encoding, errors);
2362 if (v == NULL)
2363 goto onError;
2364 return v;
2365
Benjamin Peterson29060642009-01-31 22:14:21 +00002366 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002367 return NULL;
2368}
2369
Victor Stinnerad158722010-10-27 00:25:46 +00002370PyObject *
2371PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002372{
Victor Stinner99b95382011-07-04 14:23:54 +02002373#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002374 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode),
2376 NULL);
2377#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002379#else
Victor Stinner793b5312011-04-27 00:24:21 +02002380 PyInterpreterState *interp = PyThreadState_GET()->interp;
2381 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2382 cannot use it to encode and decode filenames before it is loaded. Load
2383 the Python codec requires to encode at least its own filename. Use the C
2384 version of the locale codec until the codec registry is initialized and
2385 the Python codec is loaded.
2386
2387 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2388 cannot only rely on it: check also interp->fscodec_initialized for
2389 subinterpreters. */
2390 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002391 return PyUnicode_AsEncodedString(unicode,
2392 Py_FileSystemDefaultEncoding,
2393 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002394 }
2395 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002396 /* locale encoding with surrogateescape */
2397 wchar_t *wchar;
2398 char *bytes;
2399 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002400 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002401
2402 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2403 if (wchar == NULL)
2404 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002405 bytes = _Py_wchar2char(wchar, &error_pos);
2406 if (bytes == NULL) {
2407 if (error_pos != (size_t)-1) {
2408 char *errmsg = strerror(errno);
2409 PyObject *exc = NULL;
2410 if (errmsg == NULL)
2411 errmsg = "Py_wchar2char() failed";
2412 raise_encode_exception(&exc,
2413 "filesystemencoding",
2414 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2415 error_pos, error_pos+1,
2416 errmsg);
2417 Py_XDECREF(exc);
2418 }
2419 else
2420 PyErr_NoMemory();
2421 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002422 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002423 }
2424 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002425
2426 bytes_obj = PyBytes_FromString(bytes);
2427 PyMem_Free(bytes);
2428 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002429 }
Victor Stinnerad158722010-10-27 00:25:46 +00002430#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002431}
2432
Alexander Belopolsky40018472011-02-26 01:02:56 +00002433PyObject *
2434PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002435 const char *encoding,
2436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437{
2438 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002439 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 if (!PyUnicode_Check(unicode)) {
2442 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 }
Fred Drakee4315f52000-05-09 19:53:39 +00002445
Victor Stinner2f283c22011-03-02 01:21:46 +00002446 if (encoding == NULL) {
2447 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002449 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002451 }
Fred Drakee4315f52000-05-09 19:53:39 +00002452
2453 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002454 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002455 if ((strcmp(lower, "utf-8") == 0) ||
2456 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002457 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002458 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002460 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002462 }
Victor Stinner37296e82010-06-10 13:36:23 +00002463 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002464 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002465 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002467#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002468 else if (strcmp(lower, "mbcs") == 0)
2469 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2470 PyUnicode_GET_SIZE(unicode),
2471 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002472#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002473 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476
2477 /* Encode via the codec registry */
2478 v = PyCodec_Encode(unicode, encoding, errors);
2479 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002480 return NULL;
2481
2482 /* The normal path */
2483 if (PyBytes_Check(v))
2484 return v;
2485
2486 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002487 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002488 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002489 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002490
2491 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2492 "encoder %s returned bytearray instead of bytes",
2493 encoding);
2494 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002495 Py_DECREF(v);
2496 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002497 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002498
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002499 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2500 Py_DECREF(v);
2501 return b;
2502 }
2503
2504 PyErr_Format(PyExc_TypeError,
2505 "encoder did not return a bytes object (type=%.400s)",
2506 Py_TYPE(v)->tp_name);
2507 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002508 return NULL;
2509}
2510
Alexander Belopolsky40018472011-02-26 01:02:56 +00002511PyObject *
2512PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002513 const char *encoding,
2514 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002515{
2516 PyObject *v;
2517
2518 if (!PyUnicode_Check(unicode)) {
2519 PyErr_BadArgument();
2520 goto onError;
2521 }
2522
2523 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002525
2526 /* Encode via the codec registry */
2527 v = PyCodec_Encode(unicode, encoding, errors);
2528 if (v == NULL)
2529 goto onError;
2530 if (!PyUnicode_Check(v)) {
2531 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002532 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002533 Py_TYPE(v)->tp_name);
2534 Py_DECREF(v);
2535 goto onError;
2536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002538
Benjamin Peterson29060642009-01-31 22:14:21 +00002539 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 return NULL;
2541}
2542
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002543PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002544PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002545 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002546 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2547}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002548
Christian Heimes5894ba72007-11-04 11:43:14 +00002549PyObject*
2550PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2551{
Victor Stinner99b95382011-07-04 14:23:54 +02002552#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002553 return PyUnicode_DecodeMBCS(s, size, NULL);
2554#elif defined(__APPLE__)
2555 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2556#else
Victor Stinner793b5312011-04-27 00:24:21 +02002557 PyInterpreterState *interp = PyThreadState_GET()->interp;
2558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2559 cannot use it to encode and decode filenames before it is loaded. Load
2560 the Python codec requires to encode at least its own filename. Use the C
2561 version of the locale codec until the codec registry is initialized and
2562 the Python codec is loaded.
2563
2564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2565 cannot only rely on it: check also interp->fscodec_initialized for
2566 subinterpreters. */
2567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002568 return PyUnicode_Decode(s, size,
2569 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002570 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002571 }
2572 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002573 /* locale encoding with surrogateescape */
2574 wchar_t *wchar;
2575 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002576 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002577
2578 if (s[size] != '\0' || size != strlen(s)) {
2579 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2580 return NULL;
2581 }
2582
Victor Stinner168e1172010-10-16 23:16:16 +00002583 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002584 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002585 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002586
Victor Stinner168e1172010-10-16 23:16:16 +00002587 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002588 PyMem_Free(wchar);
2589 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002590 }
Victor Stinnerad158722010-10-27 00:25:46 +00002591#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002592}
2593
Martin v. Löwis011e8422009-05-05 04:43:17 +00002594
2595int
2596PyUnicode_FSConverter(PyObject* arg, void* addr)
2597{
2598 PyObject *output = NULL;
2599 Py_ssize_t size;
2600 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002601 if (arg == NULL) {
2602 Py_DECREF(*(PyObject**)addr);
2603 return 1;
2604 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002605 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002606 output = arg;
2607 Py_INCREF(output);
2608 }
2609 else {
2610 arg = PyUnicode_FromObject(arg);
2611 if (!arg)
2612 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002613 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002614 Py_DECREF(arg);
2615 if (!output)
2616 return 0;
2617 if (!PyBytes_Check(output)) {
2618 Py_DECREF(output);
2619 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2620 return 0;
2621 }
2622 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002623 size = PyBytes_GET_SIZE(output);
2624 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002625 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002626 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002627 Py_DECREF(output);
2628 return 0;
2629 }
2630 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002631 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002632}
2633
2634
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002635int
2636PyUnicode_FSDecoder(PyObject* arg, void* addr)
2637{
2638 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002639 if (arg == NULL) {
2640 Py_DECREF(*(PyObject**)addr);
2641 return 1;
2642 }
2643 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 if (PyUnicode_READY(arg))
2645 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002646 output = arg;
2647 Py_INCREF(output);
2648 }
2649 else {
2650 arg = PyBytes_FromObject(arg);
2651 if (!arg)
2652 return 0;
2653 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2654 PyBytes_GET_SIZE(arg));
2655 Py_DECREF(arg);
2656 if (!output)
2657 return 0;
2658 if (!PyUnicode_Check(output)) {
2659 Py_DECREF(output);
2660 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2661 return 0;
2662 }
2663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2665 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002666 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2667 Py_DECREF(output);
2668 return 0;
2669 }
2670 *(PyObject**)addr = output;
2671 return Py_CLEANUP_SUPPORTED;
2672}
2673
2674
Martin v. Löwis5b222132007-06-10 09:51:05 +00002675char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002677{
Christian Heimesf3863112007-11-22 07:46:41 +00002678 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2680
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002681 if (!PyUnicode_Check(unicode)) {
2682 PyErr_BadArgument();
2683 return NULL;
2684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002686 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002688 if (PyUnicode_UTF8(unicode) == NULL) {
2689 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2691 if (bytes == NULL)
2692 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002693 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2694 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 Py_DECREF(bytes);
2696 return NULL;
2697 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002698 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2699 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 Py_DECREF(bytes);
2701 }
2702
2703 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002704 *psize = PyUnicode_UTF8_LENGTH(unicode);
2705 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002706}
2707
2708char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2712}
2713
2714#ifdef Py_DEBUG
2715int unicode_as_unicode_calls = 0;
2716#endif
2717
2718
2719Py_UNICODE *
2720PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2721{
2722 PyUnicodeObject *u;
2723 const unsigned char *one_byte;
2724#if SIZEOF_WCHAR_T == 4
2725 const Py_UCS2 *two_bytes;
2726#else
2727 const Py_UCS4 *four_bytes;
2728 const Py_UCS4 *ucs4_end;
2729 Py_ssize_t num_surrogates;
2730#endif
2731 wchar_t *w;
2732 wchar_t *wchar_end;
2733
2734 if (!PyUnicode_Check(unicode)) {
2735 PyErr_BadArgument();
2736 return NULL;
2737 }
2738 u = (PyUnicodeObject*)unicode;
2739 if (_PyUnicode_WSTR(u) == NULL) {
2740 /* Non-ASCII compact unicode object */
2741 assert(_PyUnicode_KIND(u) != 0);
2742 assert(PyUnicode_IS_READY(u));
2743
2744#ifdef Py_DEBUG
2745 ++unicode_as_unicode_calls;
2746#endif
2747
2748 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2749#if SIZEOF_WCHAR_T == 2
2750 four_bytes = PyUnicode_4BYTE_DATA(u);
2751 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2752 num_surrogates = 0;
2753
2754 for (; four_bytes < ucs4_end; ++four_bytes) {
2755 if (*four_bytes > 0xFFFF)
2756 ++num_surrogates;
2757 }
2758
2759 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2760 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2761 if (!_PyUnicode_WSTR(u)) {
2762 PyErr_NoMemory();
2763 return NULL;
2764 }
2765 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2766
2767 w = _PyUnicode_WSTR(u);
2768 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2769 four_bytes = PyUnicode_4BYTE_DATA(u);
2770 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2771 if (*four_bytes > 0xFFFF) {
2772 /* encode surrogate pair in this case */
2773 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2774 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2775 }
2776 else
2777 *w = *four_bytes;
2778
2779 if (w > wchar_end) {
2780 assert(0 && "Miscalculated string end");
2781 }
2782 }
2783 *w = 0;
2784#else
2785 /* sizeof(wchar_t) == 4 */
2786 Py_FatalError("Impossible unicode object state, wstr and str "
2787 "should share memory already.");
2788 return NULL;
2789#endif
2790 }
2791 else {
2792 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2793 (_PyUnicode_LENGTH(u) + 1));
2794 if (!_PyUnicode_WSTR(u)) {
2795 PyErr_NoMemory();
2796 return NULL;
2797 }
2798 if (!PyUnicode_IS_COMPACT_ASCII(u))
2799 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2800 w = _PyUnicode_WSTR(u);
2801 wchar_end = w + _PyUnicode_LENGTH(u);
2802
2803 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2804 one_byte = PyUnicode_1BYTE_DATA(u);
2805 for (; w < wchar_end; ++one_byte, ++w)
2806 *w = *one_byte;
2807 /* null-terminate the wstr */
2808 *w = 0;
2809 }
2810 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2811#if SIZEOF_WCHAR_T == 4
2812 two_bytes = PyUnicode_2BYTE_DATA(u);
2813 for (; w < wchar_end; ++two_bytes, ++w)
2814 *w = *two_bytes;
2815 /* null-terminate the wstr */
2816 *w = 0;
2817#else
2818 /* sizeof(wchar_t) == 2 */
2819 PyObject_FREE(_PyUnicode_WSTR(u));
2820 _PyUnicode_WSTR(u) = NULL;
2821 Py_FatalError("Impossible unicode object state, wstr "
2822 "and str should share memory already.");
2823 return NULL;
2824#endif
2825 }
2826 else {
2827 assert(0 && "This should never happen.");
2828 }
2829 }
2830 }
2831 if (size != NULL)
2832 *size = PyUnicode_WSTR_LENGTH(u);
2833 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002834}
2835
Alexander Belopolsky40018472011-02-26 01:02:56 +00002836Py_UNICODE *
2837PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840}
2841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843Py_ssize_t
2844PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845{
2846 if (!PyUnicode_Check(unicode)) {
2847 PyErr_BadArgument();
2848 goto onError;
2849 }
2850 return PyUnicode_GET_SIZE(unicode);
2851
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 return -1;
2854}
2855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856Py_ssize_t
2857PyUnicode_GetLength(PyObject *unicode)
2858{
Victor Stinner5a706cf2011-10-02 00:36:53 +02002859 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002860 PyErr_BadArgument();
2861 return -1;
2862 }
2863
2864 return PyUnicode_GET_LENGTH(unicode);
2865}
2866
2867Py_UCS4
2868PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2869{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002870 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2871 PyErr_BadArgument();
2872 return (Py_UCS4)-1;
2873 }
2874 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2875 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002876 return (Py_UCS4)-1;
2877 }
2878 return PyUnicode_READ_CHAR(unicode, index);
2879}
2880
2881int
2882PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2883{
2884 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002885 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 return -1;
2887 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002888 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2889 PyErr_SetString(PyExc_IndexError, "string index out of range");
2890 return -1;
2891 }
2892 if (_PyUnicode_Dirty(unicode))
2893 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2895 index, ch);
2896 return 0;
2897}
2898
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; the caller must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2904
Victor Stinner554f3f02010-06-16 23:33:54 +00002905/* create or adjust a UnicodeDecodeError */
2906static void
2907make_decode_exception(PyObject **exceptionObject,
2908 const char *encoding,
2909 const char *input, Py_ssize_t length,
2910 Py_ssize_t startpos, Py_ssize_t endpos,
2911 const char *reason)
2912{
2913 if (*exceptionObject == NULL) {
2914 *exceptionObject = PyUnicodeDecodeError_Create(
2915 encoding, input, length, startpos, endpos, reason);
2916 }
2917 else {
2918 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2919 goto onError;
2920 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2921 goto onError;
2922 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2923 goto onError;
2924 }
2925 return;
2926
2927onError:
2928 Py_DECREF(*exceptionObject);
2929 *exceptionObject = NULL;
2930}
2931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932/* error handling callback helper:
2933 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002934 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 and adjust various state variables.
2936 return 0 on success, -1 on error
2937*/
2938
Alexander Belopolsky40018472011-02-26 01:02:56 +00002939static int
2940unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002941 const char *encoding, const char *reason,
2942 const char **input, const char **inend, Py_ssize_t *startinpos,
2943 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2944 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002946 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947
2948 PyObject *restuple = NULL;
2949 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002950 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002951 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002952 Py_ssize_t requiredsize;
2953 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002955 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002956 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 int res = -1;
2958
2959 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 *errorHandler = PyCodec_LookupError(errors);
2961 if (*errorHandler == NULL)
2962 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002963 }
2964
Victor Stinner554f3f02010-06-16 23:33:54 +00002965 make_decode_exception(exceptionObject,
2966 encoding,
2967 *input, *inend - *input,
2968 *startinpos, *endinpos,
2969 reason);
2970 if (*exceptionObject == NULL)
2971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972
2973 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2974 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002977 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 }
2980 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002982
2983 /* Copy back the bytes variables, which might have been modified by the
2984 callback */
2985 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2986 if (!inputobj)
2987 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002988 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002990 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002991 *input = PyBytes_AS_STRING(inputobj);
2992 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002993 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002994 /* we can DECREF safely, as the exception has another reference,
2995 so the object won't go away. */
2996 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003000 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3002 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003003 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003004
3005 /* need more space? (at least enough for what we
3006 have+the replacement+the rest of the string (starting
3007 at the new input position), so we won't have to check space
3008 when there are no errors in the rest of the string) */
3009 repptr = PyUnicode_AS_UNICODE(repunicode);
3010 repsize = PyUnicode_GET_SIZE(repunicode);
3011 requiredsize = *outpos + repsize + insize-newpos;
3012 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 if (requiredsize<2*outsize)
3014 requiredsize = 2*outsize;
3015 if (_PyUnicode_Resize(output, requiredsize) < 0)
3016 goto onError;
3017 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003018 }
3019 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003020 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003021 Py_UNICODE_COPY(*outptr, repptr, repsize);
3022 *outptr += repsize;
3023 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* we made it! */
3026 res = 0;
3027
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 Py_XDECREF(restuple);
3030 return res;
3031}
3032
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Note that each of them evaluates its argument more than
   once, except TO_BASE64. */

/* True if c is one of the 64 base-64 characters:
   A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* The 6-bit value of base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; anything unrecognised maps to 63. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* The base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found outside a shift sequence stands
 * for itself.  Decoding is permissive: every ASCII byte qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
3067
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; NUL and anything >= 128 are never
 * direct. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003113
Alexander Belopolsky40018472011-02-26 01:02:56 +00003114PyObject *
3115PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003116 Py_ssize_t size,
3117 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003118{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003119 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3120}
3121
Antoine Pitrou244651a2009-05-04 18:56:13 +00003122/* The decoder. The only state we preserve is our read position,
3123 * i.e. how many characters we have consumed. So if we end in the
3124 * middle of a shift sequence we have to back off the read position
3125 * and the output to the beginning of the sequence, otherwise we lose
3126 * all the shift state (seen bits, number of bits seen, high
3127 * surrogate). */
3128
Alexander Belopolsky40018472011-02-26 01:02:56 +00003129PyObject *
3130PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003131 Py_ssize_t size,
3132 const char *errors,
3133 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003134{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003136 Py_ssize_t startinpos;
3137 Py_ssize_t endinpos;
3138 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 const char *e;
3140 PyUnicodeObject *unicode;
3141 Py_UNICODE *p;
3142 const char *errmsg = "";
3143 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003144 Py_UNICODE *shiftOutStart;
3145 unsigned int base64bits = 0;
3146 unsigned long base64buffer = 0;
3147 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003148 PyObject *errorHandler = NULL;
3149 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150
3151 unicode = _PyUnicode_New(size);
3152 if (!unicode)
3153 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003154 if (size == 0) {
3155 if (consumed)
3156 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003157 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003158 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003161 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003162 e = s + size;
3163
3164 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003167 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003168
Antoine Pitrou244651a2009-05-04 18:56:13 +00003169 if (inShift) { /* in a base-64 section */
3170 if (IS_BASE64(ch)) { /* consume a base-64 character */
3171 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3172 base64bits += 6;
3173 s++;
3174 if (base64bits >= 16) {
3175 /* we have enough bits for a UTF-16 value */
3176 Py_UNICODE outCh = (Py_UNICODE)
3177 (base64buffer >> (base64bits-16));
3178 base64bits -= 16;
3179 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3180 if (surrogate) {
3181 /* expecting a second surrogate */
3182 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3183#ifdef Py_UNICODE_WIDE
3184 *p++ = (((surrogate & 0x3FF)<<10)
3185 | (outCh & 0x3FF)) + 0x10000;
3186#else
3187 *p++ = surrogate;
3188 *p++ = outCh;
3189#endif
3190 surrogate = 0;
3191 }
3192 else {
3193 surrogate = 0;
3194 errmsg = "second surrogate missing";
3195 goto utf7Error;
3196 }
3197 }
3198 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3199 /* first surrogate */
3200 surrogate = outCh;
3201 }
3202 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3203 errmsg = "unexpected second surrogate";
3204 goto utf7Error;
3205 }
3206 else {
3207 *p++ = outCh;
3208 }
3209 }
3210 }
3211 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003212 inShift = 0;
3213 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003214 if (surrogate) {
3215 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003216 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003218 if (base64bits > 0) { /* left-over bits */
3219 if (base64bits >= 6) {
3220 /* We've seen at least one base-64 character */
3221 errmsg = "partial character in shift sequence";
3222 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003223 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003224 else {
3225 /* Some bits remain; they should be zero */
3226 if (base64buffer != 0) {
3227 errmsg = "non-zero padding bits in shift sequence";
3228 goto utf7Error;
3229 }
3230 }
3231 }
3232 if (ch != '-') {
3233 /* '-' is absorbed; other terminating
3234 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 *p++ = ch;
3236 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003237 }
3238 }
3239 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003241 s++; /* consume '+' */
3242 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243 s++;
3244 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003245 }
3246 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003248 shiftOutStart = p;
3249 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003250 }
3251 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003252 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003253 *p++ = ch;
3254 s++;
3255 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003256 else {
3257 startinpos = s-starts;
3258 s++;
3259 errmsg = "unexpected special character";
3260 goto utf7Error;
3261 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003263utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 outpos = p-PyUnicode_AS_UNICODE(unicode);
3265 endinpos = s-starts;
3266 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 errors, &errorHandler,
3268 "utf7", errmsg,
3269 &starts, &e, &startinpos, &endinpos, &exc, &s,
3270 &unicode, &outpos, &p))
3271 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272 }
3273
Antoine Pitrou244651a2009-05-04 18:56:13 +00003274 /* end of string */
3275
3276 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3277 /* if we're in an inconsistent state, that's an error */
3278 if (surrogate ||
3279 (base64bits >= 6) ||
3280 (base64bits > 0 && base64buffer != 0)) {
3281 outpos = p-PyUnicode_AS_UNICODE(unicode);
3282 endinpos = size;
3283 if (unicode_decode_call_errorhandler(
3284 errors, &errorHandler,
3285 "utf7", "unterminated shift sequence",
3286 &starts, &e, &startinpos, &endinpos, &exc, &s,
3287 &unicode, &outpos, &p))
3288 goto onError;
3289 if (s < e)
3290 goto restart;
3291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003293
3294 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003295 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003296 if (inShift) {
3297 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003298 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 }
3300 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003301 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003302 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003303 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003304
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003305 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003306 goto onError;
3307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 Py_XDECREF(errorHandler);
3309 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 if (PyUnicode_READY(unicode) == -1) {
3311 Py_DECREF(unicode);
3312 return NULL;
3313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 return (PyObject *)unicode;
3315
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 Py_XDECREF(errorHandler);
3318 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003319 Py_DECREF(unicode);
3320 return NULL;
3321}
3322
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 Py_ssize_t size,
3327 int base64SetO,
3328 int base64WhiteSpace,
3329 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003330{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003331 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003332 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003333 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003334 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003335 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003336 unsigned int base64bits = 0;
3337 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338 char * out;
3339 char * start;
3340
3341 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003343
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003344 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003345 return PyErr_NoMemory();
3346
Antoine Pitrou244651a2009-05-04 18:56:13 +00003347 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003348 if (v == NULL)
3349 return NULL;
3350
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003351 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003352 for (;i < size; ++i) {
3353 Py_UNICODE ch = s[i];
3354
Antoine Pitrou244651a2009-05-04 18:56:13 +00003355 if (inShift) {
3356 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3357 /* shifting out */
3358 if (base64bits) { /* output remaining bits */
3359 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3360 base64buffer = 0;
3361 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003362 }
3363 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003364 /* Characters not in the BASE64 set implicitly unshift the sequence
3365 so no '-' is required, except if the character is itself a '-' */
3366 if (IS_BASE64(ch) || ch == '-') {
3367 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003368 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003369 *out++ = (char) ch;
3370 }
3371 else {
3372 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003373 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003374 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003375 else { /* not in a shift sequence */
3376 if (ch == '+') {
3377 *out++ = '+';
3378 *out++ = '-';
3379 }
3380 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3381 *out++ = (char) ch;
3382 }
3383 else {
3384 *out++ = '+';
3385 inShift = 1;
3386 goto encode_char;
3387 }
3388 }
3389 continue;
3390encode_char:
3391#ifdef Py_UNICODE_WIDE
3392 if (ch >= 0x10000) {
3393 /* code first surrogate */
3394 base64bits += 16;
3395 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3396 while (base64bits >= 6) {
3397 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3398 base64bits -= 6;
3399 }
3400 /* prepare second surrogate */
3401 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3402 }
3403#endif
3404 base64bits += 16;
3405 base64buffer = (base64buffer << 16) | ch;
3406 while (base64bits >= 6) {
3407 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3408 base64bits -= 6;
3409 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003410 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003411 if (base64bits)
3412 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3413 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003414 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003415 if (_PyBytes_Resize(&v, out - start) < 0)
3416 return NULL;
3417 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003418}
3419
Antoine Pitrou244651a2009-05-04 18:56:13 +00003420#undef IS_BASE64
3421#undef FROM_BASE64
3422#undef TO_BASE64
3423#undef DECODE_DIRECT
3424#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003425
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3449
Alexander Belopolsky40018472011-02-26 01:02:56 +00003450PyObject *
3451PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003452 Py_ssize_t size,
3453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454{
Walter Dörwald69652032004-09-07 20:24:22 +00003455 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3456}
3457
Antoine Pitrouab868312009-01-10 15:40:25 +00003458/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3459#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3460
3461/* Mask to quickly check whether a C 'long' contains a
3462 non-ASCII, UTF8-encoded char. */
3463#if (SIZEOF_LONG == 8)
3464# define ASCII_CHAR_MASK 0x8080808080808080L
3465#elif (SIZEOF_LONG == 4)
3466# define ASCII_CHAR_MASK 0x80808080L
3467#else
3468# error C 'long' size should be either 4 or 8!
3469#endif
3470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471/* Scans a UTF-8 string and returns the maximum character to be expected,
3472 the size of the decoded unicode string and if any major errors were
3473 encountered.
3474
3475 This function does check basic UTF-8 sanity, it does however NOT CHECK
3476 if the string contains surrogates, and if all continuation bytes are
3477 within the correct ranges, these checks are performed in
3478 PyUnicode_DecodeUTF8Stateful.
3479
3480 If it sets has_errors to 1, it means the value of unicode_size and max_char
3481 will be bogus and you should not rely on useful information in them.
3482 */
3483static Py_UCS4
3484utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3485 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3486 int *has_errors)
3487{
3488 Py_ssize_t n;
3489 Py_ssize_t char_count = 0;
3490 Py_UCS4 max_char = 127, new_max;
3491 Py_UCS4 upper_bound;
3492 const unsigned char *p = (const unsigned char *)s;
3493 const unsigned char *end = p + string_size;
3494 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3495 int err = 0;
3496
3497 for (; p < end && !err; ++p, ++char_count) {
3498 /* Only check value if it's not a ASCII char... */
3499 if (*p < 0x80) {
3500 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3501 an explanation. */
3502 if (!((size_t) p & LONG_PTR_MASK)) {
3503 /* Help register allocation */
3504 register const unsigned char *_p = p;
3505 while (_p < aligned_end) {
3506 unsigned long value = *(unsigned long *) _p;
3507 if (value & ASCII_CHAR_MASK)
3508 break;
3509 _p += SIZEOF_LONG;
3510 char_count += SIZEOF_LONG;
3511 }
3512 p = _p;
3513 if (p == end)
3514 break;
3515 }
3516 }
3517 if (*p >= 0x80) {
3518 n = utf8_code_length[*p];
3519 new_max = max_char;
3520 switch (n) {
3521 /* invalid start byte */
3522 case 0:
3523 err = 1;
3524 break;
3525 case 2:
3526 /* Code points between 0x00FF and 0x07FF inclusive.
3527 Approximate the upper bound of the code point,
3528 if this flips over 255 we can be sure it will be more
3529 than 255 and the string will need 2 bytes per code coint,
3530 if it stays under or equal to 255, we can be sure 1 byte
3531 is enough.
3532 ((*p & 0b00011111) << 6) | 0b00111111 */
3533 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3534 if (max_char < upper_bound)
3535 new_max = upper_bound;
3536 /* Ensure we track at least that we left ASCII space. */
3537 if (new_max < 128)
3538 new_max = 128;
3539 break;
3540 case 3:
3541 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3542 always > 255 and <= 65535 and will always need 2 bytes. */
3543 if (max_char < 65535)
3544 new_max = 65535;
3545 break;
3546 case 4:
3547 /* Code point will be above 0xFFFF for sure in this case. */
3548 new_max = 65537;
3549 break;
3550 /* Internal error, this should be caught by the first if */
3551 case 1:
3552 default:
3553 assert(0 && "Impossible case in utf8_max_char_and_size");
3554 err = 1;
3555 }
3556 /* Instead of number of overall bytes for this code point,
3557 n containts the number of following bytes: */
3558 --n;
3559 /* Check if the follow up chars are all valid continuation bytes */
3560 if (n >= 1) {
3561 const unsigned char *cont;
3562 if ((p + n) >= end) {
3563 if (consumed == 0)
3564 /* incomplete data, non-incremental decoding */
3565 err = 1;
3566 break;
3567 }
3568 for (cont = p + 1; cont < (p + n); ++cont) {
3569 if ((*cont & 0xc0) != 0x80) {
3570 err = 1;
3571 break;
3572 }
3573 }
3574 p += n;
3575 }
3576 else
3577 err = 1;
3578 max_char = new_max;
3579 }
3580 }
3581
3582 if (unicode_size)
3583 *unicode_size = char_count;
3584 if (has_errors)
3585 *has_errors = err;
3586 return max_char;
3587}
3588
/* Store `value` at position `index` of `buf`, with `buf` interpreted
   according to `kind`.  Similar to PyUnicode_WRITE but can also write
   into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND).  Any kind other than the WCHAR, 1BYTE and
   2BYTE ones is stored as Py_UCS4.  Each argument is evaluated exactly
   once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3603
Alexander Belopolsky40018472011-02-26 01:02:56 +00003604PyObject *
3605PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 Py_ssize_t size,
3607 const char *errors,
3608 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003612 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 Py_ssize_t startinpos;
3614 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003615 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003617 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 PyObject *errorHandler = NULL;
3619 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003620 Py_UCS4 maxchar = 0;
3621 Py_ssize_t unicode_size;
3622 Py_ssize_t i;
3623 int kind;
3624 void *data;
3625 int has_errors;
3626 Py_UNICODE *error_outptr;
3627#if SIZEOF_WCHAR_T == 2
3628 Py_ssize_t wchar_offset = 0;
3629#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630
Walter Dörwald69652032004-09-07 20:24:22 +00003631 if (size == 0) {
3632 if (consumed)
3633 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003634 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003636 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3637 consumed, &has_errors);
3638 if (has_errors) {
3639 unicode = _PyUnicode_New(size);
3640 if (!unicode)
3641 return NULL;
3642 kind = PyUnicode_WCHAR_KIND;
3643 data = PyUnicode_AS_UNICODE(unicode);
3644 assert(data != NULL);
3645 }
3646 else {
3647 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3648 if (!unicode)
3649 return NULL;
3650 /* When the string is ASCII only, just use memcpy and return.
3651 unicode_size may be != size if there is an incomplete UTF-8
3652 sequence at the end of the ASCII block. */
3653 if (maxchar < 128 && size == unicode_size) {
3654 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3655 return (PyObject *)unicode;
3656 }
3657 kind = PyUnicode_KIND(unicode);
3658 data = PyUnicode_DATA(unicode);
3659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003663 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664
3665 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003666 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667
3668 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003669 /* Fast path for runs of ASCII characters. Given that common UTF-8
3670 input will consist of an overwhelming majority of ASCII
3671 characters, we try to optimize for this case by checking
3672 as many characters as a C 'long' can contain.
3673 First, check if we can do an aligned read, as most CPUs have
3674 a penalty for unaligned reads.
3675 */
3676 if (!((size_t) s & LONG_PTR_MASK)) {
3677 /* Help register allocation */
3678 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003680 while (_s < aligned_end) {
3681 /* Read a whole long at a time (either 4 or 8 bytes),
3682 and do a fast unrolled copy if it only contains ASCII
3683 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684 unsigned long value = *(unsigned long *) _s;
3685 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003686 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3688 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3689 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3690 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003691#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3693 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3694 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3695 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003696#endif
3697 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003699 }
3700 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003702 if (s == e)
3703 break;
3704 ch = (unsigned char)*s;
3705 }
3706 }
3707
3708 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 s++;
3711 continue;
3712 }
3713
3714 n = utf8_code_length[ch];
3715
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003716 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 if (consumed)
3718 break;
3719 else {
3720 errmsg = "unexpected end of data";
3721 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003722 endinpos = startinpos+1;
3723 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3724 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 goto utf8Error;
3726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728
3729 switch (n) {
3730
3731 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003732 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 startinpos = s-starts;
3734 endinpos = startinpos+1;
3735 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736
3737 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003738 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003739 startinpos = s-starts;
3740 endinpos = startinpos+1;
3741 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742
3743 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003744 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003745 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003747 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 goto utf8Error;
3749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003751 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 break;
3754
3755 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003756 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3757 will result in surrogates in range d800-dfff. Surrogates are
3758 not valid UTF-8 so they are rejected.
3759 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3760 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003761 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003762 (s[2] & 0xc0) != 0x80 ||
3763 ((unsigned char)s[0] == 0xE0 &&
3764 (unsigned char)s[1] < 0xA0) ||
3765 ((unsigned char)s[0] == 0xED &&
3766 (unsigned char)s[1] > 0x9F)) {
3767 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003769 endinpos = startinpos + 1;
3770
3771 /* if s[1] first two bits are 1 and 0, then the invalid
3772 continuation byte is s[2], so increment endinpos by 1,
3773 if not, s[1] is invalid and endinpos doesn't need to
3774 be incremented. */
3775 if ((s[1] & 0xC0) == 0x80)
3776 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 goto utf8Error;
3778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003780 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003782 break;
3783
3784 case 4:
3785 if ((s[1] & 0xc0) != 0x80 ||
3786 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003787 (s[3] & 0xc0) != 0x80 ||
3788 ((unsigned char)s[0] == 0xF0 &&
3789 (unsigned char)s[1] < 0x90) ||
3790 ((unsigned char)s[0] == 0xF4 &&
3791 (unsigned char)s[1] > 0x8F)) {
3792 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003794 endinpos = startinpos + 1;
3795 if ((s[1] & 0xC0) == 0x80) {
3796 endinpos++;
3797 if ((s[2] & 0xC0) == 0x80)
3798 endinpos++;
3799 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 goto utf8Error;
3801 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003802 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003803 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3804 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 /* If the string is flexible or we have native UCS-4, write
3807 directly.. */
3808 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3809 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 else {
3812 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 /* translate from 10000..10FFFF to 0..FFFF */
3815 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 /* high surrogate = top 10 bits added to D800 */
3818 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3819 (Py_UNICODE)(0xD800 + (ch >> 10)));
3820
3821 /* low surrogate = bottom 10 bits added to DC00 */
3822 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3823 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3824 }
3825#if SIZEOF_WCHAR_T == 2
3826 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003827#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 }
3830 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003832
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 /* If this is not yet a resizable string, make it one.. */
3835 if (kind != PyUnicode_WCHAR_KIND) {
3836 const Py_UNICODE *u;
3837 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3838 if (!new_unicode)
3839 goto onError;
3840 u = PyUnicode_AsUnicode((PyObject *)unicode);
3841 if (!u)
3842 goto onError;
3843#if SIZEOF_WCHAR_T == 2
3844 i += wchar_offset;
3845#endif
3846 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3847 Py_DECREF(unicode);
3848 unicode = new_unicode;
3849 kind = 0;
3850 data = PyUnicode_AS_UNICODE(new_unicode);
3851 assert(data != NULL);
3852 }
3853 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003854 if (unicode_decode_call_errorhandler(
3855 errors, &errorHandler,
3856 "utf8", errmsg,
3857 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 /* Update data because unicode_decode_call_errorhandler might have
3861 re-created or resized the unicode object. */
3862 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 /* Ensure the unicode_size calculation above was correct: */
3866 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3867
Walter Dörwald69652032004-09-07 20:24:22 +00003868 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 /* Adjust length and ready string when it contained errors and
3872 is of the old resizable kind. */
3873 if (kind == PyUnicode_WCHAR_KIND) {
3874 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3875 PyUnicode_READY(unicode) == -1)
3876 goto onError;
3877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 Py_XDECREF(errorHandler);
3880 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 if (PyUnicode_READY(unicode) == -1) {
3882 Py_DECREF(unicode);
3883 return NULL;
3884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 return (PyObject *)unicode;
3886
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 Py_XDECREF(errorHandler);
3889 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 Py_DECREF(unicode);
3891 return NULL;
3892}
3893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003895
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003896#ifdef __APPLE__
3897
3898/* Simplified UTF-8 decoder using surrogateescape error handler,
3899 used to decode the command line arguments on Mac OS X. */
3900
3901wchar_t*
3902_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3903{
3904 int n;
3905 const char *e;
3906 wchar_t *unicode, *p;
3907
3908 /* Note: size will always be longer than the resulting Unicode
3909 character count */
3910 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3911 PyErr_NoMemory();
3912 return NULL;
3913 }
3914 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3915 if (!unicode)
3916 return NULL;
3917
3918 /* Unpack UTF-8 encoded data */
3919 p = unicode;
3920 e = s + size;
3921 while (s < e) {
3922 Py_UCS4 ch = (unsigned char)*s;
3923
3924 if (ch < 0x80) {
3925 *p++ = (wchar_t)ch;
3926 s++;
3927 continue;
3928 }
3929
3930 n = utf8_code_length[ch];
3931 if (s + n > e) {
3932 goto surrogateescape;
3933 }
3934
3935 switch (n) {
3936 case 0:
3937 case 1:
3938 goto surrogateescape;
3939
3940 case 2:
3941 if ((s[1] & 0xc0) != 0x80)
3942 goto surrogateescape;
3943 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3944 assert ((ch > 0x007F) && (ch <= 0x07FF));
3945 *p++ = (wchar_t)ch;
3946 break;
3947
3948 case 3:
3949 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3950 will result in surrogates in range d800-dfff. Surrogates are
3951 not valid UTF-8 so they are rejected.
3952 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3953 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3954 if ((s[1] & 0xc0) != 0x80 ||
3955 (s[2] & 0xc0) != 0x80 ||
3956 ((unsigned char)s[0] == 0xE0 &&
3957 (unsigned char)s[1] < 0xA0) ||
3958 ((unsigned char)s[0] == 0xED &&
3959 (unsigned char)s[1] > 0x9F)) {
3960
3961 goto surrogateescape;
3962 }
3963 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3964 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003966 break;
3967
3968 case 4:
3969 if ((s[1] & 0xc0) != 0x80 ||
3970 (s[2] & 0xc0) != 0x80 ||
3971 (s[3] & 0xc0) != 0x80 ||
3972 ((unsigned char)s[0] == 0xF0 &&
3973 (unsigned char)s[1] < 0x90) ||
3974 ((unsigned char)s[0] == 0xF4 &&
3975 (unsigned char)s[1] > 0x8F)) {
3976 goto surrogateescape;
3977 }
3978 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3979 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3980 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3981
3982#if SIZEOF_WCHAR_T == 4
3983 *p++ = (wchar_t)ch;
3984#else
3985 /* compute and append the two surrogates: */
3986
3987 /* translate from 10000..10FFFF to 0..FFFF */
3988 ch -= 0x10000;
3989
3990 /* high surrogate = top 10 bits added to D800 */
3991 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3992
3993 /* low surrogate = bottom 10 bits added to DC00 */
3994 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3995#endif
3996 break;
3997 }
3998 s += n;
3999 continue;
4000
4001 surrogateescape:
4002 *p++ = 0xDC00 + ch;
4003 s++;
4004 }
4005 *p = L'\0';
4006 return unicode;
4007}
4008
4009#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011/* Primary internal function which creates utf8 encoded bytes objects.
4012
4013 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004014 and allocate exactly as much space needed at the end. Else allocate the
4015 maximum possible needed (4 result bytes per Unicode character), and return
4016 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004017*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004018PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
Tim Peters602f7402002-04-27 18:03:26 +00004021#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004022
Guido van Rossum98297ee2007-11-06 21:34:58 +00004023 Py_ssize_t i; /* index into s of next input byte */
4024 PyObject *result; /* result string object */
4025 char *p; /* next free byte in output buffer */
4026 Py_ssize_t nallocated; /* number of result bytes allocated */
4027 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004028 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004029 PyObject *errorHandler = NULL;
4030 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 int kind;
4032 void *data;
4033 Py_ssize_t size;
4034 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4035#if SIZEOF_WCHAR_T == 2
4036 Py_ssize_t wchar_offset = 0;
4037#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 if (!PyUnicode_Check(unicode)) {
4040 PyErr_BadArgument();
4041 return NULL;
4042 }
4043
4044 if (PyUnicode_READY(unicode) == -1)
4045 return NULL;
4046
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004047 if (PyUnicode_UTF8(unicode))
4048 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4049 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050
4051 kind = PyUnicode_KIND(unicode);
4052 data = PyUnicode_DATA(unicode);
4053 size = PyUnicode_GET_LENGTH(unicode);
4054
Tim Peters602f7402002-04-27 18:03:26 +00004055 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
Tim Peters602f7402002-04-27 18:03:26 +00004057 if (size <= MAX_SHORT_UNICHARS) {
4058 /* Write into the stack buffer; nallocated can't overflow.
4059 * At the end, we'll allocate exactly as much heap space as it
4060 * turns out we need.
4061 */
4062 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004063 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004064 p = stackbuf;
4065 }
4066 else {
4067 /* Overallocate on the heap, and give the excess back at the end. */
4068 nallocated = size * 4;
4069 if (nallocated / 4 != size) /* overflow! */
4070 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004071 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004072 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004073 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004074 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004075 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004076
Tim Peters602f7402002-04-27 18:03:26 +00004077 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004079
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004080 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004081 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004083
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004085 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004086 *p++ = (char)(0xc0 | (ch >> 6));
4087 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004088 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 Py_ssize_t newpos;
4090 PyObject *rep;
4091 Py_ssize_t repsize, k, startpos;
4092 startpos = i-1;
4093#if SIZEOF_WCHAR_T == 2
4094 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004095#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 rep = unicode_encode_call_errorhandler(
4097 errors, &errorHandler, "utf-8", "surrogates not allowed",
4098 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4099 &exc, startpos, startpos+1, &newpos);
4100 if (!rep)
4101 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103 if (PyBytes_Check(rep))
4104 repsize = PyBytes_GET_SIZE(rep);
4105 else
4106 repsize = PyUnicode_GET_SIZE(rep);
4107
4108 if (repsize > 4) {
4109 Py_ssize_t offset;
4110
4111 if (result == NULL)
4112 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004113 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4117 /* integer overflow */
4118 PyErr_NoMemory();
4119 goto error;
4120 }
4121 nallocated += repsize - 4;
4122 if (result != NULL) {
4123 if (_PyBytes_Resize(&result, nallocated) < 0)
4124 goto error;
4125 } else {
4126 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004127 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 goto error;
4129 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4130 }
4131 p = PyBytes_AS_STRING(result) + offset;
4132 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 if (PyBytes_Check(rep)) {
4135 char *prep = PyBytes_AS_STRING(rep);
4136 for(k = repsize; k > 0; k--)
4137 *p++ = *prep++;
4138 } else /* rep is unicode */ {
4139 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4140 Py_UNICODE c;
4141
4142 for(k=0; k<repsize; k++) {
4143 c = prep[k];
4144 if (0x80 <= c) {
4145 raise_encode_exception(&exc, "utf-8",
4146 PyUnicode_AS_UNICODE(unicode),
4147 size, i-1, i,
4148 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004149 goto error;
4150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004152 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004155 } else if (ch < 0x10000) {
4156 *p++ = (char)(0xe0 | (ch >> 12));
4157 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4158 *p++ = (char)(0x80 | (ch & 0x3f));
4159 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004160 /* Encode UCS4 Unicode ordinals */
4161 *p++ = (char)(0xf0 | (ch >> 18));
4162 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4163 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4164 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165#if SIZEOF_WCHAR_T == 2
4166 wchar_offset++;
4167#endif
Tim Peters602f7402002-04-27 18:03:26 +00004168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004170
Guido van Rossum98297ee2007-11-06 21:34:58 +00004171 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004172 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004173 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004174 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004175 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004176 }
4177 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004178 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004179 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004180 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004181 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004184 Py_XDECREF(errorHandler);
4185 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004186 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004187 error:
4188 Py_XDECREF(errorHandler);
4189 Py_XDECREF(exc);
4190 Py_XDECREF(result);
4191 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004192
Tim Peters602f7402002-04-27 18:03:26 +00004193#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194}
4195
Alexander Belopolsky40018472011-02-26 01:02:56 +00004196PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4198 Py_ssize_t size,
4199 const char *errors)
4200{
4201 PyObject *v, *unicode;
4202
4203 unicode = PyUnicode_FromUnicode(s, size);
4204 if (unicode == NULL)
4205 return NULL;
4206 v = _PyUnicode_AsUTF8String(unicode, errors);
4207 Py_DECREF(unicode);
4208 return v;
4209}
4210
4211PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004212PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215}
4216
Walter Dörwald41980ca2007-08-16 21:55:45 +00004217/* --- UTF-32 Codec ------------------------------------------------------- */
4218
4219PyObject *
4220PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 Py_ssize_t size,
4222 const char *errors,
4223 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004224{
4225 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4226}
4227
4228PyObject *
4229PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 Py_ssize_t size,
4231 const char *errors,
4232 int *byteorder,
4233 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004234{
4235 const char *starts = s;
4236 Py_ssize_t startinpos;
4237 Py_ssize_t endinpos;
4238 Py_ssize_t outpos;
4239 PyUnicodeObject *unicode;
4240 Py_UNICODE *p;
4241#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004242 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004243 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004244#else
4245 const int pairs = 0;
4246#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004247 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004248 int bo = 0; /* assume native ordering by default */
4249 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004250 /* Offsets from q for retrieving bytes in the right order. */
4251#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4252 int iorder[] = {0, 1, 2, 3};
4253#else
4254 int iorder[] = {3, 2, 1, 0};
4255#endif
4256 PyObject *errorHandler = NULL;
4257 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004258
Walter Dörwald41980ca2007-08-16 21:55:45 +00004259 q = (unsigned char *)s;
4260 e = q + size;
4261
4262 if (byteorder)
4263 bo = *byteorder;
4264
4265 /* Check for BOM marks (U+FEFF) in the input and adjust current
4266 byte order setting accordingly. In native mode, the leading BOM
4267 mark is skipped, in all other modes, it is copied to the output
4268 stream as-is (giving a ZWNBSP character). */
4269 if (bo == 0) {
4270 if (size >= 4) {
4271 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 if (bom == 0x0000FEFF) {
4275 q += 4;
4276 bo = -1;
4277 }
4278 else if (bom == 0xFFFE0000) {
4279 q += 4;
4280 bo = 1;
4281 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004282#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 if (bom == 0x0000FEFF) {
4284 q += 4;
4285 bo = 1;
4286 }
4287 else if (bom == 0xFFFE0000) {
4288 q += 4;
4289 bo = -1;
4290 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004293 }
4294
4295 if (bo == -1) {
4296 /* force LE */
4297 iorder[0] = 0;
4298 iorder[1] = 1;
4299 iorder[2] = 2;
4300 iorder[3] = 3;
4301 }
4302 else if (bo == 1) {
4303 /* force BE */
4304 iorder[0] = 3;
4305 iorder[1] = 2;
4306 iorder[2] = 1;
4307 iorder[3] = 0;
4308 }
4309
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004310 /* On narrow builds we split characters outside the BMP into two
4311 codepoints => count how much extra space we need. */
4312#ifndef Py_UNICODE_WIDE
4313 for (qq = q; qq < e; qq += 4)
4314 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4315 pairs++;
4316#endif
4317
4318 /* This might be one to much, because of a BOM */
4319 unicode = _PyUnicode_New((size+3)/4+pairs);
4320 if (!unicode)
4321 return NULL;
4322 if (size == 0)
4323 return (PyObject *)unicode;
4324
4325 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004326 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004327
Walter Dörwald41980ca2007-08-16 21:55:45 +00004328 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 Py_UCS4 ch;
4330 /* remaining bytes at the end? (size should be divisible by 4) */
4331 if (e-q<4) {
4332 if (consumed)
4333 break;
4334 errmsg = "truncated data";
4335 startinpos = ((const char *)q)-starts;
4336 endinpos = ((const char *)e)-starts;
4337 goto utf32Error;
4338 /* The remaining input chars are ignored if the callback
4339 chooses to skip the input */
4340 }
4341 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4342 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 if (ch >= 0x110000)
4345 {
4346 errmsg = "codepoint not in range(0x110000)";
4347 startinpos = ((const char *)q)-starts;
4348 endinpos = startinpos+4;
4349 goto utf32Error;
4350 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004351#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 if (ch >= 0x10000)
4353 {
4354 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4355 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4356 }
4357 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 *p++ = ch;
4360 q += 4;
4361 continue;
4362 utf32Error:
4363 outpos = p-PyUnicode_AS_UNICODE(unicode);
4364 if (unicode_decode_call_errorhandler(
4365 errors, &errorHandler,
4366 "utf32", errmsg,
4367 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4368 &unicode, &outpos, &p))
4369 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370 }
4371
4372 if (byteorder)
4373 *byteorder = bo;
4374
4375 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004377
4378 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004380 goto onError;
4381
4382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004384 if (PyUnicode_READY(unicode) == -1) {
4385 Py_DECREF(unicode);
4386 return NULL;
4387 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004388 return (PyObject *)unicode;
4389
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004391 Py_DECREF(unicode);
4392 Py_XDECREF(errorHandler);
4393 Py_XDECREF(exc);
4394 return NULL;
4395}
4396
4397PyObject *
4398PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 Py_ssize_t size,
4400 const char *errors,
4401 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004402{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004403 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004404 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004405 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004406#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004407 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004408#else
4409 const int pairs = 0;
4410#endif
4411 /* Offsets from p for storing byte pairs in the right order. */
4412#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4413 int iorder[] = {0, 1, 2, 3};
4414#else
4415 int iorder[] = {3, 2, 1, 0};
4416#endif
4417
Benjamin Peterson29060642009-01-31 22:14:21 +00004418#define STORECHAR(CH) \
4419 do { \
4420 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4421 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4422 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4423 p[iorder[0]] = (CH) & 0xff; \
4424 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004425 } while(0)
4426
4427 /* In narrow builds we can output surrogate pairs as one codepoint,
4428 so we need less space. */
4429#ifndef Py_UNICODE_WIDE
4430 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4432 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4433 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004434#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004435 nsize = (size - pairs + (byteorder == 0));
4436 bytesize = nsize * 4;
4437 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004439 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004440 if (v == NULL)
4441 return NULL;
4442
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004443 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004444 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004446 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004447 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004448
4449 if (byteorder == -1) {
4450 /* force LE */
4451 iorder[0] = 0;
4452 iorder[1] = 1;
4453 iorder[2] = 2;
4454 iorder[3] = 3;
4455 }
4456 else if (byteorder == 1) {
4457 /* force BE */
4458 iorder[0] = 3;
4459 iorder[1] = 2;
4460 iorder[2] = 1;
4461 iorder[3] = 0;
4462 }
4463
4464 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004466#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4468 Py_UCS4 ch2 = *s;
4469 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4470 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4471 s++;
4472 size--;
4473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004474 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004475#endif
4476 STORECHAR(ch);
4477 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004478
4479 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004480 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004481#undef STORECHAR
4482}
4483
Alexander Belopolsky40018472011-02-26 01:02:56 +00004484PyObject *
4485PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004486{
4487 if (!PyUnicode_Check(unicode)) {
4488 PyErr_BadArgument();
4489 return NULL;
4490 }
4491 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 PyUnicode_GET_SIZE(unicode),
4493 NULL,
4494 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004495}
4496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497/* --- UTF-16 Codec ------------------------------------------------------- */
4498
Tim Peters772747b2001-08-09 22:21:55 +00004499PyObject *
4500PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 Py_ssize_t size,
4502 const char *errors,
4503 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504{
Walter Dörwald69652032004-09-07 20:24:22 +00004505 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4506}
4507
Antoine Pitrouab868312009-01-10 15:40:25 +00004508/* Two masks for fast checking of whether a C 'long' may contain
4509 UTF16-encoded surrogate characters. This is an efficient heuristic,
4510 assuming that non-surrogate characters with a code point >= 0x8000 are
4511 rare in most input.
4512 FAST_CHAR_MASK is used when the input is in native byte ordering,
4513 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004514*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004515#if (SIZEOF_LONG == 8)
4516# define FAST_CHAR_MASK 0x8000800080008000L
4517# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4518#elif (SIZEOF_LONG == 4)
4519# define FAST_CHAR_MASK 0x80008000L
4520# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4521#else
4522# error C 'long' size should be either 4 or 8!
4523#endif
4524
Walter Dörwald69652032004-09-07 20:24:22 +00004525PyObject *
4526PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 Py_ssize_t size,
4528 const char *errors,
4529 int *byteorder,
4530 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004533 Py_ssize_t startinpos;
4534 Py_ssize_t endinpos;
4535 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 PyUnicodeObject *unicode;
4537 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004538 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004539 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004540 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004541 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004542 /* Offsets from q for retrieving byte pairs in the right order. */
4543#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4544 int ihi = 1, ilo = 0;
4545#else
4546 int ihi = 0, ilo = 1;
4547#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
4551 /* Note: size will always be longer than the resulting Unicode
4552 character count */
4553 unicode = _PyUnicode_New(size);
4554 if (!unicode)
4555 return NULL;
4556 if (size == 0)
4557 return (PyObject *)unicode;
4558
4559 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004560 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004561 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004562 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563
4564 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004565 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004567 /* Check for BOM marks (U+FEFF) in the input and adjust current
4568 byte order setting accordingly. In native mode, the leading BOM
4569 mark is skipped, in all other modes, it is copied to the output
4570 stream as-is (giving a ZWNBSP character). */
4571 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004572 if (size >= 2) {
4573 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004574#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 if (bom == 0xFEFF) {
4576 q += 2;
4577 bo = -1;
4578 }
4579 else if (bom == 0xFFFE) {
4580 q += 2;
4581 bo = 1;
4582 }
Tim Petersced69f82003-09-16 20:30:58 +00004583#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 if (bom == 0xFEFF) {
4585 q += 2;
4586 bo = 1;
4587 }
4588 else if (bom == 0xFFFE) {
4589 q += 2;
4590 bo = -1;
4591 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004592#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595
Tim Peters772747b2001-08-09 22:21:55 +00004596 if (bo == -1) {
4597 /* force LE */
4598 ihi = 1;
4599 ilo = 0;
4600 }
4601 else if (bo == 1) {
4602 /* force BE */
4603 ihi = 0;
4604 ilo = 1;
4605 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004606#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4607 native_ordering = ilo < ihi;
4608#else
4609 native_ordering = ilo > ihi;
4610#endif
Tim Peters772747b2001-08-09 22:21:55 +00004611
Antoine Pitrouab868312009-01-10 15:40:25 +00004612 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004613 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004615 /* First check for possible aligned read of a C 'long'. Unaligned
4616 reads are more expensive, better to defer to another iteration. */
4617 if (!((size_t) q & LONG_PTR_MASK)) {
4618 /* Fast path for runs of non-surrogate chars. */
4619 register const unsigned char *_q = q;
4620 Py_UNICODE *_p = p;
4621 if (native_ordering) {
4622 /* Native ordering is simple: as long as the input cannot
4623 possibly contain a surrogate char, do an unrolled copy
4624 of several 16-bit code points to the target object.
4625 The non-surrogate check is done on several input bytes
4626 at a time (as many as a C 'long' can contain). */
4627 while (_q < aligned_end) {
4628 unsigned long data = * (unsigned long *) _q;
4629 if (data & FAST_CHAR_MASK)
4630 break;
4631 _p[0] = ((unsigned short *) _q)[0];
4632 _p[1] = ((unsigned short *) _q)[1];
4633#if (SIZEOF_LONG == 8)
4634 _p[2] = ((unsigned short *) _q)[2];
4635 _p[3] = ((unsigned short *) _q)[3];
4636#endif
4637 _q += SIZEOF_LONG;
4638 _p += SIZEOF_LONG / 2;
4639 }
4640 }
4641 else {
4642 /* Byteswapped ordering is similar, but we must decompose
4643 the copy bytewise, and take care of zero'ing out the
4644 upper bytes if the target object is in 32-bit units
4645 (that is, in UCS-4 builds). */
4646 while (_q < aligned_end) {
4647 unsigned long data = * (unsigned long *) _q;
4648 if (data & SWAPPED_FAST_CHAR_MASK)
4649 break;
4650 /* Zero upper bytes in UCS-4 builds */
4651#if (Py_UNICODE_SIZE > 2)
4652 _p[0] = 0;
4653 _p[1] = 0;
4654#if (SIZEOF_LONG == 8)
4655 _p[2] = 0;
4656 _p[3] = 0;
4657#endif
4658#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004659 /* Issue #4916; UCS-4 builds on big endian machines must
4660 fill the two last bytes of each 4-byte unit. */
4661#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4662# define OFF 2
4663#else
4664# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004665#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004666 ((unsigned char *) _p)[OFF + 1] = _q[0];
4667 ((unsigned char *) _p)[OFF + 0] = _q[1];
4668 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4669 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4670#if (SIZEOF_LONG == 8)
4671 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4672 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4673 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4674 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4675#endif
4676#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004677 _q += SIZEOF_LONG;
4678 _p += SIZEOF_LONG / 2;
4679 }
4680 }
4681 p = _p;
4682 q = _q;
4683 if (q >= e)
4684 break;
4685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687
Benjamin Peterson14339b62009-01-31 16:36:08 +00004688 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004689
4690 if (ch < 0xD800 || ch > 0xDFFF) {
4691 *p++ = ch;
4692 continue;
4693 }
4694
4695 /* UTF-16 code pair: */
4696 if (q > e) {
4697 errmsg = "unexpected end of data";
4698 startinpos = (((const char *)q) - 2) - starts;
4699 endinpos = ((const char *)e) + 1 - starts;
4700 goto utf16Error;
4701 }
4702 if (0xD800 <= ch && ch <= 0xDBFF) {
4703 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4704 q += 2;
4705 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004706#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 *p++ = ch;
4708 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004709#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004711#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 continue;
4713 }
4714 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004715 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 startinpos = (((const char *)q)-4)-starts;
4717 endinpos = startinpos+2;
4718 goto utf16Error;
4719 }
4720
Benjamin Peterson14339b62009-01-31 16:36:08 +00004721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 errmsg = "illegal encoding";
4723 startinpos = (((const char *)q)-2)-starts;
4724 endinpos = startinpos+2;
4725 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004726
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 utf16Error:
4728 outpos = p - PyUnicode_AS_UNICODE(unicode);
4729 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004730 errors,
4731 &errorHandler,
4732 "utf16", errmsg,
4733 &starts,
4734 (const char **)&e,
4735 &startinpos,
4736 &endinpos,
4737 &exc,
4738 (const char **)&q,
4739 &unicode,
4740 &outpos,
4741 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004744 /* remaining byte at the end? (size should be even) */
4745 if (e == q) {
4746 if (!consumed) {
4747 errmsg = "truncated data";
4748 startinpos = ((const char *)q) - starts;
4749 endinpos = ((const char *)e) + 1 - starts;
4750 outpos = p - PyUnicode_AS_UNICODE(unicode);
4751 if (unicode_decode_call_errorhandler(
4752 errors,
4753 &errorHandler,
4754 "utf16", errmsg,
4755 &starts,
4756 (const char **)&e,
4757 &startinpos,
4758 &endinpos,
4759 &exc,
4760 (const char **)&q,
4761 &unicode,
4762 &outpos,
4763 &p))
4764 goto onError;
4765 /* The remaining input chars are ignored if the callback
4766 chooses to skip the input */
4767 }
4768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769
4770 if (byteorder)
4771 *byteorder = bo;
4772
Walter Dörwald69652032004-09-07 20:24:22 +00004773 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004775
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 goto onError;
4779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 if (PyUnicode_READY(unicode) == -1) {
4783 Py_DECREF(unicode);
4784 return NULL;
4785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 return (PyObject *)unicode;
4787
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 Py_XDECREF(errorHandler);
4791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 return NULL;
4793}
4794
Antoine Pitrouab868312009-01-10 15:40:25 +00004795#undef FAST_CHAR_MASK
4796#undef SWAPPED_FAST_CHAR_MASK
4797
Tim Peters772747b2001-08-09 22:21:55 +00004798PyObject *
4799PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 Py_ssize_t size,
4801 const char *errors,
4802 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004804 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004805 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004806 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004807#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004808 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004809#else
4810 const int pairs = 0;
4811#endif
Tim Peters772747b2001-08-09 22:21:55 +00004812 /* Offsets from p for storing byte pairs in the right order. */
4813#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4814 int ihi = 1, ilo = 0;
4815#else
4816 int ihi = 0, ilo = 1;
4817#endif
4818
Benjamin Peterson29060642009-01-31 22:14:21 +00004819#define STORECHAR(CH) \
4820 do { \
4821 p[ihi] = ((CH) >> 8) & 0xff; \
4822 p[ilo] = (CH) & 0xff; \
4823 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004824 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004826#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004827 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 if (s[i] >= 0x10000)
4829 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004830#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004831 /* 2 * (size + pairs + (byteorder == 0)) */
4832 if (size > PY_SSIZE_T_MAX ||
4833 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004835 nsize = size + pairs + (byteorder == 0);
4836 bytesize = nsize * 2;
4837 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004839 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 if (v == NULL)
4841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004843 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004846 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004848
4849 if (byteorder == -1) {
4850 /* force LE */
4851 ihi = 1;
4852 ilo = 0;
4853 }
4854 else if (byteorder == 1) {
4855 /* force BE */
4856 ihi = 0;
4857 ilo = 1;
4858 }
4859
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004860 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_UNICODE ch = *s++;
4862 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004863#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 if (ch >= 0x10000) {
4865 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4866 ch = 0xD800 | ((ch-0x10000) >> 10);
4867 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004868#endif
Tim Peters772747b2001-08-09 22:21:55 +00004869 STORECHAR(ch);
4870 if (ch2)
4871 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004872 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004873
4874 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004875 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004876#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877}
4878
Alexander Belopolsky40018472011-02-26 01:02:56 +00004879PyObject *
4880PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881{
4882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 PyUnicode_GET_SIZE(unicode),
4888 NULL,
4889 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890}
4891
4892/* --- Unicode Escape Codec ----------------------------------------------- */
4893
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input still decodes to a pure ASCII string.

   s    -- input bytes (not required to be NUL-terminated)
   size -- number of bytes in s

   Returns the number of characters needed to hold the decoded string,
   or -1 if
     - size is negative,
     - the input contains a byte > 127,
     - the input ends in a lone backslash, or
     - an octal, x, u, U or N escape is present (these do not guarantee
       an ASCII result).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character, both are kept in the output */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4948
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  -- PyUnicode_WCHAR_KIND selects a Py_UNICODE store; any other kind
            stores a single unsigned char
   buf   -- start of the target buffer
   index -- element index to write (evaluated exactly once)
   value -- value to store, converted to the element type */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at buf[index].

   NOTE: this macro reads a variable named `kind` from the scope it is
   expanded in and asserts that it is PyUnicode_WCHAR_KIND.  The expansion is
   one fully parenthesized expression, so it stays a single unit wherever
   it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4962
4963
Fredrik Lundh06d12682001-01-24 07:59:11 +00004964static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004965
Alexander Belopolsky40018472011-02-26 01:02:56 +00004966PyObject *
4967PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004968 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t startinpos;
4973 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004974 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004978 char* message;
4979 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004980 PyObject *errorHandler = NULL;
4981 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004982 Py_ssize_t ascii_length;
4983 Py_ssize_t i;
4984 int kind;
4985 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004987 ascii_length = length_of_escaped_ascii_string(s, size);
4988
4989 /* After length_of_escaped_ascii_string() there are two alternatives,
4990 either the string is pure ASCII with named escapes like \n, etc.
4991 and we determined it's exact size (common case)
4992 or it contains \x, \u, ... escape sequences. then we create a
4993 legacy wchar string and resize it at the end of this function. */
4994 if (ascii_length >= 0) {
4995 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4996 if (!v)
4997 goto onError;
4998 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4999 kind = PyUnicode_1BYTE_KIND;
5000 data = PyUnicode_DATA(v);
5001 }
5002 else {
5003 /* Escaped strings will always be longer than the resulting
5004 Unicode string, so we start with size here and then reduce the
5005 length after conversion to the true value.
5006 (but if the error callback returns a long replacement string
5007 we'll have to allocate more space) */
5008 v = _PyUnicode_New(size);
5009 if (!v)
5010 goto onError;
5011 kind = PyUnicode_WCHAR_KIND;
5012 data = PyUnicode_AS_UNICODE(v);
5013 }
5014
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 if (size == 0)
5016 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 while (s < end) {
5021 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005022 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005025 if (kind == PyUnicode_WCHAR_KIND) {
5026 assert(i < _PyUnicode_WSTR_LENGTH(v));
5027 }
5028 else {
5029 /* The only case in which i == ascii_length is a backslash
5030 followed by a newline. */
5031 assert(i <= ascii_length);
5032 }
5033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 /* Non-escape characters are interpreted as Unicode ordinals */
5035 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 continue;
5038 }
5039
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 /* \ - Escapes */
5042 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005043 c = *s++;
5044 if (s > end)
5045 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005046
5047 if (kind == PyUnicode_WCHAR_KIND) {
5048 assert(i < _PyUnicode_WSTR_LENGTH(v));
5049 }
5050 else {
5051 /* The only case in which i == ascii_length is a backslash
5052 followed by a newline. */
5053 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5054 }
5055
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005056 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5061 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5062 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5063 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5064 /* FF */
5065 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5066 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5067 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5068 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5069 /* VT */
5070 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5071 /* BEL, not classic C */
5072 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 case '0': case '1': case '2': case '3':
5076 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005077 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005078 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005079 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005080 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005081 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 break;
5085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 /* hex escapes */
5087 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005089 digits = 2;
5090 message = "truncated \\xXX escape";
5091 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005095 digits = 4;
5096 message = "truncated \\uXXXX escape";
5097 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005100 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005101 digits = 8;
5102 message = "truncated \\UXXXXXXXX escape";
5103 hexescape:
5104 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005105 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 if (s+digits>end) {
5107 endinpos = size;
5108 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 errors, &errorHandler,
5110 "unicodeescape", "end of string in escape sequence",
5111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005114 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 goto nextByte;
5116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 for (j = 0; j < digits; ++j) {
5118 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005119 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 endinpos = (s+j+1)-starts;
5121 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 errors, &errorHandler,
5124 "unicodeescape", message,
5125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005127 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005130 }
5131 chr = (chr<<4) & ~0xF;
5132 if (c >= '0' && c <= '9')
5133 chr += c - '0';
5134 else if (c >= 'a' && c <= 'f')
5135 chr += 10 + c - 'a';
5136 else
5137 chr += 10 + c - 'A';
5138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005140 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 /* _decoding_error will have already written into the
5142 target buffer. */
5143 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005144 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005145 /* when we get here, chr is a 32-bit unicode character */
5146 if (chr <= 0xffff)
5147 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005149 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005150 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005151 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005152#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005154#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005155 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005156 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5157 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005158#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005159 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005162 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 errors, &errorHandler,
5164 "unicodeescape", "illegal Unicode character",
5165 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005167 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005169 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005170 break;
5171
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005173 case 'N':
5174 message = "malformed \\N character escape";
5175 if (ucnhash_CAPI == NULL) {
5176 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5178 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005179 if (ucnhash_CAPI == NULL)
5180 goto ucnhashError;
5181 }
5182 if (*s == '{') {
5183 const char *start = s+1;
5184 /* look for the closing brace */
5185 while (*s != '}' && s < end)
5186 s++;
5187 if (s > start && s < end && *s == '}') {
5188 /* found a name. look it up in the unicode database */
5189 message = "unknown Unicode character name";
5190 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005191 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5192 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005193 goto store;
5194 }
5195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005197 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 errors, &errorHandler,
5200 "unicodeescape", message,
5201 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005202 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005203 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005204 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005205 break;
5206
5207 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005208 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005209 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 message = "\\ at end of string";
5211 s--;
5212 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 errors, &errorHandler,
5216 "unicodeescape", message,
5217 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005218 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005219 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005221 }
5222 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005223 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5224 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005225 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005226 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005231 /* Ensure the length prediction worked in case of ASCII strings */
5232 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5233
5234 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5235 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005237 Py_XDECREF(errorHandler);
5238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005240
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005242 PyErr_SetString(
5243 PyExc_UnicodeError,
5244 "\\N escapes not supported (can't load unicodedata module)"
5245 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005246 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005249 return NULL;
5250
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 Py_XDECREF(errorHandler);
5254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 return NULL;
5256}
5257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005258#undef WRITE_ASCII_OR_WSTR
5259#undef WRITE_WSTR
5260
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261/* Return a Unicode-Escape string version of the Unicode object.
5262
5263 If quotes is true, the string is enclosed in u"" or u'' quotes as
5264 appropriate.
5265
5266*/
5267
Walter Dörwald79e913e2007-05-12 11:08:06 +00005268static const char *hexdigits = "0123456789abcdef";
5269
Alexander Belopolsky40018472011-02-26 01:02:56 +00005270PyObject *
5271PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005272 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005274 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005277#ifdef Py_UNICODE_WIDE
5278 const Py_ssize_t expandsize = 10;
5279#else
5280 const Py_ssize_t expandsize = 6;
5281#endif
5282
Thomas Wouters89f507f2006-12-13 04:49:30 +00005283 /* XXX(nnorwitz): rather than over-allocating, it would be
5284 better to choose a different scheme. Perhaps scan the
5285 first N-chars of the string and allocate based on that size.
5286 */
5287 /* Initial allocation is based on the longest-possible unichr
5288 escape.
5289
5290 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5291 unichr, so in this case it's the longest unichr escape. In
5292 narrow (UTF-16) builds this is five chars per source unichr
5293 since there are two unichrs in the surrogate pair, so in narrow
5294 (UTF-16) builds it's not the longest unichr escape.
5295
5296 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5297 so in the narrow (UTF-16) build case it's the longest unichr
5298 escape.
5299 */
5300
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005301 if (size == 0)
5302 return PyBytes_FromStringAndSize(NULL, 0);
5303
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005304 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005306
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005307 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 2
5309 + expandsize*size
5310 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 if (repr == NULL)
5312 return NULL;
5313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005314 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 while (size-- > 0) {
5317 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005318
Walter Dörwald79e913e2007-05-12 11:08:06 +00005319 /* Escape backslashes */
5320 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 *p++ = '\\';
5322 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005323 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005324 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005325
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005326#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005327 /* Map 21-bit characters to '\U00xxxxxx' */
5328 else if (ch >= 0x10000) {
5329 *p++ = '\\';
5330 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005331 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5332 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5333 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5334 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5335 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5336 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5337 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5338 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005340 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005341#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5343 else if (ch >= 0xD800 && ch < 0xDC00) {
5344 Py_UNICODE ch2;
5345 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 ch2 = *s++;
5348 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005349 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5351 *p++ = '\\';
5352 *p++ = 'U';
5353 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5354 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5355 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5356 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5357 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5358 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5359 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5360 *p++ = hexdigits[ucs & 0x0000000F];
5361 continue;
5362 }
5363 /* Fall through: isolated surrogates are copied as-is */
5364 s--;
5365 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005367#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005368
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005370 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 *p++ = '\\';
5372 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005373 *p++ = hexdigits[(ch >> 12) & 0x000F];
5374 *p++ = hexdigits[(ch >> 8) & 0x000F];
5375 *p++ = hexdigits[(ch >> 4) & 0x000F];
5376 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005378
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005379 /* Map special whitespace to '\t', \n', '\r' */
5380 else if (ch == '\t') {
5381 *p++ = '\\';
5382 *p++ = 't';
5383 }
5384 else if (ch == '\n') {
5385 *p++ = '\\';
5386 *p++ = 'n';
5387 }
5388 else if (ch == '\r') {
5389 *p++ = '\\';
5390 *p++ = 'r';
5391 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005392
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005393 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005394 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005396 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005397 *p++ = hexdigits[(ch >> 4) & 0x000F];
5398 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005399 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 /* Copy everything else as-is */
5402 else
5403 *p++ = (char) ch;
5404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005406 assert(p - PyBytes_AS_STRING(repr) > 0);
5407 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5408 return NULL;
5409 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410}
5411
Alexander Belopolsky40018472011-02-26 01:02:56 +00005412PyObject *
5413PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005415 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 if (!PyUnicode_Check(unicode)) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005420 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5421 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005422 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423}
5424
5425/* --- Raw Unicode Escape Codec ------------------------------------------- */
5426
Alexander Belopolsky40018472011-02-26 01:02:56 +00005427PyObject *
5428PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005429 Py_ssize_t size,
5430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005433 Py_ssize_t startinpos;
5434 Py_ssize_t endinpos;
5435 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 const char *end;
5439 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 PyObject *errorHandler = NULL;
5441 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 /* Escaped strings will always be longer than the resulting
5444 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 length after conversion to the true value. (But decoding error
5446 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 v = _PyUnicode_New(size);
5448 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 end = s + size;
5454 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 unsigned char c;
5456 Py_UCS4 x;
5457 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005458 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 /* Non-escape characters are interpreted as Unicode ordinals */
5461 if (*s != '\\') {
5462 *p++ = (unsigned char)*s++;
5463 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005464 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 startinpos = s-starts;
5466
5467 /* \u-escapes are only interpreted iff the number of leading
5468 backslashes if odd */
5469 bs = s;
5470 for (;s < end;) {
5471 if (*s != '\\')
5472 break;
5473 *p++ = (unsigned char)*s++;
5474 }
5475 if (((s - bs) & 1) == 0 ||
5476 s >= end ||
5477 (*s != 'u' && *s != 'U')) {
5478 continue;
5479 }
5480 p--;
5481 count = *s=='u' ? 4 : 8;
5482 s++;
5483
5484 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5485 outpos = p-PyUnicode_AS_UNICODE(v);
5486 for (x = 0, i = 0; i < count; ++i, ++s) {
5487 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005488 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 endinpos = s-starts;
5490 if (unicode_decode_call_errorhandler(
5491 errors, &errorHandler,
5492 "rawunicodeescape", "truncated \\uXXXX",
5493 &starts, &end, &startinpos, &endinpos, &exc, &s,
5494 &v, &outpos, &p))
5495 goto onError;
5496 goto nextByte;
5497 }
5498 x = (x<<4) & ~0xF;
5499 if (c >= '0' && c <= '9')
5500 x += c - '0';
5501 else if (c >= 'a' && c <= 'f')
5502 x += 10 + c - 'a';
5503 else
5504 x += 10 + c - 'A';
5505 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005506 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 /* UCS-2 character */
5508 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005509 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 /* UCS-4 character. Either store directly, or as
5511 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005512#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005514#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 x -= 0x10000L;
5516 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5517 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005518#endif
5519 } else {
5520 endinpos = s-starts;
5521 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005522 if (unicode_decode_call_errorhandler(
5523 errors, &errorHandler,
5524 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 &starts, &end, &startinpos, &endinpos, &exc, &s,
5526 &v, &outpos, &p))
5527 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 nextByte:
5530 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005532 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005534 Py_XDECREF(errorHandler);
5535 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 if (PyUnicode_READY(v) == -1) {
5537 Py_DECREF(v);
5538 return NULL;
5539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005541
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 Py_XDECREF(errorHandler);
5545 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 return NULL;
5547}
5548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005551 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005553 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 char *p;
5555 char *q;
5556
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005557#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005558 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005559#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005560 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005561#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005562
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005563 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005565
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005566 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 if (repr == NULL)
5568 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005569 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005570 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005572 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 while (size-- > 0) {
5574 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005575#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* Map 32-bit characters to '\Uxxxxxxxx' */
5577 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005578 *p++ = '\\';
5579 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005580 *p++ = hexdigits[(ch >> 28) & 0xf];
5581 *p++ = hexdigits[(ch >> 24) & 0xf];
5582 *p++ = hexdigits[(ch >> 20) & 0xf];
5583 *p++ = hexdigits[(ch >> 16) & 0xf];
5584 *p++ = hexdigits[(ch >> 12) & 0xf];
5585 *p++ = hexdigits[(ch >> 8) & 0xf];
5586 *p++ = hexdigits[(ch >> 4) & 0xf];
5587 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005588 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005589 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005590#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5592 if (ch >= 0xD800 && ch < 0xDC00) {
5593 Py_UNICODE ch2;
5594 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005595
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ch2 = *s++;
5597 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005598 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5600 *p++ = '\\';
5601 *p++ = 'U';
5602 *p++ = hexdigits[(ucs >> 28) & 0xf];
5603 *p++ = hexdigits[(ucs >> 24) & 0xf];
5604 *p++ = hexdigits[(ucs >> 20) & 0xf];
5605 *p++ = hexdigits[(ucs >> 16) & 0xf];
5606 *p++ = hexdigits[(ucs >> 12) & 0xf];
5607 *p++ = hexdigits[(ucs >> 8) & 0xf];
5608 *p++ = hexdigits[(ucs >> 4) & 0xf];
5609 *p++ = hexdigits[ucs & 0xf];
5610 continue;
5611 }
5612 /* Fall through: isolated surrogates are copied as-is */
5613 s--;
5614 size++;
5615 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005616#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* Map 16-bit characters to '\uxxxx' */
5618 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 *p++ = '\\';
5620 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005621 *p++ = hexdigits[(ch >> 12) & 0xf];
5622 *p++ = hexdigits[(ch >> 8) & 0xf];
5623 *p++ = hexdigits[(ch >> 4) & 0xf];
5624 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* Copy everything else as-is */
5627 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 *p++ = (char) ch;
5629 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005630 size = p - q;
5631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005632 assert(size > 0);
5633 if (_PyBytes_Resize(&repr, size) < 0)
5634 return NULL;
5635 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636}
5637
Alexander Belopolsky40018472011-02-26 01:02:56 +00005638PyObject *
5639PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005641 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005643 PyErr_BadArgument();
5644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005646 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5647 PyUnicode_GET_SIZE(unicode));
5648
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005649 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650}
5651
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005652/* --- Unicode Internal Codec ------------------------------------------- */
5653
Alexander Belopolsky40018472011-02-26 01:02:56 +00005654PyObject *
5655_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005656 Py_ssize_t size,
5657 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005658{
5659 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 Py_ssize_t startinpos;
5661 Py_ssize_t endinpos;
5662 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005663 PyUnicodeObject *v;
5664 Py_UNICODE *p;
5665 const char *end;
5666 const char *reason;
5667 PyObject *errorHandler = NULL;
5668 PyObject *exc = NULL;
5669
Neal Norwitzd43069c2006-01-08 01:12:10 +00005670#ifdef Py_UNICODE_WIDE
5671 Py_UNICODE unimax = PyUnicode_GetMax();
5672#endif
5673
Thomas Wouters89f507f2006-12-13 04:49:30 +00005674 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005675 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5676 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5679 as string was created with the old API. */
5680 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005682 p = PyUnicode_AS_UNICODE(v);
5683 end = s + size;
5684
5685 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005686 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005687 /* We have to sanity check the raw data, otherwise doom looms for
5688 some malformed UCS-4 data. */
5689 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005690#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005691 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005692#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005693 end-s < Py_UNICODE_SIZE
5694 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005696 startinpos = s - starts;
5697 if (end-s < Py_UNICODE_SIZE) {
5698 endinpos = end-starts;
5699 reason = "truncated input";
5700 }
5701 else {
5702 endinpos = s - starts + Py_UNICODE_SIZE;
5703 reason = "illegal code point (> 0x10FFFF)";
5704 }
5705 outpos = p - PyUnicode_AS_UNICODE(v);
5706 if (unicode_decode_call_errorhandler(
5707 errors, &errorHandler,
5708 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005709 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005710 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005711 goto onError;
5712 }
5713 }
5714 else {
5715 p++;
5716 s += Py_UNICODE_SIZE;
5717 }
5718 }
5719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005720 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005721 goto onError;
5722 Py_XDECREF(errorHandler);
5723 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005724 if (PyUnicode_READY(v) == -1) {
5725 Py_DECREF(v);
5726 return NULL;
5727 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005728 return (PyObject *)v;
5729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005731 Py_XDECREF(v);
5732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
5734 return NULL;
5735}
5736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737/* --- Latin-1 Codec ------------------------------------------------------ */
5738
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739PyObject *
5740PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005741 Py_ssize_t size,
5742 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005745 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746}
5747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749static void
5750make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005751 const char *encoding,
5752 const Py_UNICODE *unicode, Py_ssize_t size,
5753 Py_ssize_t startpos, Py_ssize_t endpos,
5754 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005757 *exceptionObject = PyUnicodeEncodeError_Create(
5758 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 }
5760 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5762 goto onError;
5763 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5764 goto onError;
5765 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5766 goto onError;
5767 return;
5768 onError:
5769 Py_DECREF(*exceptionObject);
5770 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 }
5772}
5773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005775static void
5776raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005777 const char *encoding,
5778 const Py_UNICODE *unicode, Py_ssize_t size,
5779 Py_ssize_t startpos, Py_ssize_t endpos,
5780 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781{
5782 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786}
5787
5788/* error handling callback helper:
5789 build arguments, call the callback and check the arguments,
5790 put the result into newpos and return the replacement string, which
5791 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005792static PyObject *
5793unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005794 PyObject **errorHandler,
5795 const char *encoding, const char *reason,
5796 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5797 Py_ssize_t startpos, Py_ssize_t endpos,
5798 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005800 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801
5802 PyObject *restuple;
5803 PyObject *resunicode;
5804
5805 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 }
5810
5811 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815
5816 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005821 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 Py_DECREF(restuple);
5823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005825 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 &resunicode, newpos)) {
5827 Py_DECREF(restuple);
5828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005830 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5831 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5832 Py_DECREF(restuple);
5833 return NULL;
5834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005837 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5839 Py_DECREF(restuple);
5840 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 Py_INCREF(resunicode);
5843 Py_DECREF(restuple);
5844 return resunicode;
5845}
5846
Alexander Belopolsky40018472011-02-26 01:02:56 +00005847static PyObject *
5848unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005849 Py_ssize_t size,
5850 const char *errors,
5851 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852{
5853 /* output object */
5854 PyObject *res;
5855 /* pointers to the beginning and end+1 of input */
5856 const Py_UNICODE *startp = p;
5857 const Py_UNICODE *endp = p + size;
5858 /* pointer to the beginning of the unencodable characters */
5859 /* const Py_UNICODE *badp = NULL; */
5860 /* pointer into the output */
5861 char *str;
5862 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005864 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5865 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 PyObject *errorHandler = NULL;
5867 PyObject *exc = NULL;
5868 /* the following variable is used for caching string comparisons
5869 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5870 int known_errorHandler = -1;
5871
5872 /* allocate enough for a simple encoding without
5873 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005874 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005875 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005878 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005879 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 ressize = size;
5881
5882 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 /* can we encode this? */
5886 if (c<limit) {
5887 /* no overflow check, because we know that the space is enough */
5888 *str++ = (char)c;
5889 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 else {
5892 Py_ssize_t unicodepos = p-startp;
5893 Py_ssize_t requiredsize;
5894 PyObject *repunicode;
5895 Py_ssize_t repsize;
5896 Py_ssize_t newpos;
5897 Py_ssize_t respos;
5898 Py_UNICODE *uni2;
5899 /* startpos for collecting unencodable chars */
5900 const Py_UNICODE *collstart = p;
5901 const Py_UNICODE *collend = p;
5902 /* find all unecodable characters */
5903 while ((collend < endp) && ((*collend)>=limit))
5904 ++collend;
5905 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5906 if (known_errorHandler==-1) {
5907 if ((errors==NULL) || (!strcmp(errors, "strict")))
5908 known_errorHandler = 1;
5909 else if (!strcmp(errors, "replace"))
5910 known_errorHandler = 2;
5911 else if (!strcmp(errors, "ignore"))
5912 known_errorHandler = 3;
5913 else if (!strcmp(errors, "xmlcharrefreplace"))
5914 known_errorHandler = 4;
5915 else
5916 known_errorHandler = 0;
5917 }
5918 switch (known_errorHandler) {
5919 case 1: /* strict */
5920 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5921 goto onError;
5922 case 2: /* replace */
5923 while (collstart++<collend)
5924 *str++ = '?'; /* fall through */
5925 case 3: /* ignore */
5926 p = collend;
5927 break;
5928 case 4: /* xmlcharrefreplace */
5929 respos = str - PyBytes_AS_STRING(res);
5930 /* determine replacement size (temporarily (mis)uses p) */
5931 for (p = collstart, repsize = 0; p < collend; ++p) {
5932 if (*p<10)
5933 repsize += 2+1+1;
5934 else if (*p<100)
5935 repsize += 2+2+1;
5936 else if (*p<1000)
5937 repsize += 2+3+1;
5938 else if (*p<10000)
5939 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005940#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 else
5942 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005943#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 else if (*p<100000)
5945 repsize += 2+5+1;
5946 else if (*p<1000000)
5947 repsize += 2+6+1;
5948 else
5949 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005950#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 }
5952 requiredsize = respos+repsize+(endp-collend);
5953 if (requiredsize > ressize) {
5954 if (requiredsize<2*ressize)
5955 requiredsize = 2*ressize;
5956 if (_PyBytes_Resize(&res, requiredsize))
5957 goto onError;
5958 str = PyBytes_AS_STRING(res) + respos;
5959 ressize = requiredsize;
5960 }
5961 /* generate replacement (temporarily (mis)uses p) */
5962 for (p = collstart; p < collend; ++p) {
5963 str += sprintf(str, "&#%d;", (int)*p);
5964 }
5965 p = collend;
5966 break;
5967 default:
5968 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5969 encoding, reason, startp, size, &exc,
5970 collstart-startp, collend-startp, &newpos);
5971 if (repunicode == NULL)
5972 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005973 if (PyBytes_Check(repunicode)) {
5974 /* Directly copy bytes result to output. */
5975 repsize = PyBytes_Size(repunicode);
5976 if (repsize > 1) {
5977 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005978 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005979 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5980 Py_DECREF(repunicode);
5981 goto onError;
5982 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005983 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005984 ressize += repsize-1;
5985 }
5986 memcpy(str, PyBytes_AsString(repunicode), repsize);
5987 str += repsize;
5988 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005989 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005990 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005991 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* need more space? (at least enough for what we
5993 have+the replacement+the rest of the string, so
5994 we won't have to check space for encodable characters) */
5995 respos = str - PyBytes_AS_STRING(res);
5996 repsize = PyUnicode_GET_SIZE(repunicode);
5997 requiredsize = respos+repsize+(endp-collend);
5998 if (requiredsize > ressize) {
5999 if (requiredsize<2*ressize)
6000 requiredsize = 2*ressize;
6001 if (_PyBytes_Resize(&res, requiredsize)) {
6002 Py_DECREF(repunicode);
6003 goto onError;
6004 }
6005 str = PyBytes_AS_STRING(res) + respos;
6006 ressize = requiredsize;
6007 }
6008 /* check if there is anything unencodable in the replacement
6009 and copy it to the output */
6010 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6011 c = *uni2;
6012 if (c >= limit) {
6013 raise_encode_exception(&exc, encoding, startp, size,
6014 unicodepos, unicodepos+1, reason);
6015 Py_DECREF(repunicode);
6016 goto onError;
6017 }
6018 *str = (char)c;
6019 }
6020 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006021 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006022 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006023 }
6024 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 /* Resize if we allocated to much */
6026 size = str - PyBytes_AS_STRING(res);
6027 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006028 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006029 if (_PyBytes_Resize(&res, size) < 0)
6030 goto onError;
6031 }
6032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 return res;
6036
6037 onError:
6038 Py_XDECREF(res);
6039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
6041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042}
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
6045PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 Py_ssize_t size,
6047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Alexander Belopolsky40018472011-02-26 01:02:56 +00006052PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
6055 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 PyErr_BadArgument();
6057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 if (PyUnicode_READY(unicode) == -1)
6060 return NULL;
6061 /* Fast path: if it is a one-byte string, construct
6062 bytes object directly. */
6063 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6064 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6065 PyUnicode_GET_LENGTH(unicode));
6066 /* Non-Latin-1 characters present. Defer to above function to
6067 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006070 errors);
6071}
6072
6073PyObject*
6074PyUnicode_AsLatin1String(PyObject *unicode)
6075{
6076 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
6079/* --- 7-bit ASCII Codec -------------------------------------------------- */
6080
Alexander Belopolsky40018472011-02-26 01:02:56 +00006081PyObject *
6082PyUnicode_DecodeASCII(const char *s,
6083 Py_ssize_t size,
6084 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 PyUnicodeObject *v;
6088 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006089 Py_ssize_t startinpos;
6090 Py_ssize_t endinpos;
6091 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006093 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 PyObject *errorHandler = NULL;
6095 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006097
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006099 if (size == 1 && *(unsigned char*)s < 128)
6100 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6101
6102 /* Fast path. Assume the input actually *is* ASCII, and allocate
6103 a single-block Unicode object with that assumption. If there is
6104 an error, drop the object and start over. */
6105 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6106 if (v == NULL)
6107 goto onError;
6108 d = PyUnicode_1BYTE_DATA(v);
6109 for (i = 0; i < size; i++) {
6110 unsigned char ch = ((unsigned char*)s)[i];
6111 if (ch < 128)
6112 d[i] = ch;
6113 else
6114 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006116 if (i == size)
6117 return (PyObject*)v;
6118 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006119
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 v = _PyUnicode_New(size);
6121 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 e = s + size;
6127 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 register unsigned char c = (unsigned char)*s;
6129 if (c < 128) {
6130 *p++ = c;
6131 ++s;
6132 }
6133 else {
6134 startinpos = s-starts;
6135 endinpos = startinpos + 1;
6136 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6137 if (unicode_decode_call_errorhandler(
6138 errors, &errorHandler,
6139 "ascii", "ordinal not in range(128)",
6140 &starts, &e, &startinpos, &endinpos, &exc, &s,
6141 &v, &outpos, &p))
6142 goto onError;
6143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006145 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6147 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 Py_XDECREF(errorHandler);
6149 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006150 if (PyUnicode_READY(v) == -1) {
6151 Py_DECREF(v);
6152 return NULL;
6153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006155
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_XDECREF(errorHandler);
6159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return NULL;
6161}
6162
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163PyObject *
6164PyUnicode_EncodeASCII(const Py_UNICODE *p,
6165 Py_ssize_t size,
6166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169}
6170
Alexander Belopolsky40018472011-02-26 01:02:56 +00006171PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006172_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
6174 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 PyErr_BadArgument();
6176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006178 if (PyUnicode_READY(unicode) == -1)
6179 return NULL;
6180 /* Fast path: if it is an ASCII-only string, construct bytes object
6181 directly. Else defer to above function to raise the exception. */
6182 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6183 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6184 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006187 errors);
6188}
6189
6190PyObject *
6191PyUnicode_AsASCIIString(PyObject *unicode)
6192{
6193 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194}
6195
Victor Stinner99b95382011-07-04 14:23:54 +02006196#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006197
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006198/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006199
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006200#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006201#define NEED_RETRY
6202#endif
6203
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte, judged by
   IsDBCSLeadByte() on that byte and on the character CharPrev() finds
   before it; return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6220
6221/*
6222 * Decode MBCS string into unicode object. If 'final' is set, converts
6223 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6224 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225static int
6226decode_mbcs(PyUnicodeObject **v,
6227 const char *s, /* MBCS string */
6228 int size, /* sizeof MBCS string */
6229 int final,
6230 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231{
6232 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006233 Py_ssize_t n;
6234 DWORD usize;
6235 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237 assert(size >= 0);
6238
Victor Stinner554f3f02010-06-16 23:33:54 +00006239 /* check and handle 'errors' arg */
6240 if (errors==NULL || strcmp(errors, "strict")==0)
6241 flags = MB_ERR_INVALID_CHARS;
6242 else if (strcmp(errors, "ignore")==0)
6243 flags = 0;
6244 else {
6245 PyErr_Format(PyExc_ValueError,
6246 "mbcs encoding does not support errors='%s'",
6247 errors);
6248 return -1;
6249 }
6250
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 /* Skip trailing lead-byte unless 'final' is set */
6252 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006254
6255 /* First get the size of the result */
6256 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006257 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6258 if (usize==0)
6259 goto mbcs_decode_error;
6260 } else
6261 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006262
6263 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 /* Create unicode object */
6265 *v = _PyUnicode_New(usize);
6266 if (*v == NULL)
6267 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006268 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006269 }
6270 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 /* Extend unicode object */
6272 n = PyUnicode_GET_SIZE(*v);
6273 if (_PyUnicode_Resize(v, n + usize) < 0)
6274 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006275 }
6276
6277 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006278 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006280 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6281 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006283 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006284 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006285
6286mbcs_decode_error:
6287 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6288 we raise a UnicodeDecodeError - else it is a 'generic'
6289 windows error
6290 */
6291 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6292 /* Ideally, we should get reason from FormatMessage - this
6293 is the Windows 2000 English version of the message
6294 */
6295 PyObject *exc = NULL;
6296 const char *reason = "No mapping for the Unicode character exists "
6297 "in the target multi-byte code page.";
6298 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6299 if (exc != NULL) {
6300 PyCodec_StrictErrors(exc);
6301 Py_DECREF(exc);
6302 }
6303 } else {
6304 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6305 }
6306 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006307}
6308
Alexander Belopolsky40018472011-02-26 01:02:56 +00006309PyObject *
6310PyUnicode_DecodeMBCSStateful(const char *s,
6311 Py_ssize_t size,
6312 const char *errors,
6313 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006314{
6315 PyUnicodeObject *v = NULL;
6316 int done;
6317
6318 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006320
6321#ifdef NEED_RETRY
6322 retry:
6323 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006325 else
6326#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006327 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006328
6329 if (done < 0) {
6330 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006332 }
6333
6334 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006336
6337#ifdef NEED_RETRY
6338 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 s += done;
6340 size -= done;
6341 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006342 }
6343#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006344 if (PyUnicode_READY(v) == -1) {
6345 Py_DECREF(v);
6346 return NULL;
6347 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 return (PyObject *)v;
6349}
6350
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351PyObject *
6352PyUnicode_DecodeMBCS(const char *s,
6353 Py_ssize_t size,
6354 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006355{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6357}
6358
6359/*
6360 * Convert unicode into string object (MBCS).
6361 * Returns 0 if succeed, -1 otherwise.
6362 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006363static int
6364encode_mbcs(PyObject **repr,
6365 const Py_UNICODE *p, /* unicode */
6366 int size, /* size of unicode */
6367 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368{
Victor Stinner554f3f02010-06-16 23:33:54 +00006369 BOOL usedDefaultChar = FALSE;
6370 BOOL *pusedDefaultChar;
6371 int mbcssize;
6372 Py_ssize_t n;
6373 PyObject *exc = NULL;
6374 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375
6376 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006377
Victor Stinner554f3f02010-06-16 23:33:54 +00006378 /* check and handle 'errors' arg */
6379 if (errors==NULL || strcmp(errors, "strict")==0) {
6380 flags = WC_NO_BEST_FIT_CHARS;
6381 pusedDefaultChar = &usedDefaultChar;
6382 } else if (strcmp(errors, "replace")==0) {
6383 flags = 0;
6384 pusedDefaultChar = NULL;
6385 } else {
6386 PyErr_Format(PyExc_ValueError,
6387 "mbcs encoding does not support errors='%s'",
6388 errors);
6389 return -1;
6390 }
6391
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006392 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006393 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006394 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6395 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 if (mbcssize == 0) {
6397 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6398 return -1;
6399 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006400 /* If we used a default char, then we failed! */
6401 if (pusedDefaultChar && *pusedDefaultChar)
6402 goto mbcs_encode_error;
6403 } else {
6404 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006405 }
6406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006407 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 /* Create string object */
6409 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6410 if (*repr == NULL)
6411 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006412 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006413 }
6414 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 /* Extend string object */
6416 n = PyBytes_Size(*repr);
6417 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6418 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006419 }
6420
6421 /* Do the conversion */
6422 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006424 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6425 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6427 return -1;
6428 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006429 if (pusedDefaultChar && *pusedDefaultChar)
6430 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006431 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006432 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006433
6434mbcs_encode_error:
6435 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6436 Py_XDECREF(exc);
6437 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006438}
6439
Alexander Belopolsky40018472011-02-26 01:02:56 +00006440PyObject *
6441PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6442 Py_ssize_t size,
6443 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006444{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006445 PyObject *repr = NULL;
6446 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006447
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006448#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006450 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006451 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006452 else
6453#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006454 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006455
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006456 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 Py_XDECREF(repr);
6458 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006459 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006460
6461#ifdef NEED_RETRY
6462 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 p += INT_MAX;
6464 size -= INT_MAX;
6465 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006466 }
6467#endif
6468
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006469 return repr;
6470}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006471
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472PyObject *
6473PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006474{
6475 if (!PyUnicode_Check(unicode)) {
6476 PyErr_BadArgument();
6477 return NULL;
6478 }
6479 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 PyUnicode_GET_SIZE(unicode),
6481 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006482}
6483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006484#undef NEED_RETRY
6485
Victor Stinner99b95382011-07-04 14:23:54 +02006486#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488/* --- Character Mapping Codec -------------------------------------------- */
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491PyUnicode_DecodeCharmap(const char *s,
6492 Py_ssize_t size,
6493 PyObject *mapping,
6494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t startinpos;
6498 Py_ssize_t endinpos;
6499 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 PyUnicodeObject *v;
6502 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504 PyObject *errorHandler = NULL;
6505 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006506 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006508
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 /* Default to Latin-1 */
6510 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
6513 v = _PyUnicode_New(size);
6514 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006520 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 mapstring = PyUnicode_AS_UNICODE(mapping);
6522 maplen = PyUnicode_GET_SIZE(mapping);
6523 while (s < e) {
6524 unsigned char ch = *s;
6525 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 if (ch < maplen)
6528 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 if (x == 0xfffe) {
6531 /* undefined mapping */
6532 outpos = p-PyUnicode_AS_UNICODE(v);
6533 startinpos = s-starts;
6534 endinpos = startinpos+1;
6535 if (unicode_decode_call_errorhandler(
6536 errors, &errorHandler,
6537 "charmap", "character maps to <undefined>",
6538 &starts, &e, &startinpos, &endinpos, &exc, &s,
6539 &v, &outpos, &p)) {
6540 goto onError;
6541 }
6542 continue;
6543 }
6544 *p++ = x;
6545 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006546 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006547 }
6548 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 while (s < e) {
6550 unsigned char ch = *s;
6551 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6554 w = PyLong_FromLong((long)ch);
6555 if (w == NULL)
6556 goto onError;
6557 x = PyObject_GetItem(mapping, w);
6558 Py_DECREF(w);
6559 if (x == NULL) {
6560 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6561 /* No mapping found means: mapping is undefined. */
6562 PyErr_Clear();
6563 x = Py_None;
6564 Py_INCREF(x);
6565 } else
6566 goto onError;
6567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006568
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 /* Apply mapping */
6570 if (PyLong_Check(x)) {
6571 long value = PyLong_AS_LONG(x);
6572 if (value < 0 || value > 65535) {
6573 PyErr_SetString(PyExc_TypeError,
6574 "character mapping must be in range(65536)");
6575 Py_DECREF(x);
6576 goto onError;
6577 }
6578 *p++ = (Py_UNICODE)value;
6579 }
6580 else if (x == Py_None) {
6581 /* undefined mapping */
6582 outpos = p-PyUnicode_AS_UNICODE(v);
6583 startinpos = s-starts;
6584 endinpos = startinpos+1;
6585 if (unicode_decode_call_errorhandler(
6586 errors, &errorHandler,
6587 "charmap", "character maps to <undefined>",
6588 &starts, &e, &startinpos, &endinpos, &exc, &s,
6589 &v, &outpos, &p)) {
6590 Py_DECREF(x);
6591 goto onError;
6592 }
6593 Py_DECREF(x);
6594 continue;
6595 }
6596 else if (PyUnicode_Check(x)) {
6597 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 if (targetsize == 1)
6600 /* 1-1 mapping */
6601 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006602
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 else if (targetsize > 1) {
6604 /* 1-n mapping */
6605 if (targetsize > extrachars) {
6606 /* resize first */
6607 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6608 Py_ssize_t needed = (targetsize - extrachars) + \
6609 (targetsize << 2);
6610 extrachars += needed;
6611 /* XXX overflow detection missing */
6612 if (_PyUnicode_Resize(&v,
6613 PyUnicode_GET_SIZE(v) + needed) < 0) {
6614 Py_DECREF(x);
6615 goto onError;
6616 }
6617 p = PyUnicode_AS_UNICODE(v) + oldpos;
6618 }
6619 Py_UNICODE_COPY(p,
6620 PyUnicode_AS_UNICODE(x),
6621 targetsize);
6622 p += targetsize;
6623 extrachars -= targetsize;
6624 }
6625 /* 1-0 mapping: skip the character */
6626 }
6627 else {
6628 /* wrong return value */
6629 PyErr_SetString(PyExc_TypeError,
6630 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 Py_DECREF(x);
6632 goto onError;
6633 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 Py_DECREF(x);
6635 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 }
6638 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6640 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 Py_XDECREF(errorHandler);
6642 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643 if (PyUnicode_READY(v) == -1) {
6644 Py_DECREF(v);
6645 return NULL;
6646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006650 Py_XDECREF(errorHandler);
6651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 Py_XDECREF(v);
6653 return NULL;
6654}
6655
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006656/* Charmap encoding: the lookup table */
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 PyObject_HEAD
6660 unsigned char level1[32];
6661 int count2, count3;
6662 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006663};
6664
6665static PyObject*
6666encoding_map_size(PyObject *obj, PyObject* args)
6667{
6668 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006669 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006671}
6672
6673static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006674 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 PyDoc_STR("Return the size (in bytes) of this object") },
6676 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677};
6678
6679static void
6680encoding_map_dealloc(PyObject* o)
6681{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006682 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006683}
6684
6685static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006686 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 "EncodingMap", /*tp_name*/
6688 sizeof(struct encoding_map), /*tp_basicsize*/
6689 0, /*tp_itemsize*/
6690 /* methods */
6691 encoding_map_dealloc, /*tp_dealloc*/
6692 0, /*tp_print*/
6693 0, /*tp_getattr*/
6694 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006695 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 0, /*tp_repr*/
6697 0, /*tp_as_number*/
6698 0, /*tp_as_sequence*/
6699 0, /*tp_as_mapping*/
6700 0, /*tp_hash*/
6701 0, /*tp_call*/
6702 0, /*tp_str*/
6703 0, /*tp_getattro*/
6704 0, /*tp_setattro*/
6705 0, /*tp_as_buffer*/
6706 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6707 0, /*tp_doc*/
6708 0, /*tp_traverse*/
6709 0, /*tp_clear*/
6710 0, /*tp_richcompare*/
6711 0, /*tp_weaklistoffset*/
6712 0, /*tp_iter*/
6713 0, /*tp_iternext*/
6714 encoding_map_methods, /*tp_methods*/
6715 0, /*tp_members*/
6716 0, /*tp_getset*/
6717 0, /*tp_base*/
6718 0, /*tp_dict*/
6719 0, /*tp_descr_get*/
6720 0, /*tp_descr_set*/
6721 0, /*tp_dictoffset*/
6722 0, /*tp_init*/
6723 0, /*tp_alloc*/
6724 0, /*tp_new*/
6725 0, /*tp_free*/
6726 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006727};
6728
6729PyObject*
6730PyUnicode_BuildEncodingMap(PyObject* string)
6731{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006732 PyObject *result;
6733 struct encoding_map *mresult;
6734 int i;
6735 int need_dict = 0;
6736 unsigned char level1[32];
6737 unsigned char level2[512];
6738 unsigned char *mlevel1, *mlevel2, *mlevel3;
6739 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 int kind;
6741 void *data;
6742 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006745 PyErr_BadArgument();
6746 return NULL;
6747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006748 kind = PyUnicode_KIND(string);
6749 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006750 memset(level1, 0xFF, sizeof level1);
6751 memset(level2, 0xFF, sizeof level2);
6752
6753 /* If there isn't a one-to-one mapping of NULL to \0,
6754 or if there are non-BMP characters, we need to use
6755 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006756 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006757 need_dict = 1;
6758 for (i = 1; i < 256; i++) {
6759 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006760 ch = PyUnicode_READ(kind, data, i);
6761 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006762 need_dict = 1;
6763 break;
6764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006765 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006766 /* unmapped character */
6767 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006768 l1 = ch >> 11;
6769 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006770 if (level1[l1] == 0xFF)
6771 level1[l1] = count2++;
6772 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006773 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006774 }
6775
6776 if (count2 >= 0xFF || count3 >= 0xFF)
6777 need_dict = 1;
6778
6779 if (need_dict) {
6780 PyObject *result = PyDict_New();
6781 PyObject *key, *value;
6782 if (!result)
6783 return NULL;
6784 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006786 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006787 if (!key || !value)
6788 goto failed1;
6789 if (PyDict_SetItem(result, key, value) == -1)
6790 goto failed1;
6791 Py_DECREF(key);
6792 Py_DECREF(value);
6793 }
6794 return result;
6795 failed1:
6796 Py_XDECREF(key);
6797 Py_XDECREF(value);
6798 Py_DECREF(result);
6799 return NULL;
6800 }
6801
6802 /* Create a three-level trie */
6803 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6804 16*count2 + 128*count3 - 1);
6805 if (!result)
6806 return PyErr_NoMemory();
6807 PyObject_Init(result, &EncodingMapType);
6808 mresult = (struct encoding_map*)result;
6809 mresult->count2 = count2;
6810 mresult->count3 = count3;
6811 mlevel1 = mresult->level1;
6812 mlevel2 = mresult->level23;
6813 mlevel3 = mresult->level23 + 16*count2;
6814 memcpy(mlevel1, level1, 32);
6815 memset(mlevel2, 0xFF, 16*count2);
6816 memset(mlevel3, 0, 128*count3);
6817 count3 = 0;
6818 for (i = 1; i < 256; i++) {
6819 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006821 /* unmapped character */
6822 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006823 o1 = PyUnicode_READ(kind, data, i)>>11;
6824 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006825 i2 = 16*mlevel1[o1] + o2;
6826 if (mlevel2[i2] == 0xFF)
6827 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006829 i3 = 128*mlevel2[i2] + o3;
6830 mlevel3[i3] = i;
6831 }
6832 return result;
6833}
6834
6835static int
6836encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6837{
6838 struct encoding_map *map = (struct encoding_map*)mapping;
6839 int l1 = c>>11;
6840 int l2 = (c>>7) & 0xF;
6841 int l3 = c & 0x7F;
6842 int i;
6843
6844#ifdef Py_UNICODE_WIDE
6845 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006847 }
6848#endif
6849 if (c == 0)
6850 return 0;
6851 /* level 1*/
6852 i = map->level1[l1];
6853 if (i == 0xFF) {
6854 return -1;
6855 }
6856 /* level 2*/
6857 i = map->level23[16*i+l2];
6858 if (i == 0xFF) {
6859 return -1;
6860 }
6861 /* level 3 */
6862 i = map->level23[16*map->count2 + 128*i + l3];
6863 if (i == 0) {
6864 return -1;
6865 }
6866 return i;
6867}
6868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869/* Lookup the character ch in the mapping. If the character
6870 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006871 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006872static PyObject *
6873charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Christian Heimes217cfd12007-12-02 14:31:20 +00006875 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876 PyObject *x;
6877
6878 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880 x = PyObject_GetItem(mapping, w);
6881 Py_DECREF(w);
6882 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6884 /* No mapping found means: mapping is undefined. */
6885 PyErr_Clear();
6886 x = Py_None;
6887 Py_INCREF(x);
6888 return x;
6889 } else
6890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006892 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006894 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 long value = PyLong_AS_LONG(x);
6896 if (value < 0 || value > 255) {
6897 PyErr_SetString(PyExc_TypeError,
6898 "character mapping must be in range(256)");
6899 Py_DECREF(x);
6900 return NULL;
6901 }
6902 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006904 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 /* wrong return value */
6908 PyErr_Format(PyExc_TypeError,
6909 "character mapping must return integer, bytes or None, not %.400s",
6910 x->ob_type->tp_name);
6911 Py_DECREF(x);
6912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 }
6914}
6915
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006917charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006918{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006919 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6920 /* exponentially overallocate to minimize reallocations */
6921 if (requiredsize < 2*outsize)
6922 requiredsize = 2*outsize;
6923 if (_PyBytes_Resize(outobj, requiredsize))
6924 return -1;
6925 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006926}
6927
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize reported an error */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006932 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933 space is available. Return a new reference to the object that
6934 was put in the output buffer, or Py_None, if the mapping was undefined
6935 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006936 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937static charmapencode_result
6938charmapencode_output(Py_UNICODE c, PyObject *mapping,
6939 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006941 PyObject *rep;
6942 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006943 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944
Christian Heimes90aa7642007-12-19 02:45:37 +00006945 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006946 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006948 if (res == -1)
6949 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 if (outsize<requiredsize)
6951 if (charmapencode_resize(outobj, outpos, requiredsize))
6952 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006953 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 outstart[(*outpos)++] = (char)res;
6955 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006956 }
6957
6958 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006961 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 Py_DECREF(rep);
6963 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006964 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (PyLong_Check(rep)) {
6966 Py_ssize_t requiredsize = *outpos+1;
6967 if (outsize<requiredsize)
6968 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6969 Py_DECREF(rep);
6970 return enc_EXCEPTION;
6971 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006972 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 else {
6976 const char *repchars = PyBytes_AS_STRING(rep);
6977 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6978 Py_ssize_t requiredsize = *outpos+repsize;
6979 if (outsize<requiredsize)
6980 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6981 Py_DECREF(rep);
6982 return enc_EXCEPTION;
6983 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006984 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 memcpy(outstart + *outpos, repchars, repsize);
6986 *outpos += repsize;
6987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989 Py_DECREF(rep);
6990 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991}
6992
6993/* handle an error in PyUnicode_EncodeCharmap
6994 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006995static int
6996charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006999 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007000 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001{
7002 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007003 Py_ssize_t repsize;
7004 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 Py_UNICODE *uni2;
7006 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007007 Py_ssize_t collstartpos = *inpos;
7008 Py_ssize_t collendpos = *inpos+1;
7009 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 char *encoding = "charmap";
7011 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007012 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 /* find all unencodable characters */
7015 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007016 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007017 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 int res = encoding_map_lookup(p[collendpos], mapping);
7019 if (res != -1)
7020 break;
7021 ++collendpos;
7022 continue;
7023 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 rep = charmapencode_lookup(p[collendpos], mapping);
7026 if (rep==NULL)
7027 return -1;
7028 else if (rep!=Py_None) {
7029 Py_DECREF(rep);
7030 break;
7031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 }
7035 /* cache callback name lookup
7036 * (if not done yet, i.e. it's the first error) */
7037 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 if ((errors==NULL) || (!strcmp(errors, "strict")))
7039 *known_errorHandler = 1;
7040 else if (!strcmp(errors, "replace"))
7041 *known_errorHandler = 2;
7042 else if (!strcmp(errors, "ignore"))
7043 *known_errorHandler = 3;
7044 else if (!strcmp(errors, "xmlcharrefreplace"))
7045 *known_errorHandler = 4;
7046 else
7047 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 }
7049 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007050 case 1: /* strict */
7051 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7052 return -1;
7053 case 2: /* replace */
7054 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 x = charmapencode_output('?', mapping, res, respos);
7056 if (x==enc_EXCEPTION) {
7057 return -1;
7058 }
7059 else if (x==enc_FAILED) {
7060 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7061 return -1;
7062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007063 }
7064 /* fall through */
7065 case 3: /* ignore */
7066 *inpos = collendpos;
7067 break;
7068 case 4: /* xmlcharrefreplace */
7069 /* generate replacement (temporarily (mis)uses p) */
7070 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 char buffer[2+29+1+1];
7072 char *cp;
7073 sprintf(buffer, "&#%d;", (int)p[collpos]);
7074 for (cp = buffer; *cp; ++cp) {
7075 x = charmapencode_output(*cp, mapping, res, respos);
7076 if (x==enc_EXCEPTION)
7077 return -1;
7078 else if (x==enc_FAILED) {
7079 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7080 return -1;
7081 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007082 }
7083 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 *inpos = collendpos;
7085 break;
7086 default:
7087 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 encoding, reason, p, size, exceptionObject,
7089 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007092 if (PyBytes_Check(repunicode)) {
7093 /* Directly copy bytes result to output. */
7094 Py_ssize_t outsize = PyBytes_Size(*res);
7095 Py_ssize_t requiredsize;
7096 repsize = PyBytes_Size(repunicode);
7097 requiredsize = *respos + repsize;
7098 if (requiredsize > outsize)
7099 /* Make room for all additional bytes. */
7100 if (charmapencode_resize(res, respos, requiredsize)) {
7101 Py_DECREF(repunicode);
7102 return -1;
7103 }
7104 memcpy(PyBytes_AsString(*res) + *respos,
7105 PyBytes_AsString(repunicode), repsize);
7106 *respos += repsize;
7107 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007108 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007109 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007111 /* generate replacement */
7112 repsize = PyUnicode_GET_SIZE(repunicode);
7113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 x = charmapencode_output(*uni2, mapping, res, respos);
7115 if (x==enc_EXCEPTION) {
7116 return -1;
7117 }
7118 else if (x==enc_FAILED) {
7119 Py_DECREF(repunicode);
7120 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7121 return -1;
7122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007123 }
7124 *inpos = newpos;
7125 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007126 }
7127 return 0;
7128}
7129
Alexander Belopolsky40018472011-02-26 01:02:56 +00007130PyObject *
7131PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7132 Py_ssize_t size,
7133 PyObject *mapping,
7134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136 /* output object */
7137 PyObject *res = NULL;
7138 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007141 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 PyObject *errorHandler = NULL;
7143 PyObject *exc = NULL;
7144 /* the following variable is used for caching string comparisons
7145 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7146 * 3=ignore, 4=xmlcharrefreplace */
7147 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149 /* Default to Latin-1 */
7150 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153 /* allocate enough for a simple encoding without
7154 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007155 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007156 if (res == NULL)
7157 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007158 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 /* try to encode it */
7163 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7164 if (x==enc_EXCEPTION) /* error */
7165 goto onError;
7166 if (x==enc_FAILED) { /* unencodable character */
7167 if (charmap_encoding_error(p, size, &inpos, mapping,
7168 &exc,
7169 &known_errorHandler, &errorHandler, errors,
7170 &res, &respos)) {
7171 goto onError;
7172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 else
7175 /* done with this character => adjust input position */
7176 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007180 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007181 if (_PyBytes_Resize(&res, respos) < 0)
7182 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007183
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 Py_XDECREF(exc);
7185 Py_XDECREF(errorHandler);
7186 return res;
7187
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 Py_XDECREF(res);
7190 Py_XDECREF(exc);
7191 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 return NULL;
7193}
7194
Alexander Belopolsky40018472011-02-26 01:02:56 +00007195PyObject *
7196PyUnicode_AsCharmapString(PyObject *unicode,
7197 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
7199 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 PyErr_BadArgument();
7201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 }
7203 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 PyUnicode_GET_SIZE(unicode),
7205 mapping,
7206 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210static void
7211make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007213 Py_ssize_t startpos, Py_ssize_t endpos,
7214 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007217 *exceptionObject = _PyUnicodeTranslateError_Create(
7218 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 }
7220 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7222 goto onError;
7223 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7224 goto onError;
7225 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7226 goto onError;
7227 return;
7228 onError:
7229 Py_DECREF(*exceptionObject);
7230 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
7232}
7233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007235static void
7236raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238 Py_ssize_t startpos, Py_ssize_t endpos,
7239 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240{
7241 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007242 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245}
7246
7247/* error handling callback helper:
7248 build arguments, call the callback and check the arguments,
7249 put the result into newpos and return the replacement string, which
7250 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251static PyObject *
7252unicode_translate_call_errorhandler(const char *errors,
7253 PyObject **errorHandler,
7254 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007255 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256 Py_ssize_t startpos, Py_ssize_t endpos,
7257 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007259 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007261 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 PyObject *restuple;
7263 PyObject *resunicode;
7264
7265 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 }
7270
7271 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275
7276 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007280 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007281 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 Py_DECREF(restuple);
7283 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 }
7285 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 &resunicode, &i_newpos)) {
7287 Py_DECREF(restuple);
7288 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007289 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007290 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 else
7293 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007294 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7296 Py_DECREF(restuple);
7297 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007298 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 Py_INCREF(resunicode);
7300 Py_DECREF(restuple);
7301 return resunicode;
7302}
7303
7304/* Lookup the character ch in the mapping and put the result in result,
7305 which must be decrefed by the caller.
7306 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007307static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309{
Christian Heimes217cfd12007-12-02 14:31:20 +00007310 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311 PyObject *x;
7312
7313 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007315 x = PyObject_GetItem(mapping, w);
7316 Py_DECREF(w);
7317 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7319 /* No mapping found means: use 1:1 mapping. */
7320 PyErr_Clear();
7321 *result = NULL;
7322 return 0;
7323 } else
7324 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325 }
7326 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 *result = x;
7328 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007330 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 long value = PyLong_AS_LONG(x);
7332 long max = PyUnicode_GetMax();
7333 if (value < 0 || value > max) {
7334 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007335 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 Py_DECREF(x);
7337 return -1;
7338 }
7339 *result = x;
7340 return 0;
7341 }
7342 else if (PyUnicode_Check(x)) {
7343 *result = x;
7344 return 0;
7345 }
7346 else {
7347 /* wrong return value */
7348 PyErr_SetString(PyExc_TypeError,
7349 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007350 Py_DECREF(x);
7351 return -1;
7352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353}
7354/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 if not reallocate and adjust various state variables.
7356 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007357static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007358charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007362 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 /* exponentially overallocate to minimize reallocations */
7364 if (requiredsize < 2 * oldsize)
7365 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007366 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7367 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007370 }
7371 return 0;
7372}
7373/* lookup the character, put the result in the output string and adjust
7374 various state variables. Return a new reference to the object that
7375 was put in the output buffer in *result, or Py_None, if the mapping was
7376 undefined (in which case no character was written).
7377 The called must decref result.
7378 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7381 PyObject *mapping, Py_UCS4 **output,
7382 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007385 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7386 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 }
7392 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007394 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007397 }
7398 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399 Py_ssize_t repsize;
7400 if (PyUnicode_READY(*res) == -1)
7401 return -1;
7402 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 if (repsize==1) {
7404 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007405 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 }
7407 else if (repsize!=0) {
7408 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 Py_ssize_t requiredsize = *opos +
7410 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007412 Py_ssize_t i;
7413 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007415 for(i = 0; i < repsize; i++)
7416 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007418 }
7419 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007421 return 0;
7422}
7423
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425_PyUnicode_TranslateCharmap(PyObject *input,
7426 PyObject *mapping,
7427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 /* input object */
7430 char *idata;
7431 Py_ssize_t size, i;
7432 int kind;
7433 /* output buffer */
7434 Py_UCS4 *output = NULL;
7435 Py_ssize_t osize;
7436 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007439 char *reason = "character maps to <undefined>";
7440 PyObject *errorHandler = NULL;
7441 PyObject *exc = NULL;
7442 /* the following variable is used for caching string comparisons
7443 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7444 * 3=ignore, 4=xmlcharrefreplace */
7445 int known_errorHandler = -1;
7446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 PyErr_BadArgument();
7449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 if (PyUnicode_READY(input) == -1)
7453 return NULL;
7454 idata = (char*)PyUnicode_DATA(input);
7455 kind = PyUnicode_KIND(input);
7456 size = PyUnicode_GET_LENGTH(input);
7457 i = 0;
7458
7459 if (size == 0) {
7460 Py_INCREF(input);
7461 return input;
7462 }
7463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 /* allocate enough for a simple 1:1 translation without
7465 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 osize = size;
7467 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7468 opos = 0;
7469 if (output == NULL) {
7470 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* try to encode it */
7476 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 if (charmaptranslate_output(input, i, mapping,
7478 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 Py_XDECREF(x);
7480 goto onError;
7481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 else { /* untranslatable character */
7486 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7487 Py_ssize_t repsize;
7488 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491 Py_ssize_t collstart = i;
7492 Py_ssize_t collend = i+1;
7493 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 while (collend < size) {
7497 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 goto onError;
7499 Py_XDECREF(x);
7500 if (x!=Py_None)
7501 break;
7502 ++collend;
7503 }
7504 /* cache callback name lookup
7505 * (if not done yet, i.e. it's the first error) */
7506 if (known_errorHandler==-1) {
7507 if ((errors==NULL) || (!strcmp(errors, "strict")))
7508 known_errorHandler = 1;
7509 else if (!strcmp(errors, "replace"))
7510 known_errorHandler = 2;
7511 else if (!strcmp(errors, "ignore"))
7512 known_errorHandler = 3;
7513 else if (!strcmp(errors, "xmlcharrefreplace"))
7514 known_errorHandler = 4;
7515 else
7516 known_errorHandler = 0;
7517 }
7518 switch (known_errorHandler) {
7519 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 raise_translate_exception(&exc, input, collstart,
7521 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007522 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 case 2: /* replace */
7524 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007525 for (coll = collstart; coll<collend; coll++)
7526 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 /* fall through */
7528 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 break;
7531 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532 /* generate replacement (temporarily (mis)uses i) */
7533 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 char buffer[2+29+1+1];
7535 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7537 if (charmaptranslate_makespace(&output, &osize,
7538 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 goto onError;
7540 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007541 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007543 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 break;
7545 default:
7546 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007547 reason, input, &exc,
7548 collstart, collend, &newpos);
7549 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 goto onError;
7551 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007552 repsize = PyUnicode_GET_LENGTH(repunicode);
7553 if (charmaptranslate_makespace(&output, &osize,
7554 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 Py_DECREF(repunicode);
7556 goto onError;
7557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007558 for (uni2 = 0; repsize-->0; ++uni2)
7559 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7560 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007563 }
7564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7566 if (!res)
7567 goto onError;
7568 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 Py_XDECREF(exc);
7570 Py_XDECREF(errorHandler);
7571 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007574 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575 Py_XDECREF(exc);
7576 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 return NULL;
7578}
7579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007580/* Deprecated. Use PyUnicode_Translate instead. */
7581PyObject *
7582PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7583 Py_ssize_t size,
7584 PyObject *mapping,
7585 const char *errors)
7586{
7587 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7588 if (!unicode)
7589 return NULL;
7590 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7591}
7592
Alexander Belopolsky40018472011-02-26 01:02:56 +00007593PyObject *
7594PyUnicode_Translate(PyObject *str,
7595 PyObject *mapping,
7596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
7598 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007599
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 str = PyUnicode_FromObject(str);
7601 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007603 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 Py_DECREF(str);
7605 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007606
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 Py_XDECREF(str);
7609 return NULL;
7610}
Tim Petersced69f82003-09-16 20:30:58 +00007611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612static Py_UCS4
7613fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7614{
7615 /* No need to call PyUnicode_READY(self) because this function is only
7616 called as a callback from fixup() which does it already. */
7617 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7618 const int kind = PyUnicode_KIND(self);
7619 void *data = PyUnicode_DATA(self);
7620 Py_UCS4 maxchar = 0, ch, fixed;
7621 Py_ssize_t i;
7622
7623 for (i = 0; i < len; ++i) {
7624 ch = PyUnicode_READ(kind, data, i);
7625 fixed = 0;
7626 if (ch > 127) {
7627 if (Py_UNICODE_ISSPACE(ch))
7628 fixed = ' ';
7629 else {
7630 const int decimal = Py_UNICODE_TODECIMAL(ch);
7631 if (decimal >= 0)
7632 fixed = '0' + decimal;
7633 }
7634 if (fixed != 0) {
7635 if (fixed > maxchar)
7636 maxchar = fixed;
7637 PyUnicode_WRITE(kind, data, i, fixed);
7638 }
7639 else if (ch > maxchar)
7640 maxchar = ch;
7641 }
7642 else if (ch > maxchar)
7643 maxchar = ch;
7644 }
7645
7646 return maxchar;
7647}
7648
7649PyObject *
7650_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7651{
7652 if (!PyUnicode_Check(unicode)) {
7653 PyErr_BadInternalCall();
7654 return NULL;
7655 }
7656 if (PyUnicode_READY(unicode) == -1)
7657 return NULL;
7658 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7659 /* If the string is already ASCII, just return the same string */
7660 Py_INCREF(unicode);
7661 return unicode;
7662 }
7663 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7664}
7665
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007666PyObject *
7667PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7668 Py_ssize_t length)
7669{
7670 PyObject *result;
7671 Py_UNICODE *p; /* write pointer into result */
7672 Py_ssize_t i;
7673 /* Copy to a new string */
7674 result = (PyObject *)_PyUnicode_New(length);
7675 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7676 if (result == NULL)
7677 return result;
7678 p = PyUnicode_AS_UNICODE(result);
7679 /* Iterate over code points */
7680 for (i = 0; i < length; i++) {
7681 Py_UNICODE ch =s[i];
7682 if (ch > 127) {
7683 int decimal = Py_UNICODE_TODECIMAL(ch);
7684 if (decimal >= 0)
7685 p[i] = '0' + decimal;
7686 }
7687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7689 Py_DECREF(result);
7690 return NULL;
7691 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007692 return result;
7693}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007694/* --- Decimal Encoder ---------------------------------------------------- */
7695
Alexander Belopolsky40018472011-02-26 01:02:56 +00007696int
7697PyUnicode_EncodeDecimal(Py_UNICODE *s,
7698 Py_ssize_t length,
7699 char *output,
7700 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007701{
7702 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 PyObject *errorHandler = NULL;
7704 PyObject *exc = NULL;
7705 const char *encoding = "decimal";
7706 const char *reason = "invalid decimal Unicode string";
7707 /* the following variable is used for caching string comparisons
7708 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7709 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007710
7711 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 PyErr_BadArgument();
7713 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007714 }
7715
7716 p = s;
7717 end = s + length;
7718 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 register Py_UNICODE ch = *p;
7720 int decimal;
7721 PyObject *repunicode;
7722 Py_ssize_t repsize;
7723 Py_ssize_t newpos;
7724 Py_UNICODE *uni2;
7725 Py_UNICODE *collstart;
7726 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007727
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007729 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 ++p;
7731 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 decimal = Py_UNICODE_TODECIMAL(ch);
7734 if (decimal >= 0) {
7735 *output++ = '0' + decimal;
7736 ++p;
7737 continue;
7738 }
7739 if (0 < ch && ch < 256) {
7740 *output++ = (char)ch;
7741 ++p;
7742 continue;
7743 }
7744 /* All other characters are considered unencodable */
7745 collstart = p;
7746 collend = p+1;
7747 while (collend < end) {
7748 if ((0 < *collend && *collend < 256) ||
7749 !Py_UNICODE_ISSPACE(*collend) ||
7750 Py_UNICODE_TODECIMAL(*collend))
7751 break;
7752 }
7753 /* cache callback name lookup
7754 * (if not done yet, i.e. it's the first error) */
7755 if (known_errorHandler==-1) {
7756 if ((errors==NULL) || (!strcmp(errors, "strict")))
7757 known_errorHandler = 1;
7758 else if (!strcmp(errors, "replace"))
7759 known_errorHandler = 2;
7760 else if (!strcmp(errors, "ignore"))
7761 known_errorHandler = 3;
7762 else if (!strcmp(errors, "xmlcharrefreplace"))
7763 known_errorHandler = 4;
7764 else
7765 known_errorHandler = 0;
7766 }
7767 switch (known_errorHandler) {
7768 case 1: /* strict */
7769 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7770 goto onError;
7771 case 2: /* replace */
7772 for (p = collstart; p < collend; ++p)
7773 *output++ = '?';
7774 /* fall through */
7775 case 3: /* ignore */
7776 p = collend;
7777 break;
7778 case 4: /* xmlcharrefreplace */
7779 /* generate replacement (temporarily (mis)uses p) */
7780 for (p = collstart; p < collend; ++p)
7781 output += sprintf(output, "&#%d;", (int)*p);
7782 p = collend;
7783 break;
7784 default:
7785 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7786 encoding, reason, s, length, &exc,
7787 collstart-s, collend-s, &newpos);
7788 if (repunicode == NULL)
7789 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007790 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007791 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007792 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7793 Py_DECREF(repunicode);
7794 goto onError;
7795 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 /* generate replacement */
7797 repsize = PyUnicode_GET_SIZE(repunicode);
7798 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7799 Py_UNICODE ch = *uni2;
7800 if (Py_UNICODE_ISSPACE(ch))
7801 *output++ = ' ';
7802 else {
7803 decimal = Py_UNICODE_TODECIMAL(ch);
7804 if (decimal >= 0)
7805 *output++ = '0' + decimal;
7806 else if (0 < ch && ch < 256)
7807 *output++ = (char)ch;
7808 else {
7809 Py_DECREF(repunicode);
7810 raise_encode_exception(&exc, encoding,
7811 s, length, collstart-s, collend-s, reason);
7812 goto onError;
7813 }
7814 }
7815 }
7816 p = s + newpos;
7817 Py_DECREF(repunicode);
7818 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007819 }
7820 /* 0-terminate the output string */
7821 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822 Py_XDECREF(exc);
7823 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007824 return 0;
7825
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 Py_XDECREF(exc);
7828 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007829 return -1;
7830}
7831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832/* --- Helpers ------------------------------------------------------------ */
7833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834#include "stringlib/ucs1lib.h"
7835#include "stringlib/fastsearch.h"
7836#include "stringlib/partition.h"
7837#include "stringlib/split.h"
7838#include "stringlib/count.h"
7839#include "stringlib/find.h"
7840#include "stringlib/localeutil.h"
7841#include "stringlib/undef.h"
7842
7843#include "stringlib/ucs2lib.h"
7844#include "stringlib/fastsearch.h"
7845#include "stringlib/partition.h"
7846#include "stringlib/split.h"
7847#include "stringlib/count.h"
7848#include "stringlib/find.h"
7849#include "stringlib/localeutil.h"
7850#include "stringlib/undef.h"
7851
7852#include "stringlib/ucs4lib.h"
7853#include "stringlib/fastsearch.h"
7854#include "stringlib/partition.h"
7855#include "stringlib/split.h"
7856#include "stringlib/count.h"
7857#include "stringlib/find.h"
7858#include "stringlib/localeutil.h"
7859#include "stringlib/undef.h"
7860
7861static Py_ssize_t
7862any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7863 const Py_UCS1*, Py_ssize_t,
7864 Py_ssize_t, Py_ssize_t),
7865 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7866 const Py_UCS2*, Py_ssize_t,
7867 Py_ssize_t, Py_ssize_t),
7868 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7869 const Py_UCS4*, Py_ssize_t,
7870 Py_ssize_t, Py_ssize_t),
7871 PyObject* s1, PyObject* s2,
7872 Py_ssize_t start,
7873 Py_ssize_t end)
7874{
7875 int kind1, kind2, kind;
7876 void *buf1, *buf2;
7877 Py_ssize_t len1, len2, result;
7878
7879 kind1 = PyUnicode_KIND(s1);
7880 kind2 = PyUnicode_KIND(s2);
7881 kind = kind1 > kind2 ? kind1 : kind2;
7882 buf1 = PyUnicode_DATA(s1);
7883 buf2 = PyUnicode_DATA(s2);
7884 if (kind1 != kind)
7885 buf1 = _PyUnicode_AsKind(s1, kind);
7886 if (!buf1)
7887 return -2;
7888 if (kind2 != kind)
7889 buf2 = _PyUnicode_AsKind(s2, kind);
7890 if (!buf2) {
7891 if (kind1 != kind) PyMem_Free(buf1);
7892 return -2;
7893 }
7894 len1 = PyUnicode_GET_LENGTH(s1);
7895 len2 = PyUnicode_GET_LENGTH(s2);
7896
7897 switch(kind) {
7898 case PyUnicode_1BYTE_KIND:
7899 result = ucs1(buf1, len1, buf2, len2, start, end);
7900 break;
7901 case PyUnicode_2BYTE_KIND:
7902 result = ucs2(buf1, len1, buf2, len2, start, end);
7903 break;
7904 case PyUnicode_4BYTE_KIND:
7905 result = ucs4(buf1, len1, buf2, len2, start, end);
7906 break;
7907 default:
7908 assert(0); result = -2;
7909 }
7910
7911 if (kind1 != kind)
7912 PyMem_Free(buf1);
7913 if (kind2 != kind)
7914 PyMem_Free(buf2);
7915
7916 return result;
7917}
7918
7919Py_ssize_t
7920_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7921 Py_ssize_t n_buffer,
7922 void *digits, Py_ssize_t n_digits,
7923 Py_ssize_t min_width,
7924 const char *grouping,
7925 const char *thousands_sep)
7926{
7927 switch(kind) {
7928 case PyUnicode_1BYTE_KIND:
7929 return _PyUnicode_ucs1_InsertThousandsGrouping(
7930 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7931 min_width, grouping, thousands_sep);
7932 case PyUnicode_2BYTE_KIND:
7933 return _PyUnicode_ucs2_InsertThousandsGrouping(
7934 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7935 min_width, grouping, thousands_sep);
7936 case PyUnicode_4BYTE_KIND:
7937 return _PyUnicode_ucs4_InsertThousandsGrouping(
7938 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7939 min_width, grouping, thousands_sep);
7940 }
7941 assert(0);
7942 return -1;
7943}
7944
7945
Eric Smith8c663262007-08-25 02:26:07 +00007946#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007948
Thomas Wouters477c8d52006-05-27 19:21:47 +00007949#include "stringlib/count.h"
7950#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007951
/* helper macro to fixup start/end slice values:
   end is clipped to len, negative start/end count from the end of the
   sequence and are clipped to 0.  start and end must be modifiable
   lvalues; every argument may be evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007966
Alexander Belopolsky40018472011-02-26 01:02:56 +00007967Py_ssize_t
7968PyUnicode_Count(PyObject *str,
7969 PyObject *substr,
7970 Py_ssize_t start,
7971 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007974 PyUnicodeObject* str_obj;
7975 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 int kind1, kind2, kind;
7977 void *buf1 = NULL, *buf2 = NULL;
7978 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007979
Thomas Wouters477c8d52006-05-27 19:21:47 +00007980 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007983 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007984 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 Py_DECREF(str_obj);
7986 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 }
Tim Petersced69f82003-09-16 20:30:58 +00007988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 kind1 = PyUnicode_KIND(str_obj);
7990 kind2 = PyUnicode_KIND(sub_obj);
7991 kind = kind1 > kind2 ? kind1 : kind2;
7992 buf1 = PyUnicode_DATA(str_obj);
7993 if (kind1 != kind)
7994 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7995 if (!buf1)
7996 goto onError;
7997 buf2 = PyUnicode_DATA(sub_obj);
7998 if (kind2 != kind)
7999 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8000 if (!buf2)
8001 goto onError;
8002 len1 = PyUnicode_GET_LENGTH(str_obj);
8003 len2 = PyUnicode_GET_LENGTH(sub_obj);
8004
8005 ADJUST_INDICES(start, end, len1);
8006 switch(kind) {
8007 case PyUnicode_1BYTE_KIND:
8008 result = ucs1lib_count(
8009 ((Py_UCS1*)buf1) + start, end - start,
8010 buf2, len2, PY_SSIZE_T_MAX
8011 );
8012 break;
8013 case PyUnicode_2BYTE_KIND:
8014 result = ucs2lib_count(
8015 ((Py_UCS2*)buf1) + start, end - start,
8016 buf2, len2, PY_SSIZE_T_MAX
8017 );
8018 break;
8019 case PyUnicode_4BYTE_KIND:
8020 result = ucs4lib_count(
8021 ((Py_UCS4*)buf1) + start, end - start,
8022 buf2, len2, PY_SSIZE_T_MAX
8023 );
8024 break;
8025 default:
8026 assert(0); result = 0;
8027 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008028
8029 Py_DECREF(sub_obj);
8030 Py_DECREF(str_obj);
8031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 if (kind1 != kind)
8033 PyMem_Free(buf1);
8034 if (kind2 != kind)
8035 PyMem_Free(buf2);
8036
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 onError:
8039 Py_DECREF(sub_obj);
8040 Py_DECREF(str_obj);
8041 if (kind1 != kind && buf1)
8042 PyMem_Free(buf1);
8043 if (kind2 != kind && buf2)
8044 PyMem_Free(buf2);
8045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046}
8047
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048Py_ssize_t
8049PyUnicode_Find(PyObject *str,
8050 PyObject *sub,
8051 Py_ssize_t start,
8052 Py_ssize_t end,
8053 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008060 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 Py_DECREF(str);
8063 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
Tim Petersced69f82003-09-16 20:30:58 +00008065
Thomas Wouters477c8d52006-05-27 19:21:47 +00008066 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008067 result = any_find_slice(
8068 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8069 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008070 );
8071 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008072 result = any_find_slice(
8073 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8074 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075 );
8076
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008078 Py_DECREF(sub);
8079
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 return result;
8081}
8082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083Py_ssize_t
8084PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8085 Py_ssize_t start, Py_ssize_t end,
8086 int direction)
8087{
8088 char *result;
8089 int kind;
8090 if (PyUnicode_READY(str) == -1)
8091 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008092 if (start < 0 || end < 0) {
8093 PyErr_SetString(PyExc_IndexError, "string index out of range");
8094 return -2;
8095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 if (end > PyUnicode_GET_LENGTH(str))
8097 end = PyUnicode_GET_LENGTH(str);
8098 kind = PyUnicode_KIND(str);
8099 result = findchar(PyUnicode_1BYTE_DATA(str)
8100 + PyUnicode_KIND_SIZE(kind, start),
8101 kind,
8102 end-start, ch, direction);
8103 if (!result)
8104 return -1;
8105 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8106}
8107
Alexander Belopolsky40018472011-02-26 01:02:56 +00008108static int
8109tailmatch(PyUnicodeObject *self,
8110 PyUnicodeObject *substring,
8111 Py_ssize_t start,
8112 Py_ssize_t end,
8113 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 int kind_self;
8116 int kind_sub;
8117 void *data_self;
8118 void *data_sub;
8119 Py_ssize_t offset;
8120 Py_ssize_t i;
8121 Py_ssize_t end_sub;
8122
8123 if (PyUnicode_READY(self) == -1 ||
8124 PyUnicode_READY(substring) == -1)
8125 return 0;
8126
8127 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 return 1;
8129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8131 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 kind_self = PyUnicode_KIND(self);
8136 data_self = PyUnicode_DATA(self);
8137 kind_sub = PyUnicode_KIND(substring);
8138 data_sub = PyUnicode_DATA(substring);
8139 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8140
8141 if (direction > 0)
8142 offset = end;
8143 else
8144 offset = start;
8145
8146 if (PyUnicode_READ(kind_self, data_self, offset) ==
8147 PyUnicode_READ(kind_sub, data_sub, 0) &&
8148 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8149 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8150 /* If both are of the same kind, memcmp is sufficient */
8151 if (kind_self == kind_sub) {
8152 return ! memcmp((char *)data_self +
8153 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8154 data_sub,
8155 PyUnicode_GET_LENGTH(substring) *
8156 PyUnicode_CHARACTER_SIZE(substring));
8157 }
8158 /* otherwise we have to compare each character by first accesing it */
8159 else {
8160 /* We do not need to compare 0 and len(substring)-1 because
8161 the if statement above ensured already that they are equal
8162 when we end up here. */
8163 // TODO: honor direction and do a forward or backwards search
8164 for (i = 1; i < end_sub; ++i) {
8165 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8166 PyUnicode_READ(kind_sub, data_sub, i))
8167 return 0;
8168 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 }
8172
8173 return 0;
8174}
8175
Alexander Belopolsky40018472011-02-26 01:02:56 +00008176Py_ssize_t
8177PyUnicode_Tailmatch(PyObject *str,
8178 PyObject *substr,
8179 Py_ssize_t start,
8180 Py_ssize_t end,
8181 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008183 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008184
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 str = PyUnicode_FromObject(str);
8186 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 substr = PyUnicode_FromObject(substr);
8189 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 Py_DECREF(str);
8191 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008192 }
Tim Petersced69f82003-09-16 20:30:58 +00008193
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 (PyUnicodeObject *)substr,
8196 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 Py_DECREF(str);
8198 Py_DECREF(substr);
8199 return result;
8200}
8201
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202/* Apply fixfct filter to the Unicode object self and return a
8203 reference to the modified object */
8204
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205static PyObject *
8206fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209 PyObject *u;
8210 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 if (PyUnicode_READY(self) == -1)
8213 return NULL;
8214 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8215 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8216 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8221 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008223 /* fix functions return the new maximum character in a string,
8224 if the kind of the resulting unicode object does not change,
8225 everything is fine. Otherwise we need to change the string kind
8226 and re-run the fix function. */
8227 maxchar_new = fixfct((PyUnicodeObject*)u);
8228 if (maxchar_new == 0)
8229 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8230 else if (maxchar_new <= 127)
8231 maxchar_new = 127;
8232 else if (maxchar_new <= 255)
8233 maxchar_new = 255;
8234 else if (maxchar_new <= 65535)
8235 maxchar_new = 65535;
8236 else
8237 maxchar_new = 1114111; /* 0x10ffff */
8238
8239 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 /* fixfct should return TRUE if it modified the buffer. If
8241 FALSE, return a reference to the original buffer instead
8242 (to save space, not time) */
8243 Py_INCREF(self);
8244 Py_DECREF(u);
8245 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 else if (maxchar_new == maxchar_old) {
8248 return u;
8249 }
8250 else {
8251 /* In case the maximum character changed, we need to
8252 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008253 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 if (v == NULL) {
8255 Py_DECREF(u);
8256 return NULL;
8257 }
8258 if (maxchar_new > maxchar_old) {
8259 /* If the maxchar increased so that the kind changed, not all
8260 characters are representable anymore and we need to fix the
8261 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008262 if (PyUnicode_CopyCharacters(v, 0,
8263 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008264 PyUnicode_GET_LENGTH(self)) < 0)
8265 {
8266 Py_DECREF(u);
8267 return NULL;
8268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008269 maxchar_old = fixfct((PyUnicodeObject*)v);
8270 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8271 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008272 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008273 if (PyUnicode_CopyCharacters(v, 0,
8274 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008275 PyUnicode_GET_LENGTH(self)) < 0)
8276 {
8277 Py_DECREF(u);
8278 return NULL;
8279 }
8280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281
8282 Py_DECREF(u);
8283 return v;
8284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285}
8286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008288fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 /* No need to call PyUnicode_READY(self) because this function is only
8291 called as a callback from fixup() which does it already. */
8292 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8293 const int kind = PyUnicode_KIND(self);
8294 void *data = PyUnicode_DATA(self);
8295 int touched = 0;
8296 Py_UCS4 maxchar = 0;
8297 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 for (i = 0; i < len; ++i) {
8300 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8301 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8302 if (up != ch) {
8303 if (up > maxchar)
8304 maxchar = up;
8305 PyUnicode_WRITE(kind, data, i, up);
8306 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 else if (ch > maxchar)
8309 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 }
8311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 if (touched)
8313 return maxchar;
8314 else
8315 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008319fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8322 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8323 const int kind = PyUnicode_KIND(self);
8324 void *data = PyUnicode_DATA(self);
8325 int touched = 0;
8326 Py_UCS4 maxchar = 0;
8327 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 for(i = 0; i < len; ++i) {
8330 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8331 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8332 if (lo != ch) {
8333 if (lo > maxchar)
8334 maxchar = lo;
8335 PyUnicode_WRITE(kind, data, i, lo);
8336 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 else if (ch > maxchar)
8339 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
8341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 if (touched)
8343 return maxchar;
8344 else
8345 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346}
8347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008349fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8352 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8353 const int kind = PyUnicode_KIND(self);
8354 void *data = PyUnicode_DATA(self);
8355 int touched = 0;
8356 Py_UCS4 maxchar = 0;
8357 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 for(i = 0; i < len; ++i) {
8360 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8361 Py_UCS4 nu = 0;
8362
8363 if (Py_UNICODE_ISUPPER(ch))
8364 nu = Py_UNICODE_TOLOWER(ch);
8365 else if (Py_UNICODE_ISLOWER(ch))
8366 nu = Py_UNICODE_TOUPPER(ch);
8367
8368 if (nu != 0) {
8369 if (nu > maxchar)
8370 maxchar = nu;
8371 PyUnicode_WRITE(kind, data, i, nu);
8372 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 else if (ch > maxchar)
8375 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 }
8377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 if (touched)
8379 return maxchar;
8380 else
8381 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382}
8383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008385fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8388 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8389 const int kind = PyUnicode_KIND(self);
8390 void *data = PyUnicode_DATA(self);
8391 int touched = 0;
8392 Py_UCS4 maxchar = 0;
8393 Py_ssize_t i = 0;
8394 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008395
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008396 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398
8399 ch = PyUnicode_READ(kind, data, i);
8400 if (!Py_UNICODE_ISUPPER(ch)) {
8401 maxchar = Py_UNICODE_TOUPPER(ch);
8402 PyUnicode_WRITE(kind, data, i, maxchar);
8403 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 ++i;
8406 for(; i < len; ++i) {
8407 ch = PyUnicode_READ(kind, data, i);
8408 if (!Py_UNICODE_ISLOWER(ch)) {
8409 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8410 if (lo > maxchar)
8411 maxchar = lo;
8412 PyUnicode_WRITE(kind, data, i, lo);
8413 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 else if (ch > maxchar)
8416 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418
8419 if (touched)
8420 return maxchar;
8421 else
8422 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423}
8424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8429 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8430 const int kind = PyUnicode_KIND(self);
8431 void *data = PyUnicode_DATA(self);
8432 Py_UCS4 maxchar = 0;
8433 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 int previous_is_cased;
8435
8436 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 if (len == 1) {
8438 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8439 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8440 if (ti != ch) {
8441 PyUnicode_WRITE(kind, data, i, ti);
8442 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 }
8444 else
8445 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 for(; i < len; ++i) {
8449 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8450 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008451
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 nu = Py_UNICODE_TOTITLE(ch);
8456
8457 if (nu > maxchar)
8458 maxchar = nu;
8459 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008460
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 if (Py_UNICODE_ISLOWER(ch) ||
8462 Py_UNICODE_ISUPPER(ch) ||
8463 Py_UNICODE_ISTITLE(ch))
8464 previous_is_cased = 1;
8465 else
8466 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469}
8470
Tim Peters8ce9f162004-08-27 01:49:32 +00008471PyObject *
8472PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008475 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008477 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008478 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8479 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008480 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 Py_ssize_t sz, i, res_offset;
8482 Py_UCS4 maxchar = 0;
8483 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
Tim Peters05eba1f2004-08-27 21:32:02 +00008485 fseq = PySequence_Fast(seq, "");
8486 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008488 }
8489
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008490 /* NOTE: the following code can't call back into Python code,
8491 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008492 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008493
Tim Peters05eba1f2004-08-27 21:32:02 +00008494 seqlen = PySequence_Fast_GET_SIZE(fseq);
8495 /* If empty sequence, return u"". */
8496 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008499 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008500 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008501 /* If singleton sequence with an exact Unicode, return that. */
8502 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 item = items[0];
8504 if (PyUnicode_CheckExact(item)) {
8505 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 goto Done;
8508 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008509 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008510 else {
8511 /* Set up sep and seplen */
8512 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 /* fall back to a blank space separator */
8514 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008515 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008517 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008518 else {
8519 if (!PyUnicode_Check(separator)) {
8520 PyErr_Format(PyExc_TypeError,
8521 "separator: expected str instance,"
8522 " %.80s found",
8523 Py_TYPE(separator)->tp_name);
8524 goto onError;
8525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 if (PyUnicode_READY(separator) == -1)
8527 goto onError;
8528 sep = separator;
8529 seplen = PyUnicode_GET_LENGTH(separator);
8530 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8531 /* inc refcount to keep this code path symetric with the
8532 above case of a blank separator */
8533 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008534 }
8535 }
8536
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008537 /* There are at least two things to join, or else we have a subclass
8538 * of str in the sequence.
8539 * Do a pre-pass to figure out the total amount of space we'll
8540 * need (sz), and see whether all argument are strings.
8541 */
8542 sz = 0;
8543 for (i = 0; i < seqlen; i++) {
8544 const Py_ssize_t old_sz = sz;
8545 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 if (!PyUnicode_Check(item)) {
8547 PyErr_Format(PyExc_TypeError,
8548 "sequence item %zd: expected str instance,"
8549 " %.80s found",
8550 i, Py_TYPE(item)->tp_name);
8551 goto onError;
8552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 if (PyUnicode_READY(item) == -1)
8554 goto onError;
8555 sz += PyUnicode_GET_LENGTH(item);
8556 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8557 if (item_maxchar > maxchar)
8558 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008559 if (i != 0)
8560 sz += seplen;
8561 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8562 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008564 goto onError;
8565 }
8566 }
Tim Petersced69f82003-09-16 20:30:58 +00008567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008569 if (res == NULL)
8570 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008571
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008572 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008574 Py_ssize_t itemlen;
8575 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 /* Copy item, and maybe the separator. */
8578 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008579 if (PyUnicode_CopyCharacters(res, res_offset,
8580 sep, 0, seplen) < 0)
8581 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008584 if (PyUnicode_CopyCharacters(res, res_offset,
8585 item, 0, itemlen) < 0)
8586 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008592 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 Py_XDECREF(sep);
8594 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008597 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008599 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 return NULL;
8601}
8602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603#define FILL(kind, data, value, start, length) \
8604 do { \
8605 Py_ssize_t i_ = 0; \
8606 assert(kind != PyUnicode_WCHAR_KIND); \
8607 switch ((kind)) { \
8608 case PyUnicode_1BYTE_KIND: { \
8609 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8610 memset(to_, (unsigned char)value, length); \
8611 break; \
8612 } \
8613 case PyUnicode_2BYTE_KIND: { \
8614 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8615 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8616 break; \
8617 } \
8618 default: { \
8619 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8620 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8621 break; \
8622 } \
8623 } \
8624 } while (0)
8625
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626static PyUnicodeObject *
8627pad(PyUnicodeObject *self,
8628 Py_ssize_t left,
8629 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 PyObject *u;
8633 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008634 int kind;
8635 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
8637 if (left < 0)
8638 left = 0;
8639 if (right < 0)
8640 right = 0;
8641
Tim Peters7a29bd52001-09-12 03:03:31 +00008642 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 Py_INCREF(self);
8644 return self;
8645 }
8646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8648 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008649 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8650 return NULL;
8651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8653 if (fill > maxchar)
8654 maxchar = fill;
8655 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008656 if (!u)
8657 return NULL;
8658
8659 kind = PyUnicode_KIND(u);
8660 data = PyUnicode_DATA(u);
8661 if (left)
8662 FILL(kind, data, fill, 0, left);
8663 if (right)
8664 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008665 if (PyUnicode_CopyCharacters(u, left,
8666 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008667 _PyUnicode_LENGTH(self)) < 0)
8668 {
8669 Py_DECREF(u);
8670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676
Alexander Belopolsky40018472011-02-26 01:02:56 +00008677PyObject *
8678PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
8682 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 switch(PyUnicode_KIND(string)) {
8687 case PyUnicode_1BYTE_KIND:
8688 list = ucs1lib_splitlines(
8689 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8690 PyUnicode_GET_LENGTH(string), keepends);
8691 break;
8692 case PyUnicode_2BYTE_KIND:
8693 list = ucs2lib_splitlines(
8694 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8695 PyUnicode_GET_LENGTH(string), keepends);
8696 break;
8697 case PyUnicode_4BYTE_KIND:
8698 list = ucs4lib_splitlines(
8699 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8700 PyUnicode_GET_LENGTH(string), keepends);
8701 break;
8702 default:
8703 assert(0);
8704 list = 0;
8705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 Py_DECREF(string);
8707 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708}
8709
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710static PyObject *
8711split(PyUnicodeObject *self,
8712 PyUnicodeObject *substring,
8713 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 int kind1, kind2, kind;
8716 void *buf1, *buf2;
8717 Py_ssize_t len1, len2;
8718 PyObject* out;
8719
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008721 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 if (PyUnicode_READY(self) == -1)
8724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 if (substring == NULL)
8727 switch(PyUnicode_KIND(self)) {
8728 case PyUnicode_1BYTE_KIND:
8729 return ucs1lib_split_whitespace(
8730 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8731 PyUnicode_GET_LENGTH(self), maxcount
8732 );
8733 case PyUnicode_2BYTE_KIND:
8734 return ucs2lib_split_whitespace(
8735 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8736 PyUnicode_GET_LENGTH(self), maxcount
8737 );
8738 case PyUnicode_4BYTE_KIND:
8739 return ucs4lib_split_whitespace(
8740 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8741 PyUnicode_GET_LENGTH(self), maxcount
8742 );
8743 default:
8744 assert(0);
8745 return NULL;
8746 }
8747
8748 if (PyUnicode_READY(substring) == -1)
8749 return NULL;
8750
8751 kind1 = PyUnicode_KIND(self);
8752 kind2 = PyUnicode_KIND(substring);
8753 kind = kind1 > kind2 ? kind1 : kind2;
8754 buf1 = PyUnicode_DATA(self);
8755 buf2 = PyUnicode_DATA(substring);
8756 if (kind1 != kind)
8757 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8758 if (!buf1)
8759 return NULL;
8760 if (kind2 != kind)
8761 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8762 if (!buf2) {
8763 if (kind1 != kind) PyMem_Free(buf1);
8764 return NULL;
8765 }
8766 len1 = PyUnicode_GET_LENGTH(self);
8767 len2 = PyUnicode_GET_LENGTH(substring);
8768
8769 switch(kind) {
8770 case PyUnicode_1BYTE_KIND:
8771 out = ucs1lib_split(
8772 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8773 break;
8774 case PyUnicode_2BYTE_KIND:
8775 out = ucs2lib_split(
8776 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8777 break;
8778 case PyUnicode_4BYTE_KIND:
8779 out = ucs4lib_split(
8780 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8781 break;
8782 default:
8783 out = NULL;
8784 }
8785 if (kind1 != kind)
8786 PyMem_Free(buf1);
8787 if (kind2 != kind)
8788 PyMem_Free(buf2);
8789 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790}
8791
Alexander Belopolsky40018472011-02-26 01:02:56 +00008792static PyObject *
8793rsplit(PyUnicodeObject *self,
8794 PyUnicodeObject *substring,
8795 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 int kind1, kind2, kind;
8798 void *buf1, *buf2;
8799 Py_ssize_t len1, len2;
8800 PyObject* out;
8801
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008802 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008803 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 if (PyUnicode_READY(self) == -1)
8806 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 if (substring == NULL)
8809 switch(PyUnicode_KIND(self)) {
8810 case PyUnicode_1BYTE_KIND:
8811 return ucs1lib_rsplit_whitespace(
8812 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8813 PyUnicode_GET_LENGTH(self), maxcount
8814 );
8815 case PyUnicode_2BYTE_KIND:
8816 return ucs2lib_rsplit_whitespace(
8817 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8818 PyUnicode_GET_LENGTH(self), maxcount
8819 );
8820 case PyUnicode_4BYTE_KIND:
8821 return ucs4lib_rsplit_whitespace(
8822 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8823 PyUnicode_GET_LENGTH(self), maxcount
8824 );
8825 default:
8826 assert(0);
8827 return NULL;
8828 }
8829
8830 if (PyUnicode_READY(substring) == -1)
8831 return NULL;
8832
8833 kind1 = PyUnicode_KIND(self);
8834 kind2 = PyUnicode_KIND(substring);
8835 kind = kind1 > kind2 ? kind1 : kind2;
8836 buf1 = PyUnicode_DATA(self);
8837 buf2 = PyUnicode_DATA(substring);
8838 if (kind1 != kind)
8839 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8840 if (!buf1)
8841 return NULL;
8842 if (kind2 != kind)
8843 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8844 if (!buf2) {
8845 if (kind1 != kind) PyMem_Free(buf1);
8846 return NULL;
8847 }
8848 len1 = PyUnicode_GET_LENGTH(self);
8849 len2 = PyUnicode_GET_LENGTH(substring);
8850
8851 switch(kind) {
8852 case PyUnicode_1BYTE_KIND:
8853 out = ucs1lib_rsplit(
8854 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8855 break;
8856 case PyUnicode_2BYTE_KIND:
8857 out = ucs2lib_rsplit(
8858 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8859 break;
8860 case PyUnicode_4BYTE_KIND:
8861 out = ucs4lib_rsplit(
8862 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8863 break;
8864 default:
8865 out = NULL;
8866 }
8867 if (kind1 != kind)
8868 PyMem_Free(buf1);
8869 if (kind2 != kind)
8870 PyMem_Free(buf2);
8871 return out;
8872}
8873
8874static Py_ssize_t
8875anylib_find(int kind, void *buf1, Py_ssize_t len1,
8876 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8877{
8878 switch(kind) {
8879 case PyUnicode_1BYTE_KIND:
8880 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8881 case PyUnicode_2BYTE_KIND:
8882 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8883 case PyUnicode_4BYTE_KIND:
8884 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8885 }
8886 assert(0);
8887 return -1;
8888}
8889
8890static Py_ssize_t
8891anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8892 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8893{
8894 switch(kind) {
8895 case PyUnicode_1BYTE_KIND:
8896 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8897 case PyUnicode_2BYTE_KIND:
8898 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8899 case PyUnicode_4BYTE_KIND:
8900 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8901 }
8902 assert(0);
8903 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008904}
8905
Alexander Belopolsky40018472011-02-26 01:02:56 +00008906static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907replace(PyObject *self, PyObject *str1,
8908 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 PyObject *u;
8911 char *sbuf = PyUnicode_DATA(self);
8912 char *buf1 = PyUnicode_DATA(str1);
8913 char *buf2 = PyUnicode_DATA(str2);
8914 int srelease = 0, release1 = 0, release2 = 0;
8915 int skind = PyUnicode_KIND(self);
8916 int kind1 = PyUnicode_KIND(str1);
8917 int kind2 = PyUnicode_KIND(str2);
8918 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8919 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8920 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921
8922 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008925 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 if (skind < kind1)
8928 /* substring too wide to be present */
8929 goto nothing;
8930
8931 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008932 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008935 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008937 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 Py_UCS4 u1, u2, maxchar;
8939 int mayshrink, rkind;
8940 u1 = PyUnicode_READ_CHAR(str1, 0);
8941 if (!findchar(sbuf, PyUnicode_KIND(self),
8942 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008943 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 u2 = PyUnicode_READ_CHAR(str2, 0);
8945 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8946 /* Replacing u1 with u2 may cause a maxchar reduction in the
8947 result string. */
8948 mayshrink = maxchar > 127;
8949 if (u2 > maxchar) {
8950 maxchar = u2;
8951 mayshrink = 0;
8952 }
8953 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008954 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008956 if (PyUnicode_CopyCharacters(u, 0,
8957 (PyObject*)self, 0, slen) < 0)
8958 {
8959 Py_DECREF(u);
8960 return NULL;
8961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 rkind = PyUnicode_KIND(u);
8963 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8964 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008965 if (--maxcount < 0)
8966 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 if (mayshrink) {
8970 PyObject *tmp = u;
8971 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8972 PyUnicode_GET_LENGTH(tmp));
8973 Py_DECREF(tmp);
8974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 int rkind = skind;
8977 char *res;
8978 if (kind1 < rkind) {
8979 /* widen substring */
8980 buf1 = _PyUnicode_AsKind(str1, rkind);
8981 if (!buf1) goto error;
8982 release1 = 1;
8983 }
8984 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008985 if (i < 0)
8986 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 if (rkind > kind2) {
8988 /* widen replacement */
8989 buf2 = _PyUnicode_AsKind(str2, rkind);
8990 if (!buf2) goto error;
8991 release2 = 1;
8992 }
8993 else if (rkind < kind2) {
8994 /* widen self and buf1 */
8995 rkind = kind2;
8996 if (release1) PyMem_Free(buf1);
8997 sbuf = _PyUnicode_AsKind(self, rkind);
8998 if (!sbuf) goto error;
8999 srelease = 1;
9000 buf1 = _PyUnicode_AsKind(str1, rkind);
9001 if (!buf1) goto error;
9002 release1 = 1;
9003 }
9004 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9005 if (!res) {
9006 PyErr_NoMemory();
9007 goto error;
9008 }
9009 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009010 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9012 buf2,
9013 PyUnicode_KIND_SIZE(rkind, len2));
9014 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009015
9016 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9018 slen-i,
9019 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009020 if (i == -1)
9021 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9023 buf2,
9024 PyUnicode_KIND_SIZE(rkind, len2));
9025 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027
9028 u = PyUnicode_FromKindAndData(rkind, res, slen);
9029 PyMem_Free(res);
9030 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 Py_ssize_t n, i, j, ires;
9035 Py_ssize_t product, new_size;
9036 int rkind = skind;
9037 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 if (kind1 < rkind) {
9040 buf1 = _PyUnicode_AsKind(str1, rkind);
9041 if (!buf1) goto error;
9042 release1 = 1;
9043 }
9044 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009045 if (n == 0)
9046 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (kind2 < rkind) {
9048 buf2 = _PyUnicode_AsKind(str2, rkind);
9049 if (!buf2) goto error;
9050 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 else if (kind2 > rkind) {
9053 rkind = kind2;
9054 sbuf = _PyUnicode_AsKind(self, rkind);
9055 if (!sbuf) goto error;
9056 srelease = 1;
9057 if (release1) PyMem_Free(buf1);
9058 buf1 = _PyUnicode_AsKind(str1, rkind);
9059 if (!buf1) goto error;
9060 release1 = 1;
9061 }
9062 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9063 PyUnicode_GET_LENGTH(str1))); */
9064 product = n * (len2-len1);
9065 if ((product / (len2-len1)) != n) {
9066 PyErr_SetString(PyExc_OverflowError,
9067 "replace string is too long");
9068 goto error;
9069 }
9070 new_size = slen + product;
9071 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9072 PyErr_SetString(PyExc_OverflowError,
9073 "replace string is too long");
9074 goto error;
9075 }
9076 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9077 if (!res)
9078 goto error;
9079 ires = i = 0;
9080 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009081 while (n-- > 0) {
9082 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 j = anylib_find(rkind,
9084 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9085 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009086 if (j == -1)
9087 break;
9088 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9091 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9092 PyUnicode_KIND_SIZE(rkind, j-i));
9093 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009094 }
9095 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 if (len2 > 0) {
9097 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9098 buf2,
9099 PyUnicode_KIND_SIZE(rkind, len2));
9100 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009105 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9107 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9108 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009109 } else {
9110 /* interleave */
9111 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9113 buf2,
9114 PyUnicode_KIND_SIZE(rkind, len2));
9115 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009116 if (--n <= 0)
9117 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9119 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9120 PyUnicode_KIND_SIZE(rkind, 1));
9121 ires++;
9122 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9125 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9126 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009129 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 if (srelease)
9132 PyMem_FREE(sbuf);
9133 if (release1)
9134 PyMem_FREE(buf1);
9135 if (release2)
9136 PyMem_FREE(buf2);
9137 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009138
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009140 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 if (srelease)
9142 PyMem_FREE(sbuf);
9143 if (release1)
9144 PyMem_FREE(buf1);
9145 if (release2)
9146 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009147 if (PyUnicode_CheckExact(self)) {
9148 Py_INCREF(self);
9149 return (PyObject *) self;
9150 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009151 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 error:
9153 if (srelease && sbuf)
9154 PyMem_FREE(sbuf);
9155 if (release1 && buf1)
9156 PyMem_FREE(buf1);
9157 if (release2 && buf2)
9158 PyMem_FREE(buf2);
9159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160}
9161
9162/* --- Unicode Object Methods --------------------------------------------- */
9163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009164PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166\n\
9167Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009168characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169
9170static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009171unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 return fixup(self, fixtitle);
9174}
9175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009176PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178\n\
9179Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009180have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
9182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009183unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 return fixup(self, fixcapitalize);
9186}
9187
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits `self` with split(self, NULL, -1), replaces every list item by
   its fixup(..., fixcapitalize) result, and joins the list with
   PyUnicode_Join(NULL, list).  Returns the joined string, or NULL if the
   split, any fixup() call, or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, word);
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
9225
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009226/* Argument converter. Coerces to a single unicode character */
9227
9228static int
9229convert_uc(PyObject *obj, void *addr)
9230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009232 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009233
Benjamin Peterson14339b62009-01-31 16:36:08 +00009234 uniobj = PyUnicode_FromObject(obj);
9235 if (uniobj == NULL) {
9236 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009238 return 0;
9239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009241 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009243 Py_DECREF(uniobj);
9244 return 0;
9245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009247 Py_DECREF(uniobj);
9248 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009249}
9250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009251PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009254Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009255done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256
9257static PyObject *
9258unicode_center(PyUnicodeObject *self, PyObject *args)
9259{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009260 Py_ssize_t marg, left;
9261 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 Py_UCS4 fillchar = ' ';
9263
Victor Stinnere9a29352011-10-01 02:14:59 +02009264 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266
Victor Stinnere9a29352011-10-01 02:14:59 +02009267 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 return NULL;
9269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271 Py_INCREF(self);
9272 return (PyObject*) self;
9273 }
9274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 left = marg / 2 + (marg & width & 1);
9277
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009278 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Marc-André Lemburge5034372000-08-08 08:04:29 +00009281#if 0
9282
9283/* This code should go into some future Unicode collation support
9284 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009285 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009286
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009287/* speedy UTF-16 code point order comparison */
9288/* gleaned from: */
9289/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9290
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009291static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009292{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009293 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009294 0, 0, 0, 0, 0, 0, 0, 0,
9295 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009296 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009297};
9298
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299static int
9300unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009302 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009303
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 Py_UNICODE *s1 = str1->str;
9305 Py_UNICODE *s2 = str2->str;
9306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 len1 = str1->_base._base.length;
9308 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009309
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009311 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009312
9313 c1 = *s1++;
9314 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009315
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 if (c1 > (1<<11) * 26)
9317 c1 += utf16Fixup[c1>>11];
9318 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009319 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009320 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009321
9322 if (c1 != c2)
9323 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009324
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009325 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 }
9327
9328 return (len1 < len2) ? -1 : (len1 != len2);
9329}
9330
Marc-André Lemburge5034372000-08-08 08:04:29 +00009331#else
9332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333/* This function assumes that str1 and str2 are readied by the caller. */
9334
Marc-André Lemburge5034372000-08-08 08:04:29 +00009335static int
9336unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 int kind1, kind2;
9339 void *data1, *data2;
9340 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 kind1 = PyUnicode_KIND(str1);
9343 kind2 = PyUnicode_KIND(str2);
9344 data1 = PyUnicode_DATA(str1);
9345 data2 = PyUnicode_DATA(str2);
9346 len1 = PyUnicode_GET_LENGTH(str1);
9347 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 for (i = 0; i < len1 && i < len2; ++i) {
9350 Py_UCS4 c1, c2;
9351 c1 = PyUnicode_READ(kind1, data1, i);
9352 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009353
9354 if (c1 != c2)
9355 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009356 }
9357
9358 return (len1 < len2) ? -1 : (len1 != len2);
9359}
9360
9361#endif
9362
Alexander Belopolsky40018472011-02-26 01:02:56 +00009363int
9364PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9367 if (PyUnicode_READY(left) == -1 ||
9368 PyUnicode_READY(right) == -1)
9369 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009370 return unicode_compare((PyUnicodeObject *)left,
9371 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009373 PyErr_Format(PyExc_TypeError,
9374 "Can't compare %.100s and %.100s",
9375 left->ob_type->tp_name,
9376 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 return -1;
9378}
9379
Martin v. Löwis5b222132007-06-10 09:51:05 +00009380int
9381PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 Py_ssize_t i;
9384 int kind;
9385 void *data;
9386 Py_UCS4 chr;
9387
Martin v. Löwis5b222132007-06-10 09:51:05 +00009388 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 if (PyUnicode_READY(uni) == -1)
9390 return -1;
9391 kind = PyUnicode_KIND(uni);
9392 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009393 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9395 if (chr != str[i])
9396 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009397 /* This check keeps Python strings that end in '\0' from comparing equal
9398 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009401 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009403 return 0;
9404}
9405
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009406
Benjamin Peterson29060642009-01-31 22:14:21 +00009407#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009408 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009409
Alexander Belopolsky40018472011-02-26 01:02:56 +00009410PyObject *
9411PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009412{
9413 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009414
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009415 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9416 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 if (PyUnicode_READY(left) == -1 ||
9418 PyUnicode_READY(right) == -1)
9419 return NULL;
9420 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9421 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009422 if (op == Py_EQ) {
9423 Py_INCREF(Py_False);
9424 return Py_False;
9425 }
9426 if (op == Py_NE) {
9427 Py_INCREF(Py_True);
9428 return Py_True;
9429 }
9430 }
9431 if (left == right)
9432 result = 0;
9433 else
9434 result = unicode_compare((PyUnicodeObject *)left,
9435 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009436
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009437 /* Convert the return value to a Boolean */
9438 switch (op) {
9439 case Py_EQ:
9440 v = TEST_COND(result == 0);
9441 break;
9442 case Py_NE:
9443 v = TEST_COND(result != 0);
9444 break;
9445 case Py_LE:
9446 v = TEST_COND(result <= 0);
9447 break;
9448 case Py_GE:
9449 v = TEST_COND(result >= 0);
9450 break;
9451 case Py_LT:
9452 v = TEST_COND(result == -1);
9453 break;
9454 case Py_GT:
9455 v = TEST_COND(result == 1);
9456 break;
9457 default:
9458 PyErr_BadArgument();
9459 return NULL;
9460 }
9461 Py_INCREF(v);
9462 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009464
Brian Curtindfc80e32011-08-10 20:28:54 -05009465 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009466}
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468int
9469PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009470{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009471 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 int kind1, kind2, kind;
9473 void *buf1, *buf2;
9474 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009475 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009476
9477 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 sub = PyUnicode_FromObject(element);
9479 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 PyErr_Format(PyExc_TypeError,
9481 "'in <string>' requires string as left operand, not %s",
9482 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (PyUnicode_READY(sub) == -1)
9486 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009487
Thomas Wouters477c8d52006-05-27 19:21:47 +00009488 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009489 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009490 Py_DECREF(sub);
9491 return -1;
9492 }
9493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 kind1 = PyUnicode_KIND(str);
9495 kind2 = PyUnicode_KIND(sub);
9496 kind = kind1 > kind2 ? kind1 : kind2;
9497 buf1 = PyUnicode_DATA(str);
9498 buf2 = PyUnicode_DATA(sub);
9499 if (kind1 != kind)
9500 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9501 if (!buf1) {
9502 Py_DECREF(sub);
9503 return -1;
9504 }
9505 if (kind2 != kind)
9506 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9507 if (!buf2) {
9508 Py_DECREF(sub);
9509 if (kind1 != kind) PyMem_Free(buf1);
9510 return -1;
9511 }
9512 len1 = PyUnicode_GET_LENGTH(str);
9513 len2 = PyUnicode_GET_LENGTH(sub);
9514
9515 switch(kind) {
9516 case PyUnicode_1BYTE_KIND:
9517 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9518 break;
9519 case PyUnicode_2BYTE_KIND:
9520 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9521 break;
9522 case PyUnicode_4BYTE_KIND:
9523 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9524 break;
9525 default:
9526 result = -1;
9527 assert(0);
9528 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009529
9530 Py_DECREF(str);
9531 Py_DECREF(sub);
9532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 if (kind1 != kind)
9534 PyMem_Free(buf1);
9535 if (kind2 != kind)
9536 PyMem_Free(buf2);
9537
Guido van Rossum403d68b2000-03-13 15:55:09 +00009538 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009539}
9540
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541/* Concat to string or Unicode object giving a new Unicode object. */
9542
Alexander Belopolsky40018472011-02-26 01:02:56 +00009543PyObject *
9544PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009546 PyObject *u = NULL, *v = NULL, *w;
9547 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548
9549 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556
9557 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009558 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009559 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009562 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 }
9566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009568 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 w = PyUnicode_New(
9572 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9573 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009576 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9577 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009578 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009579 v, 0,
9580 PyUnicode_GET_LENGTH(v)) < 0)
9581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 Py_DECREF(u);
9583 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 Py_XDECREF(u);
9588 Py_XDECREF(v);
9589 return NULL;
9590}
9591
Walter Dörwald1ab83302007-05-18 17:15:44 +00009592void
9593PyUnicode_Append(PyObject **pleft, PyObject *right)
9594{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009595 PyObject *new;
9596 if (*pleft == NULL)
9597 return;
9598 if (right == NULL || !PyUnicode_Check(*pleft)) {
9599 Py_DECREF(*pleft);
9600 *pleft = NULL;
9601 return;
9602 }
9603 new = PyUnicode_Concat(*pleft, right);
9604 Py_DECREF(*pleft);
9605 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009606}
9607
9608void
9609PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9610{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009611 PyUnicode_Append(pleft, right);
9612 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009613}
9614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009615PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009618Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009619string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009620interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621
9622static PyObject *
9623unicode_count(PyUnicodeObject *self, PyObject *args)
9624{
9625 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009626 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009627 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 int kind1, kind2, kind;
9630 void *buf1, *buf2;
9631 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632
Jesus Ceaac451502011-04-20 17:09:23 +02009633 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9634 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 kind1 = PyUnicode_KIND(self);
9638 kind2 = PyUnicode_KIND(substring);
9639 kind = kind1 > kind2 ? kind1 : kind2;
9640 buf1 = PyUnicode_DATA(self);
9641 buf2 = PyUnicode_DATA(substring);
9642 if (kind1 != kind)
9643 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9644 if (!buf1) {
9645 Py_DECREF(substring);
9646 return NULL;
9647 }
9648 if (kind2 != kind)
9649 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9650 if (!buf2) {
9651 Py_DECREF(substring);
9652 if (kind1 != kind) PyMem_Free(buf1);
9653 return NULL;
9654 }
9655 len1 = PyUnicode_GET_LENGTH(self);
9656 len2 = PyUnicode_GET_LENGTH(substring);
9657
9658 ADJUST_INDICES(start, end, len1);
9659 switch(kind) {
9660 case PyUnicode_1BYTE_KIND:
9661 iresult = ucs1lib_count(
9662 ((Py_UCS1*)buf1) + start, end - start,
9663 buf2, len2, PY_SSIZE_T_MAX
9664 );
9665 break;
9666 case PyUnicode_2BYTE_KIND:
9667 iresult = ucs2lib_count(
9668 ((Py_UCS2*)buf1) + start, end - start,
9669 buf2, len2, PY_SSIZE_T_MAX
9670 );
9671 break;
9672 case PyUnicode_4BYTE_KIND:
9673 iresult = ucs4lib_count(
9674 ((Py_UCS4*)buf1) + start, end - start,
9675 buf2, len2, PY_SSIZE_T_MAX
9676 );
9677 break;
9678 default:
9679 assert(0); iresult = 0;
9680 }
9681
9682 result = PyLong_FromSsize_t(iresult);
9683
9684 if (kind1 != kind)
9685 PyMem_Free(buf1);
9686 if (kind2 != kind)
9687 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688
9689 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009690
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 return result;
9692}
9693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009694PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009695 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009697Encode S using the codec registered for encoding. Default encoding\n\
9698is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009699handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009700a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9701'xmlcharrefreplace' as well as any other name registered with\n\
9702codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703
9704static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009705unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009707 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 char *encoding = NULL;
9709 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009710
Benjamin Peterson308d6372009-09-18 21:42:35 +00009711 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9712 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009714 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009715}
9716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009717PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719\n\
9720Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009721If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722
9723static PyObject*
9724unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9725{
9726 Py_UNICODE *e;
9727 Py_UNICODE *p;
9728 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009729 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 PyUnicodeObject *u;
9732 int tabsize = 8;
9733
9734 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9738 return NULL;
9739
Thomas Wouters7e474022000-07-16 12:04:32 +00009740 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009741 i = 0; /* chars up to and including most recent \n or \r */
9742 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9744 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 if (tabsize > 0) {
9747 incr = tabsize - (j % tabsize); /* cannot overflow */
9748 if (j > PY_SSIZE_T_MAX - incr)
9749 goto overflow1;
9750 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009751 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 if (j > PY_SSIZE_T_MAX - 1)
9755 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 j++;
9757 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 if (i > PY_SSIZE_T_MAX - j)
9759 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009761 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 }
9763 }
9764
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009765 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009767
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768 /* Second pass: create output string and fill it */
9769 u = _PyUnicode_New(i + j);
9770 if (!u)
9771 return NULL;
9772
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009773 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 q = _PyUnicode_WSTR(u); /* next output char */
9775 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009779 if (tabsize > 0) {
9780 i = tabsize - (j % tabsize);
9781 j += i;
9782 while (i--) {
9783 if (q >= qe)
9784 goto overflow2;
9785 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 else {
9790 if (q >= qe)
9791 goto overflow2;
9792 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009793 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 if (*p == '\n' || *p == '\r')
9795 j = 0;
9796 }
9797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 if (PyUnicode_READY(u) == -1) {
9799 Py_DECREF(u);
9800 return NULL;
9801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009803
9804 overflow2:
9805 Py_DECREF(u);
9806 overflow1:
9807 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809}
9810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009811PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813\n\
9814Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009815such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816arguments start and end are interpreted as in slice notation.\n\
9817\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009818Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819
9820static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822{
Jesus Ceaac451502011-04-20 17:09:23 +02009823 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009824 Py_ssize_t start;
9825 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009826 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
Jesus Ceaac451502011-04-20 17:09:23 +02009828 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9829 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 if (PyUnicode_READY(self) == -1)
9833 return NULL;
9834 if (PyUnicode_READY(substring) == -1)
9835 return NULL;
9836
9837 result = any_find_slice(
9838 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9839 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841
9842 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 if (result == -2)
9845 return NULL;
9846
Christian Heimes217cfd12007-12-02 14:31:20 +00009847 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848}
9849
9850static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009851unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009853 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9854 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857}
9858
Guido van Rossumc2504932007-09-18 19:42:40 +00009859/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009860 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009861static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009862unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Guido van Rossumc2504932007-09-18 19:42:40 +00009864 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009865 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (_PyUnicode_HASH(self) != -1)
9868 return _PyUnicode_HASH(self);
9869 if (PyUnicode_READY(self) == -1)
9870 return -1;
9871 len = PyUnicode_GET_LENGTH(self);
9872
9873 /* The hash function as a macro, gets expanded three times below. */
9874#define HASH(P) \
9875 x = (Py_uhash_t)*P << 7; \
9876 while (--len >= 0) \
9877 x = (1000003*x) ^ (Py_uhash_t)*P++;
9878
9879 switch (PyUnicode_KIND(self)) {
9880 case PyUnicode_1BYTE_KIND: {
9881 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9882 HASH(c);
9883 break;
9884 }
9885 case PyUnicode_2BYTE_KIND: {
9886 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9887 HASH(s);
9888 break;
9889 }
9890 default: {
9891 Py_UCS4 *l;
9892 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9893 "Impossible switch case in unicode_hash");
9894 l = PyUnicode_4BYTE_DATA(self);
9895 HASH(l);
9896 break;
9897 }
9898 }
9899 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9900
Guido van Rossumc2504932007-09-18 19:42:40 +00009901 if (x == -1)
9902 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009904 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009908PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009909 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009911Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912
9913static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009916 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009917 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009918 Py_ssize_t start;
9919 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920
Jesus Ceaac451502011-04-20 17:09:23 +02009921 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9922 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (PyUnicode_READY(self) == -1)
9926 return NULL;
9927 if (PyUnicode_READY(substring) == -1)
9928 return NULL;
9929
9930 result = any_find_slice(
9931 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9932 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009933 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
9935 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (result == -2)
9938 return NULL;
9939
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 if (result < 0) {
9941 PyErr_SetString(PyExc_ValueError, "substring not found");
9942 return NULL;
9943 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009944
Christian Heimes217cfd12007-12-02 14:31:20 +00009945 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946}
9947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009948PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009949 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009951Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009952at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
9954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009955unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 Py_ssize_t i, length;
9958 int kind;
9959 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960 int cased;
9961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (PyUnicode_READY(self) == -1)
9963 return NULL;
9964 length = PyUnicode_GET_LENGTH(self);
9965 kind = PyUnicode_KIND(self);
9966 data = PyUnicode_DATA(self);
9967
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 if (length == 1)
9970 return PyBool_FromLong(
9971 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009973 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009976
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 for (i = 0; i < length; i++) {
9979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009980
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9982 return PyBool_FromLong(0);
9983 else if (!cased && Py_UNICODE_ISLOWER(ch))
9984 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009986 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987}
9988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009989PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009992Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009993at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994
9995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009996unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 Py_ssize_t i, length;
9999 int kind;
10000 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001 int cased;
10002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (PyUnicode_READY(self) == -1)
10004 return NULL;
10005 length = PyUnicode_GET_LENGTH(self);
10006 kind = PyUnicode_KIND(self);
10007 data = PyUnicode_DATA(self);
10008
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (length == 1)
10011 return PyBool_FromLong(
10012 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010014 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010017
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 for (i = 0; i < length; i++) {
10020 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010021
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10023 return PyBool_FromLong(0);
10024 else if (!cased && Py_UNICODE_ISUPPER(ch))
10025 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010027 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028}
10029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010030PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010033Return True if S is a titlecased string and there is at least one\n\
10034character in S, i.e. upper- and titlecase characters may only\n\
10035follow uncased characters and lowercase characters only cased ones.\n\
10036Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037
10038static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010039unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 Py_ssize_t i, length;
10042 int kind;
10043 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 int cased, previous_is_cased;
10045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (PyUnicode_READY(self) == -1)
10047 return NULL;
10048 length = PyUnicode_GET_LENGTH(self);
10049 kind = PyUnicode_KIND(self);
10050 data = PyUnicode_DATA(self);
10051
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (length == 1) {
10054 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10055 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10056 (Py_UNICODE_ISUPPER(ch) != 0));
10057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010062
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063 cased = 0;
10064 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 for (i = 0; i < length; i++) {
10066 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010067
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10069 if (previous_is_cased)
10070 return PyBool_FromLong(0);
10071 previous_is_cased = 1;
10072 cased = 1;
10073 }
10074 else if (Py_UNICODE_ISLOWER(ch)) {
10075 if (!previous_is_cased)
10076 return PyBool_FromLong(0);
10077 previous_is_cased = 1;
10078 cased = 1;
10079 }
10080 else
10081 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010083 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084}
10085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010086PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010089Return True if all characters in S are whitespace\n\
10090and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091
10092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010093unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 Py_ssize_t i, length;
10096 int kind;
10097 void *data;
10098
10099 if (PyUnicode_READY(self) == -1)
10100 return NULL;
10101 length = PyUnicode_GET_LENGTH(self);
10102 kind = PyUnicode_KIND(self);
10103 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (length == 1)
10107 return PyBool_FromLong(
10108 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 for (i = 0; i < length; i++) {
10115 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010116 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010117 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010119 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120}
10121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010122PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010123 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010124\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010125Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010126and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010127
10128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010129unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 Py_ssize_t i, length;
10132 int kind;
10133 void *data;
10134
10135 if (PyUnicode_READY(self) == -1)
10136 return NULL;
10137 length = PyUnicode_GET_LENGTH(self);
10138 kind = PyUnicode_KIND(self);
10139 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010140
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (length == 1)
10143 return PyBool_FromLong(
10144 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010145
10146 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010148 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 for (i = 0; i < length; i++) {
10151 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010154 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010155}
10156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010157PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010158 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010159\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010160Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010161and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010162
10163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010164unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 int kind;
10167 void *data;
10168 Py_ssize_t len, i;
10169
10170 if (PyUnicode_READY(self) == -1)
10171 return NULL;
10172
10173 kind = PyUnicode_KIND(self);
10174 data = PyUnicode_DATA(self);
10175 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010176
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010177 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (len == 1) {
10179 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10180 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10181 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010182
10183 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 for (i = 0; i < len; i++) {
10188 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010189 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010190 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010191 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010192 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010193}
10194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010195PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010198Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010199False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
10201static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010202unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 Py_ssize_t i, length;
10205 int kind;
10206 void *data;
10207
10208 if (PyUnicode_READY(self) == -1)
10209 return NULL;
10210 length = PyUnicode_GET_LENGTH(self);
10211 kind = PyUnicode_KIND(self);
10212 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (length == 1)
10216 return PyBool_FromLong(
10217 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 for (i = 0; i < length; i++) {
10224 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010227 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010233Return True if all characters in S are digits\n\
10234and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
10236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010237unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 Py_ssize_t i, length;
10240 int kind;
10241 void *data;
10242
10243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
10245 length = PyUnicode_GET_LENGTH(self);
10246 kind = PyUnicode_KIND(self);
10247 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (length == 1) {
10251 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10252 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010255 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 for (i = 0; i < length; i++) {
10260 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010263 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264}
10265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010266PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010267 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010269Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010270False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271
10272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010273unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 Py_ssize_t i, length;
10276 int kind;
10277 void *data;
10278
10279 if (PyUnicode_READY(self) == -1)
10280 return NULL;
10281 length = PyUnicode_GET_LENGTH(self);
10282 kind = PyUnicode_KIND(self);
10283 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (length == 1)
10287 return PyBool_FromLong(
10288 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010290 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010292 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 for (i = 0; i < length; i++) {
10295 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010296 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010298 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299}
10300
Martin v. Löwis47383402007-08-15 07:32:56 +000010301int
10302PyUnicode_IsIdentifier(PyObject *self)
10303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 int kind;
10305 void *data;
10306 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010307 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (PyUnicode_READY(self) == -1) {
10310 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 }
10313
10314 /* Special case for empty strings */
10315 if (PyUnicode_GET_LENGTH(self) == 0)
10316 return 0;
10317 kind = PyUnicode_KIND(self);
10318 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010319
10320 /* PEP 3131 says that the first character must be in
10321 XID_Start and subsequent characters in XID_Continue,
10322 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010323 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010324 letters, digits, underscore). However, given the current
10325 definition of XID_Start and XID_Continue, it is sufficient
10326 to check just for these, except that _ must be allowed
10327 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010329 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010330 return 0;
10331
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010332 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010334 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010335 return 1;
10336}
10337
10338PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010340\n\
10341Return True if S is a valid identifier according\n\
10342to the language definition.");
10343
10344static PyObject*
10345unicode_isidentifier(PyObject *self)
10346{
10347 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10348}
10349
Georg Brandl559e5d72008-06-11 18:37:52 +000010350PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010352\n\
10353Return True if all characters in S are considered\n\
10354printable in repr() or S is empty, False otherwise.");
10355
10356static PyObject*
10357unicode_isprintable(PyObject *self)
10358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 Py_ssize_t i, length;
10360 int kind;
10361 void *data;
10362
10363 if (PyUnicode_READY(self) == -1)
10364 return NULL;
10365 length = PyUnicode_GET_LENGTH(self);
10366 kind = PyUnicode_KIND(self);
10367 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010368
10369 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (length == 1)
10371 return PyBool_FromLong(
10372 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 for (i = 0; i < length; i++) {
10375 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010376 Py_RETURN_FALSE;
10377 }
10378 }
10379 Py_RETURN_TRUE;
10380}
10381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010382PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010383 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384\n\
10385Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010386iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387
10388static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010389unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010391 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392}
10393
Martin v. Löwis18e16552006-02-15 17:27:45 +000010394static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395unicode_length(PyUnicodeObject *self)
10396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 if (PyUnicode_READY(self) == -1)
10398 return -1;
10399 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400}
10401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010402PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010405Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010406done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
10408static PyObject *
10409unicode_ljust(PyUnicodeObject *self, PyObject *args)
10410{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010411 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 Py_UCS4 fillchar = ' ';
10413
10414 if (PyUnicode_READY(self) == -1)
10415 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010416
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010417 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418 return NULL;
10419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421 Py_INCREF(self);
10422 return (PyObject*) self;
10423 }
10424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426}
10427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010428PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010431Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
/* str.lower(): delegate to fixup() with the fixlower callback and return
   whatever fixup() produces. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
10438
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings passed to PyArg_ParseTuple by do_argstrip();
   the array is indexed by the selectors above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10447
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010448/* externally visible for str.strip(unicode) */
10449PyObject *
10450_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 void *data;
10453 int kind;
10454 Py_ssize_t i, j, len;
10455 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10458 return NULL;
10459
10460 kind = PyUnicode_KIND(self);
10461 data = PyUnicode_DATA(self);
10462 len = PyUnicode_GET_LENGTH(self);
10463 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10464 PyUnicode_DATA(sepobj),
10465 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 i = 0;
10468 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 while (i < len &&
10470 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 i++;
10472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010473 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010474
Benjamin Peterson14339b62009-01-31 16:36:08 +000010475 j = len;
10476 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 do {
10478 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 } while (j >= i &&
10480 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010481 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010482 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010483
Victor Stinner12bab6d2011-10-01 01:53:49 +020010484 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485}
10486
10487PyObject*
10488PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10489{
10490 unsigned char *data;
10491 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010492 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493
Victor Stinnerde636f32011-10-01 03:55:54 +020010494 if (PyUnicode_READY(self) == -1)
10495 return NULL;
10496
10497 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10498
Victor Stinner12bab6d2011-10-01 01:53:49 +020010499 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010501 if (PyUnicode_CheckExact(self)) {
10502 Py_INCREF(self);
10503 return self;
10504 }
10505 else
10506 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 }
10508
Victor Stinner12bab6d2011-10-01 01:53:49 +020010509 length = end - start;
10510 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010511 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512
Victor Stinnerde636f32011-10-01 03:55:54 +020010513 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010514 PyErr_SetString(PyExc_IndexError, "string index out of range");
10515 return NULL;
10516 }
10517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 kind = PyUnicode_KIND(self);
10519 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010520 return PyUnicode_FromKindAndData(kind,
10521 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010522 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
10525static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 int kind;
10529 void *data;
10530 Py_ssize_t len, i, j;
10531
10532 if (PyUnicode_READY(self) == -1)
10533 return NULL;
10534
10535 kind = PyUnicode_KIND(self);
10536 data = PyUnicode_DATA(self);
10537 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010538
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539 i = 0;
10540 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010542 i++;
10543 }
10544 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010545
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 j = len;
10547 if (striptype != LEFTSTRIP) {
10548 do {
10549 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010551 j++;
10552 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553
Victor Stinner12bab6d2011-10-01 01:53:49 +020010554 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555}
10556
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557
10558static PyObject *
10559do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10560{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10564 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565
Benjamin Peterson14339b62009-01-31 16:36:08 +000010566 if (sep != NULL && sep != Py_None) {
10567 if (PyUnicode_Check(sep))
10568 return _PyUnicode_XStrip(self, striptype, sep);
10569 else {
10570 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 "%s arg must be None or str",
10572 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 return NULL;
10574 }
10575 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010576
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578}
10579
10580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010581PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583\n\
10584Return a copy of the string S with leading and trailing\n\
10585whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010586If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010587
10588static PyObject *
10589unicode_strip(PyUnicodeObject *self, PyObject *args)
10590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010591 if (PyTuple_GET_SIZE(args) == 0)
10592 return do_strip(self, BOTHSTRIP); /* Common case */
10593 else
10594 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010595}
10596
10597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010598PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010600\n\
10601Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010602If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010603
10604static PyObject *
10605unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10606{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010607 if (PyTuple_GET_SIZE(args) == 0)
10608 return do_strip(self, LEFTSTRIP); /* Common case */
10609 else
10610 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010611}
10612
10613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010614PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010616\n\
10617Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010618If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010619
10620static PyObject *
10621unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10622{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010623 if (PyTuple_GET_SIZE(args) == 0)
10624 return do_strip(self, RIGHTSTRIP); /* Common case */
10625 else
10626 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010627}
10628
10629
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010631unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632{
10633 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
Georg Brandl222de0f2009-04-12 12:01:50 +000010636 if (len < 1) {
10637 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010638 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640
Tim Peters7a29bd52001-09-12 03:03:31 +000010641 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 /* no repeat, return original string */
10643 Py_INCREF(str);
10644 return (PyObject*) str;
10645 }
Tim Peters8f422462000-09-09 06:13:41 +000010646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (PyUnicode_READY(str) == -1)
10648 return NULL;
10649
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010650 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010651 PyErr_SetString(PyExc_OverflowError,
10652 "repeated string is too long");
10653 return NULL;
10654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 if (!u)
10659 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010660 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (PyUnicode_GET_LENGTH(str) == 1) {
10663 const int kind = PyUnicode_KIND(str);
10664 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10665 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010666 if (kind == PyUnicode_1BYTE_KIND)
10667 memset(to, (unsigned char)fill_char, len);
10668 else {
10669 for (n = 0; n < len; ++n)
10670 PyUnicode_WRITE(kind, to, n, fill_char);
10671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 }
10673 else {
10674 /* number of characters copied this far */
10675 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10676 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10677 char *to = (char *) PyUnicode_DATA(u);
10678 Py_MEMCPY(to, PyUnicode_DATA(str),
10679 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 n = (done <= nchars-done) ? done : nchars-done;
10682 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685 }
10686
10687 return (PyObject*) u;
10688}
10689
Alexander Belopolsky40018472011-02-26 01:02:56 +000010690PyObject *
10691PyUnicode_Replace(PyObject *obj,
10692 PyObject *subobj,
10693 PyObject *replobj,
10694 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695{
10696 PyObject *self;
10697 PyObject *str1;
10698 PyObject *str2;
10699 PyObject *result;
10700
10701 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010702 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010705 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 Py_DECREF(self);
10707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708 }
10709 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010710 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 Py_DECREF(self);
10712 Py_DECREF(str1);
10713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 Py_DECREF(self);
10717 Py_DECREF(str1);
10718 Py_DECREF(str2);
10719 return result;
10720}
10721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010722PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010723 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724\n\
10725Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010726old replaced by new. If the optional argument count is\n\
10727given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728
10729static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 PyObject *str1;
10733 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010734 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735 PyObject *result;
10736
Martin v. Löwis18e16552006-02-15 17:27:45 +000010737 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 str1 = PyUnicode_FromObject(str1);
10742 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10743 return NULL;
10744 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010745 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 Py_DECREF(str1);
10747 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
10750 result = replace(self, str1, str2, maxcount);
10751
10752 Py_DECREF(str1);
10753 Py_DECREF(str2);
10754 return result;
10755}
10756
/* Build the repr() of a string: its contents wrapped in quotes, with
   the quote character, backslashes, tab/newline/carriage return, ASCII
   control characters and non-printable characters written as escape
   sequences.

   Works in two passes: the first computes the exact output length, which
   quote to use and the largest character that will be copied verbatim;
   the second writes into a string of that size created with
   PyUnicode_New().  Returns NULL if the input cannot be made ready or
   the allocation fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted as one character each here; the extra
           backslashes are added below once the quote style is known. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* Copied verbatim, so it can raise the maximum
                   character of the output. */
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            /* Only single quotes present: switch to double quotes so
               nothing needs escaping. */
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Write the opening and the closing quote up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    /* o is the next output position; position 0 holds the opening quote. */
    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ch >= 0x10000) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    return repr;
}
10906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909\n\
10910Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010911such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912arguments start and end are interpreted as in slice notation.\n\
10913\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010914Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
10916static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918{
Jesus Ceaac451502011-04-20 17:09:23 +020010919 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010920 Py_ssize_t start;
10921 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Jesus Ceaac451502011-04-20 17:09:23 +020010924 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10925 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (PyUnicode_READY(self) == -1)
10929 return NULL;
10930 if (PyUnicode_READY(substring) == -1)
10931 return NULL;
10932
10933 result = any_find_slice(
10934 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10935 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
10938 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 if (result == -2)
10941 return NULL;
10942
Christian Heimes217cfd12007-12-02 14:31:20 +000010943 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944}
10945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010949Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
10951static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953{
Jesus Ceaac451502011-04-20 17:09:23 +020010954 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010955 Py_ssize_t start;
10956 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
Jesus Ceaac451502011-04-20 17:09:23 +020010959 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10960 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (PyUnicode_READY(self) == -1)
10964 return NULL;
10965 if (PyUnicode_READY(substring) == -1)
10966 return NULL;
10967
10968 result = any_find_slice(
10969 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10970 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
10973 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (result == -2)
10976 return NULL;
10977
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 if (result < 0) {
10979 PyErr_SetString(PyExc_ValueError, "substring not found");
10980 return NULL;
10981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982
Christian Heimes217cfd12007-12-02 14:31:20 +000010983 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984}
10985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010986PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010989Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010990done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
10992static PyObject *
10993unicode_rjust(PyUnicodeObject *self, PyObject *args)
10994{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010995 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 Py_UCS4 fillchar = ' ';
10997
Victor Stinnere9a29352011-10-01 02:14:59 +020010998 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011000
Victor Stinnere9a29352011-10-01 02:14:59 +020011001 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 return NULL;
11003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 Py_INCREF(self);
11006 return (PyObject*) self;
11007 }
11008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010}
11011
Alexander Belopolsky40018472011-02-26 01:02:56 +000011012PyObject *
11013PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014{
11015 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011016
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 s = PyUnicode_FromObject(s);
11018 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 if (sep != NULL) {
11021 sep = PyUnicode_FromObject(sep);
11022 if (sep == NULL) {
11023 Py_DECREF(s);
11024 return NULL;
11025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 }
11027
11028 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11029
11030 Py_DECREF(s);
11031 Py_XDECREF(sep);
11032 return result;
11033}
11034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011035PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037\n\
11038Return a list of the words in S, using sep as the\n\
11039delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011040splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011041whitespace string is a separator and empty strings are\n\
11042removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
11044static PyObject*
11045unicode_split(PyUnicodeObject *self, PyObject *args)
11046{
11047 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011048 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
Martin v. Löwis18e16552006-02-15 17:27:45 +000011050 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 return NULL;
11052
11053 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Thomas Wouters477c8d52006-05-27 19:21:47 +000011061PyObject *
11062PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11063{
11064 PyObject* str_obj;
11065 PyObject* sep_obj;
11066 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 int kind1, kind2, kind;
11068 void *buf1 = NULL, *buf2 = NULL;
11069 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070
11071 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011072 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011073 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011074 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011076 Py_DECREF(str_obj);
11077 return NULL;
11078 }
11079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 kind1 = PyUnicode_KIND(str_in);
11081 kind2 = PyUnicode_KIND(sep_obj);
11082 kind = kind1 > kind2 ? kind1 : kind2;
11083 buf1 = PyUnicode_DATA(str_in);
11084 if (kind1 != kind)
11085 buf1 = _PyUnicode_AsKind(str_in, kind);
11086 if (!buf1)
11087 goto onError;
11088 buf2 = PyUnicode_DATA(sep_obj);
11089 if (kind2 != kind)
11090 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11091 if (!buf2)
11092 goto onError;
11093 len1 = PyUnicode_GET_LENGTH(str_obj);
11094 len2 = PyUnicode_GET_LENGTH(sep_obj);
11095
11096 switch(PyUnicode_KIND(str_in)) {
11097 case PyUnicode_1BYTE_KIND:
11098 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11099 break;
11100 case PyUnicode_2BYTE_KIND:
11101 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11102 break;
11103 case PyUnicode_4BYTE_KIND:
11104 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11105 break;
11106 default:
11107 assert(0);
11108 out = 0;
11109 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011110
11111 Py_DECREF(sep_obj);
11112 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (kind1 != kind)
11114 PyMem_Free(buf1);
11115 if (kind2 != kind)
11116 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
11118 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 onError:
11120 Py_DECREF(sep_obj);
11121 Py_DECREF(str_obj);
11122 if (kind1 != kind && buf1)
11123 PyMem_Free(buf1);
11124 if (kind2 != kind && buf2)
11125 PyMem_Free(buf2);
11126 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011127}
11128
11129
11130PyObject *
11131PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11132{
11133 PyObject* str_obj;
11134 PyObject* sep_obj;
11135 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 int kind1, kind2, kind;
11137 void *buf1 = NULL, *buf2 = NULL;
11138 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139
11140 str_obj = PyUnicode_FromObject(str_in);
11141 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143 sep_obj = PyUnicode_FromObject(sep_in);
11144 if (!sep_obj) {
11145 Py_DECREF(str_obj);
11146 return NULL;
11147 }
11148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 kind1 = PyUnicode_KIND(str_in);
11150 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011151 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 buf1 = PyUnicode_DATA(str_in);
11153 if (kind1 != kind)
11154 buf1 = _PyUnicode_AsKind(str_in, kind);
11155 if (!buf1)
11156 goto onError;
11157 buf2 = PyUnicode_DATA(sep_obj);
11158 if (kind2 != kind)
11159 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11160 if (!buf2)
11161 goto onError;
11162 len1 = PyUnicode_GET_LENGTH(str_obj);
11163 len2 = PyUnicode_GET_LENGTH(sep_obj);
11164
11165 switch(PyUnicode_KIND(str_in)) {
11166 case PyUnicode_1BYTE_KIND:
11167 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11168 break;
11169 case PyUnicode_2BYTE_KIND:
11170 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11171 break;
11172 case PyUnicode_4BYTE_KIND:
11173 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11174 break;
11175 default:
11176 assert(0);
11177 out = 0;
11178 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
11180 Py_DECREF(sep_obj);
11181 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (kind1 != kind)
11183 PyMem_Free(buf1);
11184 if (kind2 != kind)
11185 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186
11187 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 onError:
11189 Py_DECREF(sep_obj);
11190 Py_DECREF(str_obj);
11191 if (kind1 != kind && buf1)
11192 PyMem_Free(buf1);
11193 if (kind2 != kind && buf2)
11194 PyMem_Free(buf2);
11195 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011196}
11197
11198PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011200\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011201Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011202the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011203found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011204
11205static PyObject*
11206unicode_partition(PyUnicodeObject *self, PyObject *separator)
11207{
11208 return PyUnicode_Partition((PyObject *)self, separator);
11209}
11210
11211PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011212 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011213\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011214Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011215the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011216separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011217
11218static PyObject*
11219unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11220{
11221 return PyUnicode_RPartition((PyObject *)self, separator);
11222}
11223
Alexander Belopolsky40018472011-02-26 01:02:56 +000011224PyObject *
11225PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011226{
11227 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011228
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011229 s = PyUnicode_FromObject(s);
11230 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011231 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 if (sep != NULL) {
11233 sep = PyUnicode_FromObject(sep);
11234 if (sep == NULL) {
11235 Py_DECREF(s);
11236 return NULL;
11237 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011238 }
11239
11240 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11241
11242 Py_DECREF(s);
11243 Py_XDECREF(sep);
11244 return result;
11245}
11246
11247PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011249\n\
11250Return a list of the words in S, using sep as the\n\
11251delimiter string, starting at the end of the string and\n\
11252working to the front. If maxsplit is given, at most maxsplit\n\
11253splits are done. If sep is not specified, any whitespace string\n\
11254is a separator.");
11255
11256static PyObject*
11257unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11258{
11259 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011260 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011261
Martin v. Löwis18e16552006-02-15 17:27:45 +000011262 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011263 return NULL;
11264
11265 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011267 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011269 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011271}
11272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011273PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275\n\
11276Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011277Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011281unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011283 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011284 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011286 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11287 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 return NULL;
11289
Guido van Rossum86662912000-04-11 15:38:46 +000011290 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291}
11292
11293static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011294PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295{
Walter Dörwald346737f2007-05-31 10:44:43 +000011296 if (PyUnicode_CheckExact(self)) {
11297 Py_INCREF(self);
11298 return self;
11299 } else
11300 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011301 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302}
11303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306\n\
11307Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309
11310static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011311unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 return fixup(self, fixswapcase);
11314}
11315
Georg Brandlceee0772007-11-27 23:48:05 +000011316PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011318\n\
11319Return a translation table usable for str.translate().\n\
11320If there is only one argument, it must be a dictionary mapping Unicode\n\
11321ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011322Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011323If there are two arguments, they must be strings of equal length, and\n\
11324in the resulting dictionary, each character in x will be mapped to the\n\
11325character at the same position in y. If there is a third argument, it\n\
11326must be a string, whose characters will be mapped to None in the result.");
11327
11328static PyObject*
11329unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11330{
11331 PyObject *x, *y = NULL, *z = NULL;
11332 PyObject *new = NULL, *key, *value;
11333 Py_ssize_t i = 0;
11334 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011335
Georg Brandlceee0772007-11-27 23:48:05 +000011336 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11337 return NULL;
11338 new = PyDict_New();
11339 if (!new)
11340 return NULL;
11341 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 int x_kind, y_kind, z_kind;
11343 void *x_data, *y_data, *z_data;
11344
Georg Brandlceee0772007-11-27 23:48:05 +000011345 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011346 if (!PyUnicode_Check(x)) {
11347 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11348 "be a string if there is a second argument");
11349 goto err;
11350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011352 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11353 "arguments must have equal length");
11354 goto err;
11355 }
11356 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 x_kind = PyUnicode_KIND(x);
11358 y_kind = PyUnicode_KIND(y);
11359 x_data = PyUnicode_DATA(x);
11360 y_data = PyUnicode_DATA(y);
11361 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11362 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11363 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011364 if (!key || !value)
11365 goto err;
11366 res = PyDict_SetItem(new, key, value);
11367 Py_DECREF(key);
11368 Py_DECREF(value);
11369 if (res < 0)
11370 goto err;
11371 }
11372 /* create entries for deleting chars in z */
11373 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 z_kind = PyUnicode_KIND(z);
11375 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011376 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011378 if (!key)
11379 goto err;
11380 res = PyDict_SetItem(new, key, Py_None);
11381 Py_DECREF(key);
11382 if (res < 0)
11383 goto err;
11384 }
11385 }
11386 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 int kind;
11388 void *data;
11389
Georg Brandlceee0772007-11-27 23:48:05 +000011390 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011391 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011392 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11393 "to maketrans it must be a dict");
11394 goto err;
11395 }
11396 /* copy entries into the new dict, converting string keys to int keys */
11397 while (PyDict_Next(x, &i, &key, &value)) {
11398 if (PyUnicode_Check(key)) {
11399 /* convert string keys to integer keys */
11400 PyObject *newkey;
11401 if (PyUnicode_GET_SIZE(key) != 1) {
11402 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11403 "table must be of length 1");
11404 goto err;
11405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 kind = PyUnicode_KIND(key);
11407 data = PyUnicode_DATA(key);
11408 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011409 if (!newkey)
11410 goto err;
11411 res = PyDict_SetItem(new, newkey, value);
11412 Py_DECREF(newkey);
11413 if (res < 0)
11414 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011415 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011416 /* just keep integer keys */
11417 if (PyDict_SetItem(new, key, value) < 0)
11418 goto err;
11419 } else {
11420 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11421 "be strings or integers");
11422 goto err;
11423 }
11424 }
11425 }
11426 return new;
11427 err:
11428 Py_DECREF(new);
11429 return NULL;
11430}
11431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434\n\
11435Return a copy of the string S, where all characters have been mapped\n\
11436through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011437Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011438Unmapped characters are left untouched. Characters mapped to None\n\
11439are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445}
11446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011447PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011450Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
11452static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011453unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 return fixup(self, fixupper);
11456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011461Pad a numeric string S with zeros on the left, to fill a field\n\
11462of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject *
11465unicode_zfill(PyUnicodeObject *self, PyObject *args)
11466{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011467 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011469 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 int kind;
11471 void *data;
11472 Py_UCS4 chr;
11473
11474 if (PyUnicode_READY(self) == -1)
11475 return NULL;
11476
Martin v. Löwis18e16552006-02-15 17:27:45 +000011477 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 return NULL;
11479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011481 if (PyUnicode_CheckExact(self)) {
11482 Py_INCREF(self);
11483 return (PyObject*) self;
11484 }
11485 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011486 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 }
11488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
11491 u = pad(self, fill, 0, '0');
11492
Walter Dörwald068325e2002-04-15 13:36:47 +000011493 if (u == NULL)
11494 return NULL;
11495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 kind = PyUnicode_KIND(u);
11497 data = PyUnicode_DATA(u);
11498 chr = PyUnicode_READ(kind, data, fill);
11499
11500 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 PyUnicode_WRITE(kind, data, 0, chr);
11503 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 }
11505
11506 return (PyObject*) u;
11507}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011517PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011520Return True if S starts with the specified prefix, False otherwise.\n\
11521With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011522With optional end, stop comparing S at that position.\n\
11523prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject *
11526unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011529 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011531 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011532 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011533 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
Jesus Ceaac451502011-04-20 17:09:23 +020011535 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011537 if (PyTuple_Check(subobj)) {
11538 Py_ssize_t i;
11539 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11540 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 if (substring == NULL)
11543 return NULL;
11544 result = tailmatch(self, substring, start, end, -1);
11545 Py_DECREF(substring);
11546 if (result) {
11547 Py_RETURN_TRUE;
11548 }
11549 }
11550 /* nothing matched */
11551 Py_RETURN_FALSE;
11552 }
11553 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011554 if (substring == NULL) {
11555 if (PyErr_ExceptionMatches(PyExc_TypeError))
11556 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11557 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011559 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011560 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011562 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563}
11564
11565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011566PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011569Return True if S ends with the specified suffix, False otherwise.\n\
11570With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011571With optional end, stop comparing S at that position.\n\
11572suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
11574static PyObject *
11575unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011578 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011581 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011582 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
Jesus Ceaac451502011-04-20 17:09:23 +020011584 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011586 if (PyTuple_Check(subobj)) {
11587 Py_ssize_t i;
11588 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11589 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011591 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011593 result = tailmatch(self, substring, start, end, +1);
11594 Py_DECREF(substring);
11595 if (result) {
11596 Py_RETURN_TRUE;
11597 }
11598 }
11599 Py_RETURN_FALSE;
11600 }
11601 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011602 if (substring == NULL) {
11603 if (PyErr_ExceptionMatches(PyExc_TypeError))
11604 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11605 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011607 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011608 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011610 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011614
11615PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011617\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011618Return a formatted version of S, using substitutions from args and kwargs.\n\
11619The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011620
Eric Smith27bbca62010-11-04 17:06:58 +000011621PyDoc_STRVAR(format_map__doc__,
11622 "S.format_map(mapping) -> str\n\
11623\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011624Return a formatted version of S, using substitutions from mapping.\n\
11625The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011626
Eric Smith4a7d76d2008-05-30 18:10:19 +000011627static PyObject *
11628unicode__format__(PyObject* self, PyObject* args)
11629{
11630 PyObject *format_spec;
11631
11632 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11633 return NULL;
11634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11636 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011637}
11638
Eric Smith8c663262007-08-25 02:26:07 +000011639PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011641\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011642Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011643
11644static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011645unicode__sizeof__(PyUnicodeObject *v)
11646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 Py_ssize_t size;
11648
11649 /* If it's a compact object, account for base structure +
11650 character data. */
11651 if (PyUnicode_IS_COMPACT_ASCII(v))
11652 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11653 else if (PyUnicode_IS_COMPACT(v))
11654 size = sizeof(PyCompactUnicodeObject) +
11655 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11656 else {
11657 /* If it is a two-block object, account for base object, and
11658 for character block if present. */
11659 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011660 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 size += (PyUnicode_GET_LENGTH(v) + 1) *
11662 PyUnicode_CHARACTER_SIZE(v);
11663 }
11664 /* If the wstr pointer is present, account for it unless it is shared
11665 with the data pointer. Since PyUnicode_DATA will crash if the object
11666 is not ready, check whether it's either not ready (in which case the
11667 data is entirely in wstr) or if the data is not shared. */
11668 if (_PyUnicode_WSTR(v) &&
11669 (!PyUnicode_IS_READY(v) ||
11670 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11671 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011672 if (!PyUnicode_IS_COMPACT_ASCII(v)
11673 && _PyUnicode_UTF8(v)
11674 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11675 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676
11677 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011678}
11679
11680PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011682
11683static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011684unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011685{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011686 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (!copy)
11688 return NULL;
11689 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011690}
11691
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692static PyMethodDef unicode_methods[] = {
11693
11694 /* Order is according to common usage: often used methods should
11695 appear first, since lookup is done sequentially. */
11696
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011697 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011698 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11699 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011700 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011701 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11702 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11703 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11704 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11705 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11706 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11707 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011709 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11710 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11711 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011713 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11714 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11715 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011717 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011718 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011720 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11721 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11722 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11723 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11724 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11725 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11726 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11727 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11728 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11729 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11730 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11731 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11732 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11733 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011734 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011735 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011736 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011737 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011738 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011739 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011740 {"maketrans", (PyCFunction) unicode_maketrans,
11741 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011742 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011743#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011744 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745#endif
11746
11747#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011748 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011749 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750#endif
11751
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 {NULL, NULL}
11754};
11755
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011756static PyObject *
11757unicode_mod(PyObject *v, PyObject *w)
11758{
Brian Curtindfc80e32011-08-10 20:28:54 -050011759 if (!PyUnicode_Check(v))
11760 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011762}
11763
11764static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 0, /*nb_add*/
11766 0, /*nb_subtract*/
11767 0, /*nb_multiply*/
11768 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011769};
11770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011772 (lenfunc) unicode_length, /* sq_length */
11773 PyUnicode_Concat, /* sq_concat */
11774 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11775 (ssizeargfunc) unicode_getitem, /* sq_item */
11776 0, /* sq_slice */
11777 0, /* sq_ass_item */
11778 0, /* sq_ass_slice */
11779 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780};
11781
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011782static PyObject*
11783unicode_subscript(PyUnicodeObject* self, PyObject* item)
11784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (PyUnicode_READY(self) == -1)
11786 return NULL;
11787
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011788 if (PyIndex_Check(item)) {
11789 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011790 if (i == -1 && PyErr_Occurred())
11791 return NULL;
11792 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011794 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011795 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011796 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011798 Py_UNICODE* result_buf;
11799 PyObject* result;
11800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011803 return NULL;
11804 }
11805
11806 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 return PyUnicode_New(0, 0);
11808 } else if (start == 0 && step == 1 &&
11809 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011810 PyUnicode_CheckExact(self)) {
11811 Py_INCREF(self);
11812 return (PyObject *)self;
11813 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011814 return PyUnicode_Substring((PyObject*)self,
11815 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011816 } else {
11817 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011818 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11819 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011820
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 if (result_buf == NULL)
11822 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011823
11824 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11825 result_buf[i] = source_buf[cur];
11826 }
Tim Petersced69f82003-09-16 20:30:58 +000011827
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011828 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011829 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011830 return result;
11831 }
11832 } else {
11833 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11834 return NULL;
11835 }
11836}
11837
11838static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011839 (lenfunc)unicode_length, /* mp_length */
11840 (binaryfunc)unicode_subscript, /* mp_subscript */
11841 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011842};
11843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845/* Helpers for PyUnicode_Format() */
11846
11847static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011848getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011850 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 (*p_argidx)++;
11853 if (arglen < 0)
11854 return args;
11855 else
11856 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 }
11858 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 return NULL;
11861}
11862
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011863/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011865static PyObject *
11866formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011868 char *p;
11869 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 x = PyFloat_AsDouble(v);
11873 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011874 return NULL;
11875
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011878
Eric Smith0923d1d2009-04-16 20:16:10 +000011879 p = PyOS_double_to_string(x, type, prec,
11880 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011881 if (p == NULL)
11882 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011884 PyMem_Free(p);
11885 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
Tim Peters38fd5b62000-09-21 05:43:11 +000011888static PyObject*
11889formatlong(PyObject *val, int flags, int prec, int type)
11890{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891 char *buf;
11892 int len;
11893 PyObject *str; /* temporary string object. */
11894 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011895
Benjamin Peterson14339b62009-01-31 16:36:08 +000011896 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11897 if (!str)
11898 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011900 Py_DECREF(str);
11901 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011902}
11903
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011906 size_t buflen,
11907 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011909 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011910 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_GET_LENGTH(v) == 1) {
11912 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 buf[1] = '\0';
11914 return 1;
11915 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 goto onError;
11917 }
11918 else {
11919 /* Integer input truncated to a character */
11920 long x;
11921 x = PyLong_AsLong(v);
11922 if (x == -1 && PyErr_Occurred())
11923 goto onError;
11924
11925 if (x < 0 || x > 0x10ffff) {
11926 PyErr_SetString(PyExc_OverflowError,
11927 "%c arg not in range(0x110000)");
11928 return -1;
11929 }
11930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011932 buf[1] = '\0';
11933 return 1;
11934 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011935
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011937 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011939 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940}
11941
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011946
Alexander Belopolsky40018472011-02-26 01:02:56 +000011947PyObject *
11948PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 void *fmt;
11951 int fmtkind;
11952 PyObject *result;
11953 Py_UCS4 *res, *res0;
11954 Py_UCS4 max;
11955 int kind;
11956 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011960
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 PyErr_BadInternalCall();
11963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11966 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 fmt = PyUnicode_DATA(uformat);
11969 fmtkind = PyUnicode_KIND(uformat);
11970 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11971 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
11973 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11975 if (res0 == NULL) {
11976 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 arglen = PyTuple_Size(args);
11982 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 }
11984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 arglen = -1;
11986 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011988 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011989 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
11992 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 if (--rescnt < 0) {
11995 rescnt = fmtcnt + 100;
11996 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11998 if (res0 == NULL){
11999 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 }
12002 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012006 }
12007 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 /* Got a format specifier */
12009 int flags = 0;
12010 Py_ssize_t width = -1;
12011 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 Py_UCS4 c = '\0';
12013 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 int isnumok;
12015 PyObject *v = NULL;
12016 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 void *pbuf;
12018 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 Py_ssize_t len, len1;
12021 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 fmtpos++;
12024 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12025 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 Py_ssize_t keylen;
12027 PyObject *key;
12028 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012029
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 if (dict == NULL) {
12031 PyErr_SetString(PyExc_TypeError,
12032 "format requires a mapping");
12033 goto onError;
12034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 /* Skip over balanced parentheses */
12039 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 if (fmtcnt < 0 || pcount > 0) {
12048 PyErr_SetString(PyExc_ValueError,
12049 "incomplete format key");
12050 goto onError;
12051 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012052 key = PyUnicode_Substring((PyObject*)uformat,
12053 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 if (key == NULL)
12055 goto onError;
12056 if (args_owned) {
12057 Py_DECREF(args);
12058 args_owned = 0;
12059 }
12060 args = PyObject_GetItem(dict, key);
12061 Py_DECREF(key);
12062 if (args == NULL) {
12063 goto onError;
12064 }
12065 args_owned = 1;
12066 arglen = -1;
12067 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 case '-': flags |= F_LJUST; continue;
12072 case '+': flags |= F_SIGN; continue;
12073 case ' ': flags |= F_BLANK; continue;
12074 case '#': flags |= F_ALT; continue;
12075 case '0': flags |= F_ZERO; continue;
12076 }
12077 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012078 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 if (c == '*') {
12080 v = getnextarg(args, arglen, &argidx);
12081 if (v == NULL)
12082 goto onError;
12083 if (!PyLong_Check(v)) {
12084 PyErr_SetString(PyExc_TypeError,
12085 "* wants int");
12086 goto onError;
12087 }
12088 width = PyLong_AsLong(v);
12089 if (width == -1 && PyErr_Occurred())
12090 goto onError;
12091 if (width < 0) {
12092 flags |= F_LJUST;
12093 width = -width;
12094 }
12095 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 }
12098 else if (c >= '0' && c <= '9') {
12099 width = c - '0';
12100 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 if (c < '0' || c > '9')
12103 break;
12104 if ((width*10) / 10 != width) {
12105 PyErr_SetString(PyExc_ValueError,
12106 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 }
12109 width = width*10 + (c - '0');
12110 }
12111 }
12112 if (c == '.') {
12113 prec = 0;
12114 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 if (c == '*') {
12117 v = getnextarg(args, arglen, &argidx);
12118 if (v == NULL)
12119 goto onError;
12120 if (!PyLong_Check(v)) {
12121 PyErr_SetString(PyExc_TypeError,
12122 "* wants int");
12123 goto onError;
12124 }
12125 prec = PyLong_AsLong(v);
12126 if (prec == -1 && PyErr_Occurred())
12127 goto onError;
12128 if (prec < 0)
12129 prec = 0;
12130 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 }
12133 else if (c >= '0' && c <= '9') {
12134 prec = c - '0';
12135 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 if (c < '0' || c > '9')
12138 break;
12139 if ((prec*10) / 10 != prec) {
12140 PyErr_SetString(PyExc_ValueError,
12141 "prec too big");
12142 goto onError;
12143 }
12144 prec = prec*10 + (c - '0');
12145 }
12146 }
12147 } /* prec */
12148 if (fmtcnt >= 0) {
12149 if (c == 'h' || c == 'l' || c == 'L') {
12150 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 }
12153 }
12154 if (fmtcnt < 0) {
12155 PyErr_SetString(PyExc_ValueError,
12156 "incomplete format");
12157 goto onError;
12158 }
12159 if (c != '%') {
12160 v = getnextarg(args, arglen, &argidx);
12161 if (v == NULL)
12162 goto onError;
12163 }
12164 sign = 0;
12165 fill = ' ';
12166 switch (c) {
12167
12168 case '%':
12169 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 len = 1;
12174 break;
12175
12176 case 's':
12177 case 'r':
12178 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012179 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 temp = v;
12181 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012182 }
12183 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 if (c == 's')
12185 temp = PyObject_Str(v);
12186 else if (c == 'r')
12187 temp = PyObject_Repr(v);
12188 else
12189 temp = PyObject_ASCII(v);
12190 if (temp == NULL)
12191 goto onError;
12192 if (PyUnicode_Check(temp))
12193 /* nothing to do */;
12194 else {
12195 Py_DECREF(temp);
12196 PyErr_SetString(PyExc_TypeError,
12197 "%s argument has non-string str()");
12198 goto onError;
12199 }
12200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (PyUnicode_READY(temp) == -1) {
12202 Py_CLEAR(temp);
12203 goto onError;
12204 }
12205 pbuf = PyUnicode_DATA(temp);
12206 kind = PyUnicode_KIND(temp);
12207 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 if (prec >= 0 && len > prec)
12209 len = prec;
12210 break;
12211
12212 case 'i':
12213 case 'd':
12214 case 'u':
12215 case 'o':
12216 case 'x':
12217 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 isnumok = 0;
12219 if (PyNumber_Check(v)) {
12220 PyObject *iobj=NULL;
12221
12222 if (PyLong_Check(v)) {
12223 iobj = v;
12224 Py_INCREF(iobj);
12225 }
12226 else {
12227 iobj = PyNumber_Long(v);
12228 }
12229 if (iobj!=NULL) {
12230 if (PyLong_Check(iobj)) {
12231 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012232 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 Py_DECREF(iobj);
12234 if (!temp)
12235 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (PyUnicode_READY(temp) == -1) {
12237 Py_CLEAR(temp);
12238 goto onError;
12239 }
12240 pbuf = PyUnicode_DATA(temp);
12241 kind = PyUnicode_KIND(temp);
12242 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 sign = 1;
12244 }
12245 else {
12246 Py_DECREF(iobj);
12247 }
12248 }
12249 }
12250 if (!isnumok) {
12251 PyErr_Format(PyExc_TypeError,
12252 "%%%c format: a number is required, "
12253 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12254 goto onError;
12255 }
12256 if (flags & F_ZERO)
12257 fill = '0';
12258 break;
12259
12260 case 'e':
12261 case 'E':
12262 case 'f':
12263 case 'F':
12264 case 'g':
12265 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012266 temp = formatfloat(v, flags, prec, c);
12267 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012268 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 if (PyUnicode_READY(temp) == -1) {
12270 Py_CLEAR(temp);
12271 goto onError;
12272 }
12273 pbuf = PyUnicode_DATA(temp);
12274 kind = PyUnicode_KIND(temp);
12275 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 sign = 1;
12277 if (flags & F_ZERO)
12278 fill = '0';
12279 break;
12280
12281 case 'c':
12282 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012284 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 if (len < 0)
12286 goto onError;
12287 break;
12288
12289 default:
12290 PyErr_Format(PyExc_ValueError,
12291 "unsupported format character '%c' (0x%x) "
12292 "at index %zd",
12293 (31<=c && c<=126) ? (char)c : '?',
12294 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 goto onError;
12297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 /* pbuf is initialized here. */
12299 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12302 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12303 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 len--;
12305 }
12306 else if (flags & F_SIGN)
12307 sign = '+';
12308 else if (flags & F_BLANK)
12309 sign = ' ';
12310 else
12311 sign = 0;
12312 }
12313 if (width < len)
12314 width = len;
12315 if (rescnt - (sign != 0) < width) {
12316 reslen -= rescnt;
12317 rescnt = width + fmtcnt + 100;
12318 reslen += rescnt;
12319 if (reslen < 0) {
12320 Py_XDECREF(temp);
12321 PyErr_NoMemory();
12322 goto onError;
12323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12325 if (res0 == 0) {
12326 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 Py_XDECREF(temp);
12328 goto onError;
12329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 }
12332 if (sign) {
12333 if (fill != ' ')
12334 *res++ = sign;
12335 rescnt--;
12336 if (width > len)
12337 width--;
12338 }
12339 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12341 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12344 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 }
12346 rescnt -= 2;
12347 width -= 2;
12348 if (width < 0)
12349 width = 0;
12350 len -= 2;
12351 }
12352 if (width > len && !(flags & F_LJUST)) {
12353 do {
12354 --rescnt;
12355 *res++ = fill;
12356 } while (--width > len);
12357 }
12358 if (fill == ' ') {
12359 if (sign)
12360 *res++ = sign;
12361 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12363 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12364 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12365 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012366 }
12367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 /* Copy all characters, preserving len */
12369 len1 = len;
12370 while (len1--) {
12371 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12372 rescnt--;
12373 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 while (--width >= len) {
12375 --rescnt;
12376 *res++ = ' ';
12377 }
12378 if (dict && (argidx < arglen) && c != '%') {
12379 PyErr_SetString(PyExc_TypeError,
12380 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012381 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012382 goto onError;
12383 }
12384 Py_XDECREF(temp);
12385 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 } /* until end */
12387 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 PyErr_SetString(PyExc_TypeError,
12389 "not all arguments converted during string formatting");
12390 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
12392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393
12394 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12395 if (*res > max)
12396 max = *res;
12397 result = PyUnicode_New(reslen - rescnt, max);
12398 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 kind = PyUnicode_KIND(result);
12401 for (res = res0; res < res0+reslen-rescnt; res++)
12402 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12403 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406 }
12407 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408 return (PyObject *)result;
12409
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412 Py_DECREF(uformat);
12413 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415 }
12416 return NULL;
12417}
12418
Jeremy Hylton938ace62002-07-17 16:30:39 +000012419static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012420unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12421
Tim Peters6d6c1a32001-08-02 04:15:00 +000012422static PyObject *
12423unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12424{
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 static char *kwlist[] = {"object", "encoding", "errors", 0};
12427 char *encoding = NULL;
12428 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012429
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 if (type != &PyUnicode_Type)
12431 return unicode_subtype_new(type, args, kwds);
12432 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012434 return NULL;
12435 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 if (encoding == NULL && errors == NULL)
12438 return PyObject_Str(x);
12439 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012441}
12442
Guido van Rossume023fe02001-08-30 03:12:59 +000012443static PyObject *
12444unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12445{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012446 PyUnicodeObject *unicode, *self;
12447 Py_ssize_t length, char_size;
12448 int share_wstr, share_utf8;
12449 unsigned int kind;
12450 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012451
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012453
12454 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12455 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012457 assert(PyUnicode_Check(unicode));
12458 if (PyUnicode_READY(unicode))
12459 return NULL;
12460
12461 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12462 if (self == NULL) {
12463 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 return NULL;
12465 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012466 kind = PyUnicode_KIND(unicode);
12467 length = PyUnicode_GET_LENGTH(unicode);
12468
12469 _PyUnicode_LENGTH(self) = length;
12470 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12471 _PyUnicode_STATE(self).interned = 0;
12472 _PyUnicode_STATE(self).kind = kind;
12473 _PyUnicode_STATE(self).compact = 0;
12474 _PyUnicode_STATE(self).ascii = 0;
12475 _PyUnicode_STATE(self).ready = 1;
12476 _PyUnicode_WSTR(self) = NULL;
12477 _PyUnicode_UTF8_LENGTH(self) = 0;
12478 _PyUnicode_UTF8(self) = NULL;
12479 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012480 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012481
12482 share_utf8 = 0;
12483 share_wstr = 0;
12484 if (kind == PyUnicode_1BYTE_KIND) {
12485 char_size = 1;
12486 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12487 share_utf8 = 1;
12488 }
12489 else if (kind == PyUnicode_2BYTE_KIND) {
12490 char_size = 2;
12491 if (sizeof(wchar_t) == 2)
12492 share_wstr = 1;
12493 }
12494 else {
12495 assert(kind == PyUnicode_4BYTE_KIND);
12496 char_size = 4;
12497 if (sizeof(wchar_t) == 4)
12498 share_wstr = 1;
12499 }
12500
12501 /* Ensure we won't overflow the length. */
12502 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12503 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012505 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012506 data = PyObject_MALLOC((length + 1) * char_size);
12507 if (data == NULL) {
12508 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 goto onError;
12510 }
12511
Victor Stinnerc3c74152011-10-02 20:39:55 +020012512 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012513 if (share_utf8) {
12514 _PyUnicode_UTF8_LENGTH(self) = length;
12515 _PyUnicode_UTF8(self) = data;
12516 }
12517 if (share_wstr) {
12518 _PyUnicode_WSTR_LENGTH(self) = length;
12519 _PyUnicode_WSTR(self) = (wchar_t *)data;
12520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012522 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12523 PyUnicode_KIND_SIZE(kind, length + 1));
12524 Py_DECREF(unicode);
12525 return (PyObject *)self;
12526
12527onError:
12528 Py_DECREF(unicode);
12529 Py_DECREF(self);
12530 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012531}
12532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012535\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012536Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012537encoding defaults to the current default string encoding.\n\
12538errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012539
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012540static PyObject *unicode_iter(PyObject *seq);
12541
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012543 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 "str", /* tp_name */
12545 sizeof(PyUnicodeObject), /* tp_size */
12546 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012548 (destructor)unicode_dealloc, /* tp_dealloc */
12549 0, /* tp_print */
12550 0, /* tp_getattr */
12551 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012552 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012553 unicode_repr, /* tp_repr */
12554 &unicode_as_number, /* tp_as_number */
12555 &unicode_as_sequence, /* tp_as_sequence */
12556 &unicode_as_mapping, /* tp_as_mapping */
12557 (hashfunc) unicode_hash, /* tp_hash*/
12558 0, /* tp_call*/
12559 (reprfunc) unicode_str, /* tp_str */
12560 PyObject_GenericGetAttr, /* tp_getattro */
12561 0, /* tp_setattro */
12562 0, /* tp_as_buffer */
12563 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012565 unicode_doc, /* tp_doc */
12566 0, /* tp_traverse */
12567 0, /* tp_clear */
12568 PyUnicode_RichCompare, /* tp_richcompare */
12569 0, /* tp_weaklistoffset */
12570 unicode_iter, /* tp_iter */
12571 0, /* tp_iternext */
12572 unicode_methods, /* tp_methods */
12573 0, /* tp_members */
12574 0, /* tp_getset */
12575 &PyBaseObject_Type, /* tp_base */
12576 0, /* tp_dict */
12577 0, /* tp_descr_get */
12578 0, /* tp_descr_set */
12579 0, /* tp_dictoffset */
12580 0, /* tp_init */
12581 0, /* tp_alloc */
12582 unicode_new, /* tp_new */
12583 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584};
12585
12586/* Initialize the Unicode implementation */
12587
Thomas Wouters78890102000-07-22 19:25:51 +000012588void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012590 int i;
12591
Thomas Wouters477c8d52006-05-27 19:21:47 +000012592 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012594 0x000A, /* LINE FEED */
12595 0x000D, /* CARRIAGE RETURN */
12596 0x001C, /* FILE SEPARATOR */
12597 0x001D, /* GROUP SEPARATOR */
12598 0x001E, /* RECORD SEPARATOR */
12599 0x0085, /* NEXT LINE */
12600 0x2028, /* LINE SEPARATOR */
12601 0x2029, /* PARAGRAPH SEPARATOR */
12602 };
12603
Fred Drakee4315f52000-05-09 19:53:39 +000012604 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012605 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012606 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012608
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012609 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012611 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012613
12614 /* initialize the linebreak bloom filter */
12615 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012617 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012618
12619 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620}
12621
/* Finalize the Unicode implementation */

/* Free-list clearing hook.  This implementation has nothing to release
   and unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12629
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630void
Thomas Wouters78890102000-07-22 19:25:51 +000012631_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012633 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012635 Py_XDECREF(unicode_empty);
12636 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012637
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012638 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 if (unicode_latin1[i]) {
12640 Py_DECREF(unicode_latin1[i]);
12641 unicode_latin1[i] = NULL;
12642 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012643 }
Christian Heimesa156e092008-02-16 07:38:31 +000012644 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012646
Walter Dörwald16807132007-05-25 13:52:07 +000012647void
12648PyUnicode_InternInPlace(PyObject **p)
12649{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12651 PyObject *t;
12652 if (s == NULL || !PyUnicode_Check(s))
12653 Py_FatalError(
12654 "PyUnicode_InternInPlace: unicode strings only please!");
12655 /* If it's a subclass, we don't really know what putting
12656 it in the interned dict might do. */
12657 if (!PyUnicode_CheckExact(s))
12658 return;
12659 if (PyUnicode_CHECK_INTERNED(s))
12660 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 if (PyUnicode_READY(s) == -1) {
12662 assert(0 && "ready fail in intern...");
12663 return;
12664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012665 if (interned == NULL) {
12666 interned = PyDict_New();
12667 if (interned == NULL) {
12668 PyErr_Clear(); /* Don't leave an exception */
12669 return;
12670 }
12671 }
12672 /* It might be that the GetItem call fails even
12673 though the key is present in the dictionary,
12674 namely when this happens during a stack overflow. */
12675 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012677 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012678
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 if (t) {
12680 Py_INCREF(t);
12681 Py_DECREF(*p);
12682 *p = t;
12683 return;
12684 }
Walter Dörwald16807132007-05-25 13:52:07 +000012685
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 PyThreadState_GET()->recursion_critical = 1;
12687 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12688 PyErr_Clear();
12689 PyThreadState_GET()->recursion_critical = 0;
12690 return;
12691 }
12692 PyThreadState_GET()->recursion_critical = 0;
12693 /* The two references in interned are not counted by refcnt.
12694 The deallocator will take care of this */
12695 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012697}
12698
12699void
12700PyUnicode_InternImmortal(PyObject **p)
12701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12703
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 PyUnicode_InternInPlace(p);
12705 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 Py_INCREF(*p);
12708 }
Walter Dörwald16807132007-05-25 13:52:07 +000012709}
12710
12711PyObject *
12712PyUnicode_InternFromString(const char *cp)
12713{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 PyObject *s = PyUnicode_FromString(cp);
12715 if (s == NULL)
12716 return NULL;
12717 PyUnicode_InternInPlace(&s);
12718 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012719}
12720
Alexander Belopolsky40018472011-02-26 01:02:56 +000012721void
12722_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012723{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012724 PyObject *keys;
12725 PyUnicodeObject *s;
12726 Py_ssize_t i, n;
12727 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012728
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 if (interned == NULL || !PyDict_Check(interned))
12730 return;
12731 keys = PyDict_Keys(interned);
12732 if (keys == NULL || !PyList_Check(keys)) {
12733 PyErr_Clear();
12734 return;
12735 }
Walter Dörwald16807132007-05-25 13:52:07 +000012736
Benjamin Peterson14339b62009-01-31 16:36:08 +000012737 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12738 detector, interned unicode strings are not forcibly deallocated;
12739 rather, we give them their stolen references back, and then clear
12740 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012741
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 n = PyList_GET_SIZE(keys);
12743 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 for (i = 0; i < n; i++) {
12746 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 if (PyUnicode_READY(s) == -1)
12748 fprintf(stderr, "could not ready string\n");
12749 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 case SSTATE_NOT_INTERNED:
12751 /* XXX Shouldn't happen */
12752 break;
12753 case SSTATE_INTERNED_IMMORTAL:
12754 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 break;
12757 case SSTATE_INTERNED_MORTAL:
12758 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 break;
12761 default:
12762 Py_FatalError("Inconsistent interned string state.");
12763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012765 }
12766 fprintf(stderr, "total size of all interned strings: "
12767 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12768 "mortal/immortal\n", mortal_size, immortal_size);
12769 Py_DECREF(keys);
12770 PyDict_Clear(interned);
12771 Py_DECREF(interned);
12772 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012773}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012774
12775
12776/********************* Unicode Iterator **************************/
12777
12778typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 PyObject_HEAD
12780 Py_ssize_t it_index;
12781 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012782} unicodeiterobject;
12783
12784static void
12785unicodeiter_dealloc(unicodeiterobject *it)
12786{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 _PyObject_GC_UNTRACK(it);
12788 Py_XDECREF(it->it_seq);
12789 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012790}
12791
12792static int
12793unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 Py_VISIT(it->it_seq);
12796 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012797}
12798
12799static PyObject *
12800unicodeiter_next(unicodeiterobject *it)
12801{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012802 PyUnicodeObject *seq;
12803 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012804
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 assert(it != NULL);
12806 seq = it->it_seq;
12807 if (seq == NULL)
12808 return NULL;
12809 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12812 int kind = PyUnicode_KIND(seq);
12813 void *data = PyUnicode_DATA(seq);
12814 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12815 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816 if (item != NULL)
12817 ++it->it_index;
12818 return item;
12819 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012820
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 Py_DECREF(seq);
12822 it->it_seq = NULL;
12823 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012824}
12825
12826static PyObject *
12827unicodeiter_len(unicodeiterobject *it)
12828{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012829 Py_ssize_t len = 0;
12830 if (it->it_seq)
12831 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12832 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012833}
12834
12835PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12836
12837static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012838 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012840 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012841};
12842
12843PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012844 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12845 "str_iterator", /* tp_name */
12846 sizeof(unicodeiterobject), /* tp_basicsize */
12847 0, /* tp_itemsize */
12848 /* methods */
12849 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12850 0, /* tp_print */
12851 0, /* tp_getattr */
12852 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012853 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 0, /* tp_repr */
12855 0, /* tp_as_number */
12856 0, /* tp_as_sequence */
12857 0, /* tp_as_mapping */
12858 0, /* tp_hash */
12859 0, /* tp_call */
12860 0, /* tp_str */
12861 PyObject_GenericGetAttr, /* tp_getattro */
12862 0, /* tp_setattro */
12863 0, /* tp_as_buffer */
12864 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12865 0, /* tp_doc */
12866 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12867 0, /* tp_clear */
12868 0, /* tp_richcompare */
12869 0, /* tp_weaklistoffset */
12870 PyObject_SelfIter, /* tp_iter */
12871 (iternextfunc)unicodeiter_next, /* tp_iternext */
12872 unicodeiter_methods, /* tp_methods */
12873 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012874};
12875
12876static PyObject *
12877unicode_iter(PyObject *seq)
12878{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012880
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881 if (!PyUnicode_Check(seq)) {
12882 PyErr_BadInternalCall();
12883 return NULL;
12884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 if (PyUnicode_READY(seq) == -1)
12886 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12888 if (it == NULL)
12889 return NULL;
12890 it->it_index = 0;
12891 Py_INCREF(seq);
12892 it->it_seq = (PyUnicodeObject *)seq;
12893 _PyObject_GC_TRACK(it);
12894 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012895}
12896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897#define UNIOP(x) Py_UNICODE_##x
12898#define UNIOP_t Py_UNICODE
12899#include "uniops.h"
12900#undef UNIOP
12901#undef UNIOP_t
12902#define UNIOP(x) Py_UCS4_##x
12903#define UNIOP_t Py_UCS4
12904#include "uniops.h"
12905#undef UNIOP
12906#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012907
Victor Stinner71133ff2010-09-01 23:43:53 +000012908Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012909PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012910{
12911 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12912 Py_UNICODE *copy;
12913 Py_ssize_t size;
12914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 if (!PyUnicode_Check(unicode)) {
12916 PyErr_BadArgument();
12917 return NULL;
12918 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012919 /* Ensure we won't overflow the size. */
12920 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12921 PyErr_NoMemory();
12922 return NULL;
12923 }
12924 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12925 size *= sizeof(Py_UNICODE);
12926 copy = PyMem_Malloc(size);
12927 if (copy == NULL) {
12928 PyErr_NoMemory();
12929 return NULL;
12930 }
12931 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12932 return copy;
12933}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012934
Georg Brandl66c221e2010-10-14 07:04:07 +000012935/* A _string module, to export formatter_parser and formatter_field_name_split
12936 to the string.Formatter class implemented in Python. */
12937
12938static PyMethodDef _string_methods[] = {
12939 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12940 METH_O, PyDoc_STR("split the argument as a field name")},
12941 {"formatter_parser", (PyCFunction) formatter_parser,
12942 METH_O, PyDoc_STR("parse the argument as a format string")},
12943 {NULL, NULL}
12944};
12945
12946static struct PyModuleDef _string_module = {
12947 PyModuleDef_HEAD_INIT,
12948 "_string",
12949 PyDoc_STR("string helper module"),
12950 0,
12951 _string_methods,
12952 NULL,
12953 NULL,
12954 NULL,
12955 NULL
12956};
12957
12958PyMODINIT_FUNC
12959PyInit__string(void)
12960{
12961 return PyModule_Create(&_string_module);
12962}
12963
12964
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012965#ifdef __cplusplus
12966}
12967#endif