blob: 00572a79836c0c62a798230f09ef70aed71a37b2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
/* Byte-order selection: exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN is defined.  Little endian is assumed
   unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Generic helper macro to convert characters of different types.
   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters
   (end is exclusive).  to points to the output buffer; it is cast to
   "to_type *" and receives one converted character per source
   character. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        to_type *to_ = (to_type *)(to);                              \
        while (iter_ < (end)) {                                      \
            *to_++ = (to_type)*iter_++;                              \
        }                                                            \
    } while (0)
106
/* Raw field accessors.  They only cast and dereference; no type or
   readiness checks are made, and all of them can be used as lvalues. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that op is a Unicode object before reading
   the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 accessors: assert that op is a ready Unicode object.  For a
   compact ASCII string the result is taken from the PyASCIIObject
   itself (the memory right after the header, respectively its length
   field); otherwise the utf8 / utf8_length fields are used. */
#define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
138
Walter Dörwald16807132007-05-25 13:52:07 +0000139/* This dictionary holds all interned unicode strings. Note that references
140 to strings in this dictionary are *not* counted in the string's ob_refcnt.
141 When the interned string reaches a refcnt of 0 the string deallocation
142 function will delete the reference from this dictionary.
143
144 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000145 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000146*/
147static PyObject *interned;
148
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000149/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200150static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000151
152/* Single character Unicode strings in the Latin-1 range are being
153 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200154static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000155
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point (0..127); an entry is 1 for:
     0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
     0x000B LINE TABULATION,      0x000C FORM FEED,
     0x000D CARRIAGE RETURN,
     0x001C FILE SEPARATOR,       0x001D GROUP SEPARATOR,
     0x001E RECORD SEPARATOR,     0x001F UNIT SEPARATOR,
     0x0020 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
Alexander Belopolsky40018472011-02-26 01:02:56 +0000187static PyObject *
188unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000189 PyObject **errorHandler,const char *encoding, const char *reason,
190 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
191 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
192
Alexander Belopolsky40018472011-02-26 01:02:56 +0000193static void
194raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300195 const char *encoding,
196 const Py_UNICODE *unicode, Py_ssize_t size,
197 Py_ssize_t startpos, Py_ssize_t endpos,
198 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000199
/* Fast detection of ASCII line breaks.
   Indexed by code point (0..127); an entry is 1 for:
     0x000A LINE FEED,        0x000B LINE TABULATION,
     0x000C FORM FEED,        0x000D CARRIAGE RETURN,
     0x001C FILE SEPARATOR,   0x001D GROUP SEPARATOR,
     0x001E RECORD SEPARATOR
   and 0 for everything else.  The table is only ever read, so it is
   declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300228/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
229 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000230Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000231PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000233#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000235#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 /* This is actually an illegal character, so it should
237 not be passed to unichr. */
238 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000239#endif
240}
241
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242/* --- Bloom Filters ----------------------------------------------------- */
243
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
247
248/* the linebreak mask is set up by Unicode_Init below */
249
Antoine Pitrouf068f942010-01-13 14:19:12 +0000250#if LONG_BIT >= 128
251#define BLOOM_WIDTH 128
252#elif LONG_BIT >= 64
253#define BLOOM_WIDTH 64
254#elif LONG_BIT >= 32
255#define BLOOM_WIDTH 32
256#else
257#error "LONG_BIT is smaller than 32"
258#endif
259
Thomas Wouters477c8d52006-05-27 19:21:47 +0000260#define BLOOM_MASK unsigned long
261
262static BLOOM_MASK bloom_linebreak;
263
/* BLOOM_ADD(mask, ch): set the bit selected by ch in mask (mask must be
   an lvalue).
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask.
   Zero means ch was definitely never added; non-zero means "maybe".
   The mask argument is parenthesized so that expressions such as
   BLOOM(a | b, ch) are evaluated as ((a | b) & bit). */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break: ASCII characters are looked up directly in
   ascii_linebreak; other characters are first screened with the
   bloom_linebreak mask and then confirmed with Py_UNICODE_ISLINEBREAK. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Alexander Belopolsky40018472011-02-26 01:02:56 +0000271Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200272make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273{
274 /* calculate simple bloom-style bitmask for a given unicode string */
275
Antoine Pitrouf068f942010-01-13 14:19:12 +0000276 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000277 Py_ssize_t i;
278
279 mask = 0;
280 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200281 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
283 return mask;
284}
285
/* True if chr occurs in the unicode object str.  The bloom mask is used
   as a cheap pre-filter; only when it reports a possible hit is the
   string actually searched with PyUnicode_FindChar. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290/* --- Unicode Object ----------------------------------------------------- */
291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200293fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
294
295Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
296 Py_ssize_t size, Py_UCS4 ch,
297 int direction)
298{
299 /* like wcschr, but doesn't stop at NULL characters */
300 Py_ssize_t i;
301 if (direction == 1) {
302 for(i = 0; i < size; i++)
303 if (PyUnicode_READ(kind, s, i) == ch)
304 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
305 }
306 else {
307 for(i = size-1; i >= 0; i--)
308 if (PyUnicode_READ(kind, s, i) == ch)
309 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
310 }
311 return NULL;
312}
313
Alexander Belopolsky40018472011-02-26 01:02:56 +0000314static int
315unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317{
318 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200320 /* Resizing is only supported for old unicode objects. */
321 assert(!PyUnicode_IS_COMPACT(unicode));
322 assert(_PyUnicode_WSTR(unicode) != NULL);
323
324 /* ... and only if they have not been readied yet, because
325 callees usually rely on the wstr representation when resizing. */
326 assert(unicode->data.any == NULL);
327
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000328 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 /* Resizing shared object (unicode_empty or single character
333 objects) in-place is not allowed. Use PyUnicode_Resize()
334 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000335
Benjamin Peterson14339b62009-01-31 16:36:08 +0000336 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200337 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
338 _PyUnicode_WSTR(unicode)[0] < 256U &&
339 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000341 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 return -1;
343 }
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345 /* We allocate one more byte to make sure the string is Ux0000 terminated.
346 The overallocation is also used by fastsearch, which assumes that it's
347 safe to look at str[length] (without making any assumptions about what
348 it contains). */
349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 oldstr = _PyUnicode_WSTR(unicode);
351 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
352 sizeof(Py_UNICODE) * (length + 1));
353 if (!_PyUnicode_WSTR(unicode)) {
354 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 PyErr_NoMemory();
356 return -1;
357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_WSTR(unicode)[length] = 0;
359 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360
Benjamin Peterson29060642009-01-31 22:14:21 +0000361 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 if (unicode->data.any != NULL) {
363 PyObject_FREE(unicode->data.any);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200364 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
365 PyObject_FREE(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200367 _PyUnicode_UTF8(unicode) = NULL;
368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200369 unicode->data.any = NULL;
370 _PyUnicode_LENGTH(unicode) = 0;
371 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
372 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200374 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000375
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return 0;
377}
378
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000382
383 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000384 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385
386*/
387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388#ifdef Py_DEBUG
389int unicode_old_new_calls = 0;
390#endif
391
Alexander Belopolsky40018472011-02-26 01:02:56 +0000392static PyUnicodeObject *
393_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394{
395 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200396 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397
Thomas Wouters477c8d52006-05-27 19:21:47 +0000398 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 if (length == 0 && unicode_empty != NULL) {
400 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200401 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 }
403
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000404 /* Ensure we won't overflow the size. */
405 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
406 return (PyUnicodeObject *)PyErr_NoMemory();
407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200408 if (length < 0) {
409 PyErr_SetString(PyExc_SystemError,
410 "Negative size passed to _PyUnicode_New");
411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000412 }
413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200414#ifdef Py_DEBUG
415 ++unicode_old_new_calls;
416#endif
417
418 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
419 if (unicode == NULL)
420 return NULL;
421 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
422 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
423 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyErr_NoMemory();
425 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427
Jeremy Hyltond8082792003-09-16 19:41:39 +0000428 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000429 * the caller fails before initializing str -- unicode_resize()
430 * reads str[0], and the Keep-Alive optimization can keep memory
431 * allocated for str alive across a call to unicode_dealloc(unicode).
432 * We don't want unicode_resize to read uninitialized memory in
433 * that case.
434 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200435 _PyUnicode_WSTR(unicode)[0] = 0;
436 _PyUnicode_WSTR(unicode)[length] = 0;
437 _PyUnicode_WSTR_LENGTH(unicode) = length;
438 _PyUnicode_HASH(unicode) = -1;
439 _PyUnicode_STATE(unicode).interned = 0;
440 _PyUnicode_STATE(unicode).kind = 0;
441 _PyUnicode_STATE(unicode).compact = 0;
442 _PyUnicode_STATE(unicode).ready = 0;
443 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200444 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200445 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200446 _PyUnicode_UTF8(unicode) = NULL;
447 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000451 /* XXX UNREF/NEWREF interface should be more symmetrical */
452 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000453 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000454 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456}
457
Victor Stinnerf42dc442011-10-02 23:33:16 +0200458static const char*
459unicode_kind_name(PyObject *unicode)
460{
461 assert(PyUnicode_Check(unicode));
462 if (!PyUnicode_IS_COMPACT(unicode))
463 {
464 if (!PyUnicode_IS_READY(unicode))
465 return "wstr";
466 switch(PyUnicode_KIND(unicode))
467 {
468 case PyUnicode_1BYTE_KIND:
469 if (PyUnicode_IS_COMPACT_ASCII(unicode))
470 return "legacy ascii";
471 else
472 return "legacy latin1";
473 case PyUnicode_2BYTE_KIND:
474 return "legacy UCS2";
475 case PyUnicode_4BYTE_KIND:
476 return "legacy UCS4";
477 default:
478 return "<legacy invalid kind>";
479 }
480 }
481 assert(PyUnicode_IS_READY(unicode));
482 switch(PyUnicode_KIND(unicode))
483 {
484 case PyUnicode_1BYTE_KIND:
485 if (PyUnicode_IS_COMPACT_ASCII(unicode))
486 return "ascii";
487 else
488 return "compact latin1";
489 case PyUnicode_2BYTE_KIND:
490 return "compact UCS2";
491 case PyUnicode_4BYTE_KIND:
492 return "compact UCS4";
493 default:
494 return "<invalid compact kind>";
495 }
496}
497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200498#ifdef Py_DEBUG
499int unicode_new_new_calls = 0;
500
501/* Functions wrapping macros for use in debugger */
502char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200503 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200504}
505
506void *_PyUnicode_compact_data(void *unicode) {
507 return _PyUnicode_COMPACT_DATA(unicode);
508}
509void *_PyUnicode_data(void *unicode){
510 printf("obj %p\n", unicode);
511 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
512 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
513 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
514 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
515 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
516 return PyUnicode_DATA(unicode);
517}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200518
519void
520_PyUnicode_Dump(PyObject *op)
521{
522 PyASCIIObject *ascii = (PyASCIIObject *)op;
523 printf("%s: len=%zu, wstr=%p",
524 unicode_kind_name(op),
525 ascii->length,
526 ascii->wstr);
527 if (!ascii->state.ascii) {
528 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
529 printf(" (%zu), utf8=%p (%zu)",
530 compact->wstr_length,
531 compact->utf8,
532 compact->utf8_length);
533 }
534 if (!ascii->state.compact) {
535 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
536 printf(", data=%p",
537 unicode->data.any);
538 }
539 printf("\n");
540}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541#endif
542
543PyObject *
544PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
545{
546 PyObject *obj;
547 PyCompactUnicodeObject *unicode;
548 void *data;
549 int kind_state;
550 int is_sharing = 0, is_ascii = 0;
551 Py_ssize_t char_size;
552 Py_ssize_t struct_size;
553
554 /* Optimization for empty strings */
555 if (size == 0 && unicode_empty != NULL) {
556 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200557 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 }
559
560#ifdef Py_DEBUG
561 ++unicode_new_new_calls;
562#endif
563
564 struct_size = sizeof(PyCompactUnicodeObject);
565 if (maxchar < 128) {
566 kind_state = PyUnicode_1BYTE_KIND;
567 char_size = 1;
568 is_ascii = 1;
569 struct_size = sizeof(PyASCIIObject);
570 }
571 else if (maxchar < 256) {
572 kind_state = PyUnicode_1BYTE_KIND;
573 char_size = 1;
574 }
575 else if (maxchar < 65536) {
576 kind_state = PyUnicode_2BYTE_KIND;
577 char_size = 2;
578 if (sizeof(wchar_t) == 2)
579 is_sharing = 1;
580 }
581 else {
582 kind_state = PyUnicode_4BYTE_KIND;
583 char_size = 4;
584 if (sizeof(wchar_t) == 4)
585 is_sharing = 1;
586 }
587
588 /* Ensure we won't overflow the size. */
589 if (size < 0) {
590 PyErr_SetString(PyExc_SystemError,
591 "Negative size passed to PyUnicode_New");
592 return NULL;
593 }
594 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
595 return PyErr_NoMemory();
596
597 /* Duplicated allocation code from _PyObject_New() instead of a call to
598 * PyObject_New() so we are able to allocate space for the object and
599 * it's data buffer.
600 */
601 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
602 if (obj == NULL)
603 return PyErr_NoMemory();
604 obj = PyObject_INIT(obj, &PyUnicode_Type);
605 if (obj == NULL)
606 return NULL;
607
608 unicode = (PyCompactUnicodeObject *)obj;
609 if (is_ascii)
610 data = ((PyASCIIObject*)obj) + 1;
611 else
612 data = unicode + 1;
613 _PyUnicode_LENGTH(unicode) = size;
614 _PyUnicode_HASH(unicode) = -1;
615 _PyUnicode_STATE(unicode).interned = 0;
616 _PyUnicode_STATE(unicode).kind = kind_state;
617 _PyUnicode_STATE(unicode).compact = 1;
618 _PyUnicode_STATE(unicode).ready = 1;
619 _PyUnicode_STATE(unicode).ascii = is_ascii;
620 if (is_ascii) {
621 ((char*)data)[size] = 0;
622 _PyUnicode_WSTR(unicode) = NULL;
623 }
624 else if (kind_state == PyUnicode_1BYTE_KIND) {
625 ((char*)data)[size] = 0;
626 _PyUnicode_WSTR(unicode) = NULL;
627 _PyUnicode_WSTR_LENGTH(unicode) = 0;
628 unicode->utf8_length = 0;
629 unicode->utf8 = NULL;
630 }
631 else {
632 unicode->utf8 = NULL;
633 if (kind_state == PyUnicode_2BYTE_KIND)
634 ((Py_UCS2*)data)[size] = 0;
635 else /* kind_state == PyUnicode_4BYTE_KIND */
636 ((Py_UCS4*)data)[size] = 0;
637 if (is_sharing) {
638 _PyUnicode_WSTR_LENGTH(unicode) = size;
639 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
640 }
641 else {
642 _PyUnicode_WSTR_LENGTH(unicode) = 0;
643 _PyUnicode_WSTR(unicode) = NULL;
644 }
645 }
646 return obj;
647}
648
649#if SIZEOF_WCHAR_T == 2
650/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
651 will decode surrogate pairs, the other conversions are implemented as macros
652 for efficency.
653
654 This function assumes that unicode can hold one more code point than wstr
655 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200656static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
658 PyUnicodeObject *unicode)
659{
660 const wchar_t *iter;
661 Py_UCS4 *ucs4_out;
662
663 assert(unicode && PyUnicode_Check(unicode));
664 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
665 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
666
667 for (iter = begin; iter < end; ) {
668 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
669 _PyUnicode_GET_LENGTH(unicode)));
670 if (*iter >= 0xD800 && *iter <= 0xDBFF
671 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
672 {
673 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
674 iter += 2;
675 }
676 else {
677 *ucs4_out++ = *iter;
678 iter++;
679 }
680 }
681 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
682 _PyUnicode_GET_LENGTH(unicode)));
683
684 return 0;
685}
686#endif
687
Victor Stinnercd9950f2011-10-02 00:34:53 +0200688static int
689_PyUnicode_Dirty(PyObject *unicode)
690{
691 assert(PyUnicode_Check(unicode));
692 if (Py_REFCNT(unicode) != 1) {
693 PyErr_SetString(PyExc_ValueError,
694 "Cannot modify a string having more than 1 reference");
695 return -1;
696 }
697 _PyUnicode_DIRTY(unicode);
698 return 0;
699}
700
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200701Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
703 PyObject *from, Py_ssize_t from_start,
704 Py_ssize_t how_many)
705{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200706 unsigned int from_kind, to_kind;
707 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708
Victor Stinnerb1536152011-09-30 02:26:10 +0200709 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
710 PyErr_BadInternalCall();
711 return -1;
712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713
714 if (PyUnicode_READY(from))
715 return -1;
716 if (PyUnicode_READY(to))
717 return -1;
718
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200719 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200720 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
721 PyErr_Format(PyExc_ValueError,
722 "Cannot write %zi characters at %zi "
723 "in a string of %zi characters",
724 how_many, to_start, PyUnicode_GET_LENGTH(to));
725 return -1;
726 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200727 if (how_many == 0)
728 return 0;
729
Victor Stinnercd9950f2011-10-02 00:34:53 +0200730 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200731 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200733 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200734 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200736 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200737
Victor Stinnerf42dc442011-10-02 23:33:16 +0200738 if (from_kind == to_kind
739 /* deny latin1 => ascii */
740 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
741 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200742 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200743 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200744 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200745 + PyUnicode_KIND_SIZE(from_kind, from_start),
746 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200748 else if (from_kind == PyUnicode_1BYTE_KIND
749 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200750 {
751 _PyUnicode_CONVERT_BYTES(
752 Py_UCS1, Py_UCS2,
753 PyUnicode_1BYTE_DATA(from) + from_start,
754 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
755 PyUnicode_2BYTE_DATA(to) + to_start
756 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200757 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200758 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200759 && to_kind == PyUnicode_4BYTE_KIND)
760 {
761 _PyUnicode_CONVERT_BYTES(
762 Py_UCS1, Py_UCS4,
763 PyUnicode_1BYTE_DATA(from) + from_start,
764 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
765 PyUnicode_4BYTE_DATA(to) + to_start
766 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200767 }
768 else if (from_kind == PyUnicode_2BYTE_KIND
769 && to_kind == PyUnicode_4BYTE_KIND)
770 {
771 _PyUnicode_CONVERT_BYTES(
772 Py_UCS2, Py_UCS4,
773 PyUnicode_2BYTE_DATA(from) + from_start,
774 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
775 PyUnicode_4BYTE_DATA(to) + to_start
776 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200777 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200778 else {
779 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200780
781 /* check if max_char(from substring) <= max_char(to) */
782 if (from_kind > to_kind
783 /* latin1 => ascii */
784 || (PyUnicode_IS_COMPACT_ASCII(to)
785 && to_kind == PyUnicode_1BYTE_KIND
786 && !PyUnicode_IS_COMPACT_ASCII(from)))
787 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200788 /* slow path to check for character overflow */
789 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
790 Py_UCS4 ch, maxchar;
791 Py_ssize_t i;
792
793 maxchar = 0;
794 invalid_kinds = 0;
795 for (i=0; i < how_many; i++) {
796 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
797 if (ch > maxchar) {
798 maxchar = ch;
799 if (maxchar > to_maxchar) {
800 invalid_kinds = 1;
801 break;
802 }
803 }
804 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
805 }
806 }
807 else
808 invalid_kinds = 1;
809 if (invalid_kinds) {
810 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200811 "Cannot copy %s characters "
812 "into a string of %s characters",
813 unicode_kind_name(from),
814 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200815 return -1;
816 }
817 }
818 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819}
820
Victor Stinner17222162011-09-28 22:15:37 +0200821/* Find the maximum code point and count the number of surrogate pairs so a
822 correct string length can be computed before converting a string to UCS4.
823 This function counts single surrogates as a character and not as a pair.
824
825 Return 0 on success, or -1 on error. */
826static int
827find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
828 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829{
830 const wchar_t *iter;
831
Victor Stinnerc53be962011-10-02 21:33:54 +0200832 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833 if (num_surrogates == NULL || maxchar == NULL) {
834 PyErr_SetString(PyExc_SystemError,
835 "unexpected NULL arguments to "
836 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
837 return -1;
838 }
839
840 *num_surrogates = 0;
841 *maxchar = 0;
842
843 for (iter = begin; iter < end; ) {
844 if (*iter > *maxchar)
845 *maxchar = *iter;
846#if SIZEOF_WCHAR_T == 2
847 if (*iter >= 0xD800 && *iter <= 0xDBFF
848 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
849 {
850 Py_UCS4 surrogate_val;
851 surrogate_val = (((iter[0] & 0x3FF)<<10)
852 | (iter[1] & 0x3FF)) + 0x10000;
853 ++(*num_surrogates);
854 if (surrogate_val > *maxchar)
855 *maxchar = surrogate_val;
856 iter += 2;
857 }
858 else
859 iter++;
860#else
861 iter++;
862#endif
863 }
864 return 0;
865}
866
867#ifdef Py_DEBUG
868int unicode_ready_calls = 0;
869#endif
870
871int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200872_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200874 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 wchar_t *end;
876 Py_UCS4 maxchar = 0;
877 Py_ssize_t num_surrogates;
878#if SIZEOF_WCHAR_T == 2
879 Py_ssize_t length_wo_surrogates;
880#endif
881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200883 strings were created using _PyObject_New() and where no canonical
884 representation (the str field) has been set yet aka strings
885 which are not yet ready. */
886 assert(PyUnicode_Check(obj));
887 assert(!PyUnicode_IS_READY(obj));
888 assert(!PyUnicode_IS_COMPACT(obj));
889 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +0200891 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200892 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200893 /* Actually, it should neither be interned nor be anything else: */
894 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895
896#ifdef Py_DEBUG
897 ++unicode_ready_calls;
898#endif
899
900 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200901 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200902 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904
905 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200906 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
907 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908 PyErr_NoMemory();
909 return -1;
910 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200911 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912 _PyUnicode_WSTR(unicode), end,
913 PyUnicode_1BYTE_DATA(unicode));
914 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
915 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
916 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
917 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +0200918 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200919 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 }
921 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200922 _PyUnicode_UTF8(unicode) = NULL;
923 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200924 }
925 PyObject_FREE(_PyUnicode_WSTR(unicode));
926 _PyUnicode_WSTR(unicode) = NULL;
927 _PyUnicode_WSTR_LENGTH(unicode) = 0;
928 }
929 /* In this case we might have to convert down from 4-byte native
930 wchar_t to 2-byte unicode. */
931 else if (maxchar < 65536) {
932 assert(num_surrogates == 0 &&
933 "FindMaxCharAndNumSurrogatePairs() messed up");
934
Victor Stinner506f5922011-09-28 22:34:18 +0200935#if SIZEOF_WCHAR_T == 2
936 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200937 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +0200938 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
939 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
940 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200941 _PyUnicode_UTF8(unicode) = NULL;
942 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200943#else
944 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +0200945 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +0200946 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +0200947 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +0200948 PyErr_NoMemory();
949 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 }
Victor Stinner506f5922011-09-28 22:34:18 +0200951 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
952 _PyUnicode_WSTR(unicode), end,
953 PyUnicode_2BYTE_DATA(unicode));
954 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
955 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
956 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200957 _PyUnicode_UTF8(unicode) = NULL;
958 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +0200959 PyObject_FREE(_PyUnicode_WSTR(unicode));
960 _PyUnicode_WSTR(unicode) = NULL;
961 _PyUnicode_WSTR_LENGTH(unicode) = 0;
962#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 }
964 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
965 else {
966#if SIZEOF_WCHAR_T == 2
967 /* in case the native representation is 2-bytes, we need to allocate a
968 new normalized 4-byte version. */
969 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200970 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
971 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 PyErr_NoMemory();
973 return -1;
974 }
975 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
976 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200977 _PyUnicode_UTF8(unicode) = NULL;
978 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +0200979 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 PyObject_FREE(_PyUnicode_WSTR(unicode));
981 _PyUnicode_WSTR(unicode) = NULL;
982 _PyUnicode_WSTR_LENGTH(unicode) = 0;
983#else
984 assert(num_surrogates == 0);
985
Victor Stinnerc3c74152011-10-02 20:39:55 +0200986 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200988 _PyUnicode_UTF8(unicode) = NULL;
989 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
991#endif
992 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
993 }
994 _PyUnicode_STATE(unicode).ready = 1;
995 return 0;
996}
997
Alexander Belopolsky40018472011-02-26 01:02:56 +0000998static void
999unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000{
Walter Dörwald16807132007-05-25 13:52:07 +00001001 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001002 case SSTATE_NOT_INTERNED:
1003 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001004
Benjamin Peterson29060642009-01-31 22:14:21 +00001005 case SSTATE_INTERNED_MORTAL:
1006 /* revive dead object temporarily for DelItem */
1007 Py_REFCNT(unicode) = 3;
1008 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1009 Py_FatalError(
1010 "deletion of interned string failed");
1011 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001012
Benjamin Peterson29060642009-01-31 22:14:21 +00001013 case SSTATE_INTERNED_IMMORTAL:
1014 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001015
Benjamin Peterson29060642009-01-31 22:14:21 +00001016 default:
1017 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001018 }
1019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020 if (_PyUnicode_WSTR(unicode) &&
1021 (!PyUnicode_IS_READY(unicode) ||
1022 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1023 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001024 if (!PyUnicode_IS_COMPACT_ASCII(unicode)
1025 && _PyUnicode_UTF8(unicode)
1026 && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
1027 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028
1029 if (PyUnicode_IS_COMPACT(unicode)) {
1030 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 }
1032 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001033 if (_PyUnicode_DATA_ANY(unicode))
1034 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001035 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
1037}
1038
Alexander Belopolsky40018472011-02-26 01:02:56 +00001039static int
1040_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001041{
1042 register PyUnicodeObject *v;
1043
1044 /* Argument checks */
1045 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001046 PyErr_BadInternalCall();
1047 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001048 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001049 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
1051 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001052 PyErr_BadInternalCall();
1053 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001054 }
1055
1056 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 possible since these are being shared.
1058 The same goes for new-representation unicode objects or objects which
1059 have already been readied.
1060 For these, we simply return a fresh copy with the same Unicode content.
1061 */
1062 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
1063 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
1064 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001065 PyUnicodeObject *w = _PyUnicode_New(length);
1066 if (w == NULL)
1067 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
1069 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +00001070 Py_DECREF(*unicode);
1071 *unicode = w;
1072 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001073 }
1074
1075 /* Note that we don't have to modify *unicode for unshared Unicode
1076 objects, since we can modify them in-place. */
1077 return unicode_resize(v, length);
1078}
1079
Alexander Belopolsky40018472011-02-26 01:02:56 +00001080int
1081PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001082{
1083 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1084}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086static PyObject*
1087get_latin1_char(unsigned char ch)
1088{
Victor Stinnera464fc12011-10-02 20:39:30 +02001089 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001091 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001092 if (!unicode)
1093 return NULL;
1094 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1095 unicode_latin1[ch] = unicode;
1096 }
1097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101PyObject *
1102PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
1104 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 Py_UCS4 maxchar = 0;
1106 Py_ssize_t num_surrogates;
1107
1108 if (u == NULL)
1109 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001111 /* If the Unicode data is known at construction time, we can apply
1112 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 /* Optimization for empty strings */
1115 if (size == 0 && unicode_empty != NULL) {
1116 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001117 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001118 }
Tim Petersced69f82003-09-16 20:30:58 +00001119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 /* Single character Unicode objects in the Latin-1 range are
1121 shared when using this constructor */
1122 if (size == 1 && *u < 256)
1123 return get_latin1_char((unsigned char)*u);
1124
1125 /* If not empty and not single character, copy the Unicode data
1126 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001127 if (find_maxchar_surrogates(u, u + size,
1128 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 return NULL;
1130
1131 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1132 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 if (!unicode)
1134 return NULL;
1135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 switch (PyUnicode_KIND(unicode)) {
1137 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001138 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1140 break;
1141 case PyUnicode_2BYTE_KIND:
1142#if Py_UNICODE_SIZE == 2
1143 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1144#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001145 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1147#endif
1148 break;
1149 case PyUnicode_4BYTE_KIND:
1150#if SIZEOF_WCHAR_T == 2
1151 /* This is the only case which has to process surrogates, thus
1152 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001153 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154#else
1155 assert(num_surrogates == 0);
1156 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1157#endif
1158 break;
1159 default:
1160 assert(0 && "Impossible state");
1161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 return (PyObject *)unicode;
1164}
1165
Alexander Belopolsky40018472011-02-26 01:02:56 +00001166PyObject *
1167PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001168{
1169 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001170
Benjamin Peterson14339b62009-01-31 16:36:08 +00001171 if (size < 0) {
1172 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001173 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 return NULL;
1175 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001176
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001177 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001178 some optimizations which share commonly used objects.
1179 Also, this means the input must be UTF-8, so fall back to the
1180 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001181 if (u != NULL) {
1182
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 /* Optimization for empty strings */
1184 if (size == 0 && unicode_empty != NULL) {
1185 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001186 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001187 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001188
1189 /* Single characters are shared when using this constructor.
1190 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 if (size == 1 && Py_CHARMASK(*u) < 128)
1192 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001193
1194 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001195 }
1196
Walter Dörwald55507312007-05-18 13:12:10 +00001197 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001198 if (!unicode)
1199 return NULL;
1200
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001201 return (PyObject *)unicode;
1202}
1203
Alexander Belopolsky40018472011-02-26 01:02:56 +00001204PyObject *
1205PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001206{
1207 size_t size = strlen(u);
1208 if (size > PY_SSIZE_T_MAX) {
1209 PyErr_SetString(PyExc_OverflowError, "input too long");
1210 return NULL;
1211 }
1212
1213 return PyUnicode_FromStringAndSize(u, size);
1214}
1215
Victor Stinnere57b1c02011-09-28 22:20:48 +02001216static PyObject*
1217_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 PyObject *res;
1220 unsigned char max = 127;
1221 Py_ssize_t i;
1222 for (i = 0; i < size; i++) {
1223 if (u[i] & 0x80) {
1224 max = 255;
1225 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001226 }
1227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 res = PyUnicode_New(size, max);
1229 if (!res)
1230 return NULL;
1231 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1232 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001233}
1234
Victor Stinnere57b1c02011-09-28 22:20:48 +02001235static PyObject*
1236_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237{
1238 PyObject *res;
1239 Py_UCS2 max = 0;
1240 Py_ssize_t i;
1241 for (i = 0; i < size; i++)
1242 if (u[i] > max)
1243 max = u[i];
1244 res = PyUnicode_New(size, max);
1245 if (!res)
1246 return NULL;
1247 if (max >= 256)
1248 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1249 else
1250 for (i = 0; i < size; i++)
1251 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1252 return res;
1253}
1254
Victor Stinnere57b1c02011-09-28 22:20:48 +02001255static PyObject*
1256_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257{
1258 PyObject *res;
1259 Py_UCS4 max = 0;
1260 Py_ssize_t i;
1261 for (i = 0; i < size; i++)
1262 if (u[i] > max)
1263 max = u[i];
1264 res = PyUnicode_New(size, max);
1265 if (!res)
1266 return NULL;
1267 if (max >= 0x10000)
1268 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1269 else {
1270 int kind = PyUnicode_KIND(res);
1271 void *data = PyUnicode_DATA(res);
1272 for (i = 0; i < size; i++)
1273 PyUnicode_WRITE(kind, data, i, u[i]);
1274 }
1275 return res;
1276}
1277
1278PyObject*
1279PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1280{
1281 switch(kind) {
1282 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001283 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001285 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001287 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001289 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 return NULL;
1291}
1292
Victor Stinner034f6cf2011-09-30 02:26:44 +02001293PyObject*
1294PyUnicode_Copy(PyObject *unicode)
1295{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001296 Py_ssize_t size;
1297 PyObject *copy;
1298 void *data;
1299
Victor Stinner034f6cf2011-09-30 02:26:44 +02001300 if (!PyUnicode_Check(unicode)) {
1301 PyErr_BadInternalCall();
1302 return NULL;
1303 }
1304 if (PyUnicode_READY(unicode))
1305 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001306
1307 size = PyUnicode_GET_LENGTH(unicode);
1308 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1309 if (!copy)
1310 return NULL;
1311 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1312
1313 data = PyUnicode_DATA(unicode);
1314 switch (PyUnicode_KIND(unicode))
1315 {
1316 case PyUnicode_1BYTE_KIND:
1317 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1318 break;
1319 case PyUnicode_2BYTE_KIND:
1320 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1321 break;
1322 case PyUnicode_4BYTE_KIND:
1323 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1324 break;
1325 default:
1326 assert(0);
1327 break;
1328 }
1329 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001330}
1331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332
Victor Stinnerbc603d12011-10-02 01:00:40 +02001333/* Widen Unicode objects to larger buffers. Don't write terminating null
1334 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335
1336void*
1337_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1338{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001339 Py_ssize_t len;
1340 void *result;
1341 unsigned int skind;
1342
1343 if (PyUnicode_READY(s))
1344 return NULL;
1345
1346 len = PyUnicode_GET_LENGTH(s);
1347 skind = PyUnicode_KIND(s);
1348 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1350 return NULL;
1351 }
1352 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001353 case PyUnicode_2BYTE_KIND:
1354 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1355 if (!result)
1356 return PyErr_NoMemory();
1357 assert(skind == PyUnicode_1BYTE_KIND);
1358 _PyUnicode_CONVERT_BYTES(
1359 Py_UCS1, Py_UCS2,
1360 PyUnicode_1BYTE_DATA(s),
1361 PyUnicode_1BYTE_DATA(s) + len,
1362 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001364 case PyUnicode_4BYTE_KIND:
1365 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1366 if (!result)
1367 return PyErr_NoMemory();
1368 if (skind == PyUnicode_2BYTE_KIND) {
1369 _PyUnicode_CONVERT_BYTES(
1370 Py_UCS2, Py_UCS4,
1371 PyUnicode_2BYTE_DATA(s),
1372 PyUnicode_2BYTE_DATA(s) + len,
1373 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001375 else {
1376 assert(skind == PyUnicode_1BYTE_KIND);
1377 _PyUnicode_CONVERT_BYTES(
1378 Py_UCS1, Py_UCS4,
1379 PyUnicode_1BYTE_DATA(s),
1380 PyUnicode_1BYTE_DATA(s) + len,
1381 result);
1382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001384 default:
1385 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001387 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 return NULL;
1389}
1390
1391static Py_UCS4*
1392as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1393 int copy_null)
1394{
1395 int kind;
1396 void *data;
1397 Py_ssize_t len, targetlen;
1398 if (PyUnicode_READY(string) == -1)
1399 return NULL;
1400 kind = PyUnicode_KIND(string);
1401 data = PyUnicode_DATA(string);
1402 len = PyUnicode_GET_LENGTH(string);
1403 targetlen = len;
1404 if (copy_null)
1405 targetlen++;
1406 if (!target) {
1407 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1408 PyErr_NoMemory();
1409 return NULL;
1410 }
1411 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1412 if (!target) {
1413 PyErr_NoMemory();
1414 return NULL;
1415 }
1416 }
1417 else {
1418 if (targetsize < targetlen) {
1419 PyErr_Format(PyExc_SystemError,
1420 "string is longer than the buffer");
1421 if (copy_null && 0 < targetsize)
1422 target[0] = 0;
1423 return NULL;
1424 }
1425 }
1426 if (kind != PyUnicode_4BYTE_KIND) {
1427 Py_ssize_t i;
1428 for (i = 0; i < len; i++)
1429 target[i] = PyUnicode_READ(kind, data, i);
1430 }
1431 else
1432 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1433 if (copy_null)
1434 target[len] = 0;
1435 return target;
1436}
1437
1438Py_UCS4*
1439PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1440 int copy_null)
1441{
1442 if (target == NULL || targetsize < 1) {
1443 PyErr_BadInternalCall();
1444 return NULL;
1445 }
1446 return as_ucs4(string, target, targetsize, copy_null);
1447}
1448
1449Py_UCS4*
1450PyUnicode_AsUCS4Copy(PyObject *string)
1451{
1452 return as_ucs4(string, NULL, 0, 1);
1453}
1454
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.

   A size of -1 means that `w` is NUL-terminated and that its length is
   computed with wcslen().  A NULL buffer is only accepted together with a
   size of 0, which gives an empty string; otherwise a bad internal call is
   reported and NULL is returned. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001475
Walter Dörwald346737f2007-05-31 10:44:43 +00001476static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001477makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1478 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001479{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001480 *fmt++ = '%';
1481 if (width) {
1482 if (zeropad)
1483 *fmt++ = '0';
1484 fmt += sprintf(fmt, "%d", width);
1485 }
1486 if (precision)
1487 fmt += sprintf(fmt, ".%d", precision);
1488 if (longflag)
1489 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001490 else if (longlongflag) {
1491 /* longlongflag should only ever be nonzero on machines with
1492 HAVE_LONG_LONG defined */
1493#ifdef HAVE_LONG_LONG
1494 char *f = PY_FORMAT_LONG_LONG;
1495 while (*f)
1496 *fmt++ = *f++;
1497#else
1498 /* we shouldn't ever get here */
1499 assert(0);
1500 *fmt++ = 'l';
1501#endif
1502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001503 else if (size_tflag) {
1504 char *f = PY_FORMAT_SIZE_T;
1505 while (*f)
1506 *fmt++ = *f++;
1507 }
1508 *fmt++ = c;
1509 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001510}
1511
Victor Stinner96865452011-03-01 23:44:09 +00001512/* helper for PyUnicode_FromFormatV() */
1513
1514static const char*
1515parse_format_flags(const char *f,
1516 int *p_width, int *p_precision,
1517 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1518{
1519 int width, precision, longflag, longlongflag, size_tflag;
1520
1521 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1522 f++;
1523 width = 0;
1524 while (Py_ISDIGIT((unsigned)*f))
1525 width = (width*10) + *f++ - '0';
1526 precision = 0;
1527 if (*f == '.') {
1528 f++;
1529 while (Py_ISDIGIT((unsigned)*f))
1530 precision = (precision*10) + *f++ - '0';
1531 if (*f == '%') {
1532 /* "%.3%s" => f points to "3" */
1533 f--;
1534 }
1535 }
1536 if (*f == '\0') {
1537 /* bogus format "%.1" => go backward, f points to "1" */
1538 f--;
1539 }
1540 if (p_width != NULL)
1541 *p_width = width;
1542 if (p_precision != NULL)
1543 *p_precision = precision;
1544
1545 /* Handle %ld, %lu, %lld and %llu. */
1546 longflag = 0;
1547 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001548 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001549
1550 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001551 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001552 longflag = 1;
1553 ++f;
1554 }
1555#ifdef HAVE_LONG_LONG
1556 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001557 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001558 longlongflag = 1;
1559 f += 2;
1560 }
1561#endif
1562 }
1563 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001564 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001565 size_tflag = 1;
1566 ++f;
1567 }
1568 if (p_longflag != NULL)
1569 *p_longflag = longflag;
1570 if (p_longlongflag != NULL)
1571 *p_longlongflag = longlongflag;
1572 if (p_size_tflag != NULL)
1573 *p_size_tflag = size_tflag;
1574 return f;
1575}
1576
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001577/* maximum number of characters required for output of %ld. 21 characters
1578 allows for 64-bit integers (in decimal) and an optional sign. */
1579#define MAX_LONG_CHARS 21
1580/* maximum number of characters required for output of %lld.
1581 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1582 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1583#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1584
Walter Dörwaldd2034312007-05-18 16:29:38 +00001585PyObject *
1586PyUnicode_FromFormatV(const char *format, va_list vargs)
1587{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001588 va_list count;
1589 Py_ssize_t callcount = 0;
1590 PyObject **callresults = NULL;
1591 PyObject **callresult = NULL;
1592 Py_ssize_t n = 0;
1593 int width = 0;
1594 int precision = 0;
1595 int zeropad;
1596 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001598 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001599 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1601 Py_UCS4 argmaxchar;
1602 Py_ssize_t numbersize = 0;
1603 char *numberresults = NULL;
1604 char *numberresult = NULL;
1605 Py_ssize_t i;
1606 int kind;
1607 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001608
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001609 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001610 /* step 1: count the number of %S/%R/%A/%s format specifications
1611 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1612 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613 * result in an array)
1614 * also esimate a upper bound for all the number formats in the string,
1615 * numbers will be formated in step 3 and be keept in a '\0'-separated
1616 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001617 for (f = format; *f; f++) {
1618 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001619 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1621 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1622 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1623 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001626#ifdef HAVE_LONG_LONG
1627 if (longlongflag) {
1628 if (width < MAX_LONG_LONG_CHARS)
1629 width = MAX_LONG_LONG_CHARS;
1630 }
1631 else
1632#endif
1633 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1634 including sign. Decimal takes the most space. This
1635 isn't enough for octal. If a width is specified we
1636 need more (which we allocate later). */
1637 if (width < MAX_LONG_CHARS)
1638 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639
1640 /* account for the size + '\0' to separate numbers
1641 inside of the numberresults buffer */
1642 numbersize += (width + 1);
1643 }
1644 }
1645 else if ((unsigned char)*f > 127) {
1646 PyErr_Format(PyExc_ValueError,
1647 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1648 "string, got a non-ASCII byte: 0x%02x",
1649 (unsigned char)*f);
1650 return NULL;
1651 }
1652 }
1653 /* step 2: allocate memory for the results of
1654 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1655 if (callcount) {
1656 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1657 if (!callresults) {
1658 PyErr_NoMemory();
1659 return NULL;
1660 }
1661 callresult = callresults;
1662 }
1663 /* step 2.5: allocate memory for the results of formating numbers */
1664 if (numbersize) {
1665 numberresults = PyObject_Malloc(numbersize);
1666 if (!numberresults) {
1667 PyErr_NoMemory();
1668 goto fail;
1669 }
1670 numberresult = numberresults;
1671 }
1672
1673 /* step 3: format numbers and figure out how large a buffer we need */
1674 for (f = format; *f; f++) {
1675 if (*f == '%') {
1676 const char* p;
1677 int longflag;
1678 int longlongflag;
1679 int size_tflag;
1680 int numprinted;
1681
1682 p = f;
1683 zeropad = (f[1] == '0');
1684 f = parse_format_flags(f, &width, &precision,
1685 &longflag, &longlongflag, &size_tflag);
1686 switch (*f) {
1687 case 'c':
1688 {
1689 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001690 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 n++;
1692 break;
1693 }
1694 case '%':
1695 n++;
1696 break;
1697 case 'i':
1698 case 'd':
1699 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1700 width, precision, *f);
1701 if (longflag)
1702 numprinted = sprintf(numberresult, fmt,
1703 va_arg(count, long));
1704#ifdef HAVE_LONG_LONG
1705 else if (longlongflag)
1706 numprinted = sprintf(numberresult, fmt,
1707 va_arg(count, PY_LONG_LONG));
1708#endif
1709 else if (size_tflag)
1710 numprinted = sprintf(numberresult, fmt,
1711 va_arg(count, Py_ssize_t));
1712 else
1713 numprinted = sprintf(numberresult, fmt,
1714 va_arg(count, int));
1715 n += numprinted;
1716 /* advance by +1 to skip over the '\0' */
1717 numberresult += (numprinted + 1);
1718 assert(*(numberresult - 1) == '\0');
1719 assert(*(numberresult - 2) != '\0');
1720 assert(numprinted >= 0);
1721 assert(numberresult <= numberresults + numbersize);
1722 break;
1723 case 'u':
1724 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1725 width, precision, 'u');
1726 if (longflag)
1727 numprinted = sprintf(numberresult, fmt,
1728 va_arg(count, unsigned long));
1729#ifdef HAVE_LONG_LONG
1730 else if (longlongflag)
1731 numprinted = sprintf(numberresult, fmt,
1732 va_arg(count, unsigned PY_LONG_LONG));
1733#endif
1734 else if (size_tflag)
1735 numprinted = sprintf(numberresult, fmt,
1736 va_arg(count, size_t));
1737 else
1738 numprinted = sprintf(numberresult, fmt,
1739 va_arg(count, unsigned int));
1740 n += numprinted;
1741 numberresult += (numprinted + 1);
1742 assert(*(numberresult - 1) == '\0');
1743 assert(*(numberresult - 2) != '\0');
1744 assert(numprinted >= 0);
1745 assert(numberresult <= numberresults + numbersize);
1746 break;
1747 case 'x':
1748 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1749 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1750 n += numprinted;
1751 numberresult += (numprinted + 1);
1752 assert(*(numberresult - 1) == '\0');
1753 assert(*(numberresult - 2) != '\0');
1754 assert(numprinted >= 0);
1755 assert(numberresult <= numberresults + numbersize);
1756 break;
1757 case 'p':
1758 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1759 /* %p is ill-defined: ensure leading 0x. */
1760 if (numberresult[1] == 'X')
1761 numberresult[1] = 'x';
1762 else if (numberresult[1] != 'x') {
1763 memmove(numberresult + 2, numberresult,
1764 strlen(numberresult) + 1);
1765 numberresult[0] = '0';
1766 numberresult[1] = 'x';
1767 numprinted += 2;
1768 }
1769 n += numprinted;
1770 numberresult += (numprinted + 1);
1771 assert(*(numberresult - 1) == '\0');
1772 assert(*(numberresult - 2) != '\0');
1773 assert(numprinted >= 0);
1774 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001775 break;
1776 case 's':
1777 {
1778 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001779 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001780 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1781 if (!str)
1782 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 /* since PyUnicode_DecodeUTF8 returns already flexible
1784 unicode objects, there is no need to call ready on them */
1785 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001786 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001788 /* Remember the str and switch to the next slot */
1789 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 break;
1791 }
1792 case 'U':
1793 {
1794 PyObject *obj = va_arg(count, PyObject *);
1795 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 if (PyUnicode_READY(obj) == -1)
1797 goto fail;
1798 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001799 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001801 break;
1802 }
1803 case 'V':
1804 {
1805 PyObject *obj = va_arg(count, PyObject *);
1806 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001807 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 assert(obj || str);
1809 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001810 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 if (PyUnicode_READY(obj) == -1)
1812 goto fail;
1813 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001814 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001816 *callresult++ = NULL;
1817 }
1818 else {
1819 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1820 if (!str_obj)
1821 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001823 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001825 *callresult++ = str_obj;
1826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 break;
1828 }
1829 case 'S':
1830 {
1831 PyObject *obj = va_arg(count, PyObject *);
1832 PyObject *str;
1833 assert(obj);
1834 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001838 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001840 /* Remember the str and switch to the next slot */
1841 *callresult++ = str;
1842 break;
1843 }
1844 case 'R':
1845 {
1846 PyObject *obj = va_arg(count, PyObject *);
1847 PyObject *repr;
1848 assert(obj);
1849 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001851 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001853 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001855 /* Remember the repr and switch to the next slot */
1856 *callresult++ = repr;
1857 break;
1858 }
1859 case 'A':
1860 {
1861 PyObject *obj = va_arg(count, PyObject *);
1862 PyObject *ascii;
1863 assert(obj);
1864 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001865 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001868 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 /* Remember the repr and switch to the next slot */
1871 *callresult++ = ascii;
1872 break;
1873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 default:
1875 /* if we stumble upon an unknown
1876 formatting code, copy the rest of
1877 the format string to the output
1878 string. (we cannot just skip the
1879 code, since there's no way to know
1880 what's in the argument list) */
1881 n += strlen(p);
1882 goto expand;
1883 }
1884 } else
1885 n++;
1886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001887 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 we don't have to resize the string.
1891 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001893 if (!string)
1894 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 kind = PyUnicode_KIND(string);
1896 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001901 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001902 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001903
1904 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1906 /* checking for == because the last argument could be a empty
1907 string, which causes i to point to end, the assert at the end of
1908 the loop */
1909 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001910
Benjamin Peterson14339b62009-01-31 16:36:08 +00001911 switch (*f) {
1912 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001913 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 const int ordinal = va_arg(vargs, int);
1915 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001916 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001917 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001918 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001919 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001921 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 case 'p':
1923 /* unused, since we already have the result */
1924 if (*f == 'p')
1925 (void) va_arg(vargs, void *);
1926 else
1927 (void) va_arg(vargs, int);
1928 /* extract the result from numberresults and append. */
1929 for (; *numberresult; ++i, ++numberresult)
1930 PyUnicode_WRITE(kind, data, i, *numberresult);
1931 /* skip over the separating '\0' */
1932 assert(*numberresult == '\0');
1933 numberresult++;
1934 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001935 break;
1936 case 's':
1937 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001938 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001940 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 size = PyUnicode_GET_LENGTH(*callresult);
1942 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001943 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1944 *callresult, 0,
1945 size) < 0)
1946 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001948 /* We're done with the unicode()/repr() => forget it */
1949 Py_DECREF(*callresult);
1950 /* switch to next unicode()/repr() result */
1951 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001952 break;
1953 }
1954 case 'U':
1955 {
1956 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 Py_ssize_t size;
1958 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1959 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001960 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1961 obj, 0,
1962 size) < 0)
1963 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001965 break;
1966 }
1967 case 'V':
1968 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001970 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001971 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001972 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 size = PyUnicode_GET_LENGTH(obj);
1974 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001975 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1976 obj, 0,
1977 size) < 0)
1978 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001980 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 size = PyUnicode_GET_LENGTH(*callresult);
1982 assert(PyUnicode_KIND(*callresult) <=
1983 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001984 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1985 *callresult,
1986 0, size) < 0)
1987 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001989 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001991 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001992 break;
1993 }
1994 case 'S':
1995 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001996 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001997 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001998 /* unused, since we already have the result */
1999 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002001 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2002 *callresult, 0,
2003 PyUnicode_GET_LENGTH(*callresult)) < 0)
2004 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002006 /* We're done with the unicode()/repr() => forget it */
2007 Py_DECREF(*callresult);
2008 /* switch to next unicode()/repr() result */
2009 ++callresult;
2010 break;
2011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002012 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002014 break;
2015 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 for (; *p; ++p, ++i)
2017 PyUnicode_WRITE(kind, data, i, *p);
2018 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002019 goto end;
2020 }
Victor Stinner1205f272010-09-11 00:54:47 +00002021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 else {
2023 assert(i < PyUnicode_GET_LENGTH(string));
2024 PyUnicode_WRITE(kind, data, i++, *f);
2025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002028
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002030 if (callresults)
2031 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (numberresults)
2033 PyObject_Free(numberresults);
2034 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002036 if (callresults) {
2037 PyObject **callresult2 = callresults;
2038 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002039 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002040 ++callresult2;
2041 }
2042 PyObject_Free(callresults);
2043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 if (numberresults)
2045 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002046 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002047}
2048
Walter Dörwaldd2034312007-05-18 16:29:38 +00002049PyObject *
2050PyUnicode_FromFormat(const char *format, ...)
2051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 PyObject* ret;
2053 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002054
2055#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002057#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002060 ret = PyUnicode_FromFormatV(format, vargs);
2061 va_end(vargs);
2062 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002063}
2064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065#ifdef HAVE_WCHAR_H
2066
Victor Stinner5593d8a2010-10-02 11:11:27 +00002067/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2068 convert a Unicode object to a wide character string.
2069
Victor Stinnerd88d9832011-09-06 02:00:05 +02002070 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002071 character) required to convert the unicode object. Ignore size argument.
2072
Victor Stinnerd88d9832011-09-06 02:00:05 +02002073 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002074 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002075 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002076static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002077unicode_aswidechar(PyUnicodeObject *unicode,
2078 wchar_t *w,
2079 Py_ssize_t size)
2080{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002081 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 const wchar_t *wstr;
2083
2084 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2085 if (wstr == NULL)
2086 return -1;
2087
Victor Stinner5593d8a2010-10-02 11:11:27 +00002088 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002089 if (size > res)
2090 size = res + 1;
2091 else
2092 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002094 return res;
2095 }
2096 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002098}
2099
2100Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002101PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002102 wchar_t *w,
2103 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104{
2105 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002106 PyErr_BadInternalCall();
2107 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002109 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110}
2111
Victor Stinner137c34c2010-09-29 10:25:54 +00002112wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002113PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002114 Py_ssize_t *size)
2115{
2116 wchar_t* buffer;
2117 Py_ssize_t buflen;
2118
2119 if (unicode == NULL) {
2120 PyErr_BadInternalCall();
2121 return NULL;
2122 }
2123
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002124 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 if (buflen == -1)
2126 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002127 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002128 PyErr_NoMemory();
2129 return NULL;
2130 }
2131
Victor Stinner137c34c2010-09-29 10:25:54 +00002132 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2133 if (buffer == NULL) {
2134 PyErr_NoMemory();
2135 return NULL;
2136 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002137 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 if (buflen == -1)
2139 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002140 if (size != NULL)
2141 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002142 return buffer;
2143}
2144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
Alexander Belopolsky40018472011-02-26 01:02:56 +00002147PyObject *
2148PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002151 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002152 PyErr_SetString(PyExc_ValueError,
2153 "chr() arg not in range(0x110000)");
2154 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002155 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 if (ordinal < 256)
2158 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 v = PyUnicode_New(1, ordinal);
2161 if (v == NULL)
2162 return NULL;
2163 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2164 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002165}
2166
Alexander Belopolsky40018472011-02-26 01:02:56 +00002167PyObject *
2168PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002170 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002171 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002172 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002173 if (PyUnicode_READY(obj))
2174 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 Py_INCREF(obj);
2176 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002177 }
2178 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002179 /* For a Unicode subtype that's not a Unicode object,
2180 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002181 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002182 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002183 PyErr_Format(PyExc_TypeError,
2184 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002185 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002186 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002187}
2188
Alexander Belopolsky40018472011-02-26 01:02:56 +00002189PyObject *
2190PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002191 const char *encoding,
2192 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002193{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002194 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002195 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002196
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 PyErr_BadInternalCall();
2199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002201
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002202 /* Decoding bytes objects is the most common case and should be fast */
2203 if (PyBytes_Check(obj)) {
2204 if (PyBytes_GET_SIZE(obj) == 0) {
2205 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002206 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002207 }
2208 else {
2209 v = PyUnicode_Decode(
2210 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2211 encoding, errors);
2212 }
2213 return v;
2214 }
2215
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002216 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002217 PyErr_SetString(PyExc_TypeError,
2218 "decoding str is not supported");
2219 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002221
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002222 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2223 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2224 PyErr_Format(PyExc_TypeError,
2225 "coercing to str: need bytes, bytearray "
2226 "or buffer-like object, %.80s found",
2227 Py_TYPE(obj)->tp_name);
2228 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002229 }
Tim Petersced69f82003-09-16 20:30:58 +00002230
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002231 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002233 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 }
Tim Petersced69f82003-09-16 20:30:58 +00002235 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002236 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002237
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002238 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002239 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240}
2241
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   The normalized, NUL-terminated name is written to lower, a buffer of
   lower_len bytes.  Return 1 on success.  Return 0 on error (lower_len is 0,
   or encoding is longer than lower_len-1); in that case lower is left
   without a terminating NUL and must not be used by the caller. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; without this
       check lower_len - 1 would wrap around and the bound below would
       point far outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the final NUL */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            /* Advance e outside the macro argument so the macro never
               sees an expression with side effects. */
            *l++ = Py_TOLOWER(*e);
            e++;
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2274
Alexander Belopolsky40018472011-02-26 01:02:56 +00002275PyObject *
2276PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002277 Py_ssize_t size,
2278 const char *encoding,
2279 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002280{
2281 PyObject *buffer = NULL, *unicode;
2282 Py_buffer info;
2283 char lower[11]; /* Enough for any encoding shortcut */
2284
2285 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002286 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002287
2288 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002289 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002290 if ((strcmp(lower, "utf-8") == 0) ||
2291 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002292 return PyUnicode_DecodeUTF8(s, size, errors);
2293 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002294 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002295 (strcmp(lower, "iso-8859-1") == 0))
2296 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002297#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002298 else if (strcmp(lower, "mbcs") == 0)
2299 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002300#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002301 else if (strcmp(lower, "ascii") == 0)
2302 return PyUnicode_DecodeASCII(s, size, errors);
2303 else if (strcmp(lower, "utf-16") == 0)
2304 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2305 else if (strcmp(lower, "utf-32") == 0)
2306 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
2309 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002310 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002311 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002312 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002313 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002314 if (buffer == NULL)
2315 goto onError;
2316 unicode = PyCodec_Decode(buffer, encoding, errors);
2317 if (unicode == NULL)
2318 goto onError;
2319 if (!PyUnicode_Check(unicode)) {
2320 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002321 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002322 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002323 Py_DECREF(unicode);
2324 goto onError;
2325 }
2326 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 if (PyUnicode_READY(unicode)) {
2328 Py_DECREF(unicode);
2329 return NULL;
2330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002331 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002332
Benjamin Peterson29060642009-01-31 22:14:21 +00002333 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334 Py_XDECREF(buffer);
2335 return NULL;
2336}
2337
Alexander Belopolsky40018472011-02-26 01:02:56 +00002338PyObject *
2339PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002340 const char *encoding,
2341 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002342{
2343 PyObject *v;
2344
2345 if (!PyUnicode_Check(unicode)) {
2346 PyErr_BadArgument();
2347 goto onError;
2348 }
2349
2350 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002352
2353 /* Decode via the codec registry */
2354 v = PyCodec_Decode(unicode, encoding, errors);
2355 if (v == NULL)
2356 goto onError;
2357 return v;
2358
Benjamin Peterson29060642009-01-31 22:14:21 +00002359 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002360 return NULL;
2361}
2362
Alexander Belopolsky40018472011-02-26 01:02:56 +00002363PyObject *
2364PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002365 const char *encoding,
2366 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002367{
2368 PyObject *v;
2369
2370 if (!PyUnicode_Check(unicode)) {
2371 PyErr_BadArgument();
2372 goto onError;
2373 }
2374
2375 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002377
2378 /* Decode via the codec registry */
2379 v = PyCodec_Decode(unicode, encoding, errors);
2380 if (v == NULL)
2381 goto onError;
2382 if (!PyUnicode_Check(v)) {
2383 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002384 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002385 Py_TYPE(v)->tp_name);
2386 Py_DECREF(v);
2387 goto onError;
2388 }
2389 return v;
2390
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002392 return NULL;
2393}
2394
Alexander Belopolsky40018472011-02-26 01:02:56 +00002395PyObject *
2396PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002397 Py_ssize_t size,
2398 const char *encoding,
2399 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400{
2401 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002402
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 unicode = PyUnicode_FromUnicode(s, size);
2404 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2407 Py_DECREF(unicode);
2408 return v;
2409}
2410
Alexander Belopolsky40018472011-02-26 01:02:56 +00002411PyObject *
2412PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002413 const char *encoding,
2414 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002415{
2416 PyObject *v;
2417
2418 if (!PyUnicode_Check(unicode)) {
2419 PyErr_BadArgument();
2420 goto onError;
2421 }
2422
2423 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002425
2426 /* Encode via the codec registry */
2427 v = PyCodec_Encode(unicode, encoding, errors);
2428 if (v == NULL)
2429 goto onError;
2430 return v;
2431
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002433 return NULL;
2434}
2435
Victor Stinnerad158722010-10-27 00:25:46 +00002436PyObject *
2437PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002438{
Victor Stinner99b95382011-07-04 14:23:54 +02002439#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002440 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2441 PyUnicode_GET_SIZE(unicode),
2442 NULL);
2443#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002445#else
Victor Stinner793b5312011-04-27 00:24:21 +02002446 PyInterpreterState *interp = PyThreadState_GET()->interp;
2447 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2448 cannot use it to encode and decode filenames before it is loaded. Load
2449 the Python codec requires to encode at least its own filename. Use the C
2450 version of the locale codec until the codec registry is initialized and
2451 the Python codec is loaded.
2452
2453 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2454 cannot only rely on it: check also interp->fscodec_initialized for
2455 subinterpreters. */
2456 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002457 return PyUnicode_AsEncodedString(unicode,
2458 Py_FileSystemDefaultEncoding,
2459 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002460 }
2461 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002462 /* locale encoding with surrogateescape */
2463 wchar_t *wchar;
2464 char *bytes;
2465 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002466 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002467
2468 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2469 if (wchar == NULL)
2470 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002471 bytes = _Py_wchar2char(wchar, &error_pos);
2472 if (bytes == NULL) {
2473 if (error_pos != (size_t)-1) {
2474 char *errmsg = strerror(errno);
2475 PyObject *exc = NULL;
2476 if (errmsg == NULL)
2477 errmsg = "Py_wchar2char() failed";
2478 raise_encode_exception(&exc,
2479 "filesystemencoding",
2480 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2481 error_pos, error_pos+1,
2482 errmsg);
2483 Py_XDECREF(exc);
2484 }
2485 else
2486 PyErr_NoMemory();
2487 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002488 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002489 }
2490 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002491
2492 bytes_obj = PyBytes_FromString(bytes);
2493 PyMem_Free(bytes);
2494 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002495 }
Victor Stinnerad158722010-10-27 00:25:46 +00002496#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002497}
2498
Alexander Belopolsky40018472011-02-26 01:02:56 +00002499PyObject *
2500PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002501 const char *encoding,
2502 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503{
2504 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002505 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 if (!PyUnicode_Check(unicode)) {
2508 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 }
Fred Drakee4315f52000-05-09 19:53:39 +00002511
Victor Stinner2f283c22011-03-02 01:21:46 +00002512 if (encoding == NULL) {
2513 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002515 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002517 }
Fred Drakee4315f52000-05-09 19:53:39 +00002518
2519 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002520 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002521 if ((strcmp(lower, "utf-8") == 0) ||
2522 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002523 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002524 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002526 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002528 }
Victor Stinner37296e82010-06-10 13:36:23 +00002529 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002530 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002531 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002533#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002534 else if (strcmp(lower, "mbcs") == 0)
2535 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2536 PyUnicode_GET_SIZE(unicode),
2537 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002538#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002539 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542
2543 /* Encode via the codec registry */
2544 v = PyCodec_Encode(unicode, encoding, errors);
2545 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002546 return NULL;
2547
2548 /* The normal path */
2549 if (PyBytes_Check(v))
2550 return v;
2551
2552 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002553 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002554 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002555 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002556
2557 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2558 "encoder %s returned bytearray instead of bytes",
2559 encoding);
2560 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002561 Py_DECREF(v);
2562 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002563 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002564
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002565 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2566 Py_DECREF(v);
2567 return b;
2568 }
2569
2570 PyErr_Format(PyExc_TypeError,
2571 "encoder did not return a bytes object (type=%.400s)",
2572 Py_TYPE(v)->tp_name);
2573 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002574 return NULL;
2575}
2576
Alexander Belopolsky40018472011-02-26 01:02:56 +00002577PyObject *
2578PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002579 const char *encoding,
2580 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002581{
2582 PyObject *v;
2583
2584 if (!PyUnicode_Check(unicode)) {
2585 PyErr_BadArgument();
2586 goto onError;
2587 }
2588
2589 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002590 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002591
2592 /* Encode via the codec registry */
2593 v = PyCodec_Encode(unicode, encoding, errors);
2594 if (v == NULL)
2595 goto onError;
2596 if (!PyUnicode_Check(v)) {
2597 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002598 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002599 Py_TYPE(v)->tp_name);
2600 Py_DECREF(v);
2601 goto onError;
2602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002604
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 return NULL;
2607}
2608
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002609PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002610PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002611 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002612 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2613}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002614
Christian Heimes5894ba72007-11-04 11:43:14 +00002615PyObject*
2616PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2617{
Victor Stinner99b95382011-07-04 14:23:54 +02002618#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002619 return PyUnicode_DecodeMBCS(s, size, NULL);
2620#elif defined(__APPLE__)
2621 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2622#else
Victor Stinner793b5312011-04-27 00:24:21 +02002623 PyInterpreterState *interp = PyThreadState_GET()->interp;
2624 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2625 cannot use it to encode and decode filenames before it is loaded. Load
2626 the Python codec requires to encode at least its own filename. Use the C
2627 version of the locale codec until the codec registry is initialized and
2628 the Python codec is loaded.
2629
2630 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2631 cannot only rely on it: check also interp->fscodec_initialized for
2632 subinterpreters. */
2633 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002634 return PyUnicode_Decode(s, size,
2635 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002636 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002637 }
2638 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002639 /* locale encoding with surrogateescape */
2640 wchar_t *wchar;
2641 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002642 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002643
2644 if (s[size] != '\0' || size != strlen(s)) {
2645 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2646 return NULL;
2647 }
2648
Victor Stinner168e1172010-10-16 23:16:16 +00002649 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002650 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002651 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002652
Victor Stinner168e1172010-10-16 23:16:16 +00002653 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002654 PyMem_Free(wchar);
2655 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002656 }
Victor Stinnerad158722010-10-27 00:25:46 +00002657#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002658}
2659
Martin v. Löwis011e8422009-05-05 04:43:17 +00002660
2661int
2662PyUnicode_FSConverter(PyObject* arg, void* addr)
2663{
2664 PyObject *output = NULL;
2665 Py_ssize_t size;
2666 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002667 if (arg == NULL) {
2668 Py_DECREF(*(PyObject**)addr);
2669 return 1;
2670 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002671 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002672 output = arg;
2673 Py_INCREF(output);
2674 }
2675 else {
2676 arg = PyUnicode_FromObject(arg);
2677 if (!arg)
2678 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002679 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002680 Py_DECREF(arg);
2681 if (!output)
2682 return 0;
2683 if (!PyBytes_Check(output)) {
2684 Py_DECREF(output);
2685 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2686 return 0;
2687 }
2688 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002689 size = PyBytes_GET_SIZE(output);
2690 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002691 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002692 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002693 Py_DECREF(output);
2694 return 0;
2695 }
2696 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002697 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002698}
2699
2700
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002701int
2702PyUnicode_FSDecoder(PyObject* arg, void* addr)
2703{
2704 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002705 if (arg == NULL) {
2706 Py_DECREF(*(PyObject**)addr);
2707 return 1;
2708 }
2709 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 if (PyUnicode_READY(arg))
2711 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002712 output = arg;
2713 Py_INCREF(output);
2714 }
2715 else {
2716 arg = PyBytes_FromObject(arg);
2717 if (!arg)
2718 return 0;
2719 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2720 PyBytes_GET_SIZE(arg));
2721 Py_DECREF(arg);
2722 if (!output)
2723 return 0;
2724 if (!PyUnicode_Check(output)) {
2725 Py_DECREF(output);
2726 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2727 return 0;
2728 }
2729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2731 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002732 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2733 Py_DECREF(output);
2734 return 0;
2735 }
2736 *(PyObject**)addr = output;
2737 return Py_CLEANUP_SUPPORTED;
2738}
2739
2740
Martin v. Löwis5b222132007-06-10 09:51:05 +00002741char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002743{
Christian Heimesf3863112007-11-22 07:46:41 +00002744 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2746
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002747 if (!PyUnicode_Check(unicode)) {
2748 PyErr_BadArgument();
2749 return NULL;
2750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002752 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002754 if (PyUnicode_UTF8(unicode) == NULL) {
2755 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2757 if (bytes == NULL)
2758 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002759 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2760 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 Py_DECREF(bytes);
2762 return NULL;
2763 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002764 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2765 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 Py_DECREF(bytes);
2767 }
2768
2769 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002770 *psize = PyUnicode_UTF8_LENGTH(unicode);
2771 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002772}
2773
2774char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002777 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2778}
2779
2780#ifdef Py_DEBUG
2781int unicode_as_unicode_calls = 0;
2782#endif
2783
2784
2785Py_UNICODE *
2786PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2787{
2788 PyUnicodeObject *u;
2789 const unsigned char *one_byte;
2790#if SIZEOF_WCHAR_T == 4
2791 const Py_UCS2 *two_bytes;
2792#else
2793 const Py_UCS4 *four_bytes;
2794 const Py_UCS4 *ucs4_end;
2795 Py_ssize_t num_surrogates;
2796#endif
2797 wchar_t *w;
2798 wchar_t *wchar_end;
2799
2800 if (!PyUnicode_Check(unicode)) {
2801 PyErr_BadArgument();
2802 return NULL;
2803 }
2804 u = (PyUnicodeObject*)unicode;
2805 if (_PyUnicode_WSTR(u) == NULL) {
2806 /* Non-ASCII compact unicode object */
2807 assert(_PyUnicode_KIND(u) != 0);
2808 assert(PyUnicode_IS_READY(u));
2809
2810#ifdef Py_DEBUG
2811 ++unicode_as_unicode_calls;
2812#endif
2813
2814 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2815#if SIZEOF_WCHAR_T == 2
2816 four_bytes = PyUnicode_4BYTE_DATA(u);
2817 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2818 num_surrogates = 0;
2819
2820 for (; four_bytes < ucs4_end; ++four_bytes) {
2821 if (*four_bytes > 0xFFFF)
2822 ++num_surrogates;
2823 }
2824
2825 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2826 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2827 if (!_PyUnicode_WSTR(u)) {
2828 PyErr_NoMemory();
2829 return NULL;
2830 }
2831 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2832
2833 w = _PyUnicode_WSTR(u);
2834 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2835 four_bytes = PyUnicode_4BYTE_DATA(u);
2836 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2837 if (*four_bytes > 0xFFFF) {
2838 /* encode surrogate pair in this case */
2839 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2840 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2841 }
2842 else
2843 *w = *four_bytes;
2844
2845 if (w > wchar_end) {
2846 assert(0 && "Miscalculated string end");
2847 }
2848 }
2849 *w = 0;
2850#else
2851 /* sizeof(wchar_t) == 4 */
2852 Py_FatalError("Impossible unicode object state, wstr and str "
2853 "should share memory already.");
2854 return NULL;
2855#endif
2856 }
2857 else {
2858 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2859 (_PyUnicode_LENGTH(u) + 1));
2860 if (!_PyUnicode_WSTR(u)) {
2861 PyErr_NoMemory();
2862 return NULL;
2863 }
2864 if (!PyUnicode_IS_COMPACT_ASCII(u))
2865 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2866 w = _PyUnicode_WSTR(u);
2867 wchar_end = w + _PyUnicode_LENGTH(u);
2868
2869 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2870 one_byte = PyUnicode_1BYTE_DATA(u);
2871 for (; w < wchar_end; ++one_byte, ++w)
2872 *w = *one_byte;
2873 /* null-terminate the wstr */
2874 *w = 0;
2875 }
2876 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2877#if SIZEOF_WCHAR_T == 4
2878 two_bytes = PyUnicode_2BYTE_DATA(u);
2879 for (; w < wchar_end; ++two_bytes, ++w)
2880 *w = *two_bytes;
2881 /* null-terminate the wstr */
2882 *w = 0;
2883#else
2884 /* sizeof(wchar_t) == 2 */
2885 PyObject_FREE(_PyUnicode_WSTR(u));
2886 _PyUnicode_WSTR(u) = NULL;
2887 Py_FatalError("Impossible unicode object state, wstr "
2888 "and str should share memory already.");
2889 return NULL;
2890#endif
2891 }
2892 else {
2893 assert(0 && "This should never happen.");
2894 }
2895 }
2896 }
2897 if (size != NULL)
2898 *size = PyUnicode_WSTR_LENGTH(u);
2899 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002900}
2901
Alexander Belopolsky40018472011-02-26 01:02:56 +00002902Py_UNICODE *
2903PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002905 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906}
2907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909Py_ssize_t
2910PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911{
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_BadArgument();
2914 goto onError;
2915 }
2916 return PyUnicode_GET_SIZE(unicode);
2917
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 return -1;
2920}
2921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922Py_ssize_t
2923PyUnicode_GetLength(PyObject *unicode)
2924{
Victor Stinner5a706cf2011-10-02 00:36:53 +02002925 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002926 PyErr_BadArgument();
2927 return -1;
2928 }
2929
2930 return PyUnicode_GET_LENGTH(unicode);
2931}
2932
2933Py_UCS4
2934PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2935{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02002936 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
2937 PyErr_BadArgument();
2938 return (Py_UCS4)-1;
2939 }
2940 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2941 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 return (Py_UCS4)-1;
2943 }
2944 return PyUnicode_READ_CHAR(unicode, index);
2945}
2946
2947int
2948PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2949{
2950 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02002951 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 return -1;
2953 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02002954 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
2955 PyErr_SetString(PyExc_IndexError, "string index out of range");
2956 return -1;
2957 }
2958 if (_PyUnicode_Dirty(unicode))
2959 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2961 index, ch);
2962 return 0;
2963}
2964
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2970
Victor Stinner554f3f02010-06-16 23:33:54 +00002971/* create or adjust a UnicodeDecodeError */
2972static void
2973make_decode_exception(PyObject **exceptionObject,
2974 const char *encoding,
2975 const char *input, Py_ssize_t length,
2976 Py_ssize_t startpos, Py_ssize_t endpos,
2977 const char *reason)
2978{
2979 if (*exceptionObject == NULL) {
2980 *exceptionObject = PyUnicodeDecodeError_Create(
2981 encoding, input, length, startpos, endpos, reason);
2982 }
2983 else {
2984 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2985 goto onError;
2986 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2987 goto onError;
2988 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2989 goto onError;
2990 }
2991 return;
2992
2993onError:
2994 Py_DECREF(*exceptionObject);
2995 *exceptionObject = NULL;
2996}
2997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002998/* error handling callback helper:
2999 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003000 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003001 and adjust various state variables.
3002 return 0 on success, -1 on error
3003*/
3004
Alexander Belopolsky40018472011-02-26 01:02:56 +00003005static int
3006unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003007 const char *encoding, const char *reason,
3008 const char **input, const char **inend, Py_ssize_t *startinpos,
3009 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3010 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003012 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013
3014 PyObject *restuple = NULL;
3015 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003017 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003018 Py_ssize_t requiredsize;
3019 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003021 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003022 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 int res = -1;
3024
3025 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 *errorHandler = PyCodec_LookupError(errors);
3027 if (*errorHandler == NULL)
3028 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003029 }
3030
Victor Stinner554f3f02010-06-16 23:33:54 +00003031 make_decode_exception(exceptionObject,
3032 encoding,
3033 *input, *inend - *input,
3034 *startinpos, *endinpos,
3035 reason);
3036 if (*exceptionObject == NULL)
3037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038
3039 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3040 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003042 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003043 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003045 }
3046 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003048
3049 /* Copy back the bytes variables, which might have been modified by the
3050 callback */
3051 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3052 if (!inputobj)
3053 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003054 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003056 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003057 *input = PyBytes_AS_STRING(inputobj);
3058 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003059 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003060 /* we can DECREF safely, as the exception has another reference,
3061 so the object won't go away. */
3062 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003066 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3068 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070
3071 /* need more space? (at least enough for what we
3072 have+the replacement+the rest of the string (starting
3073 at the new input position), so we won't have to check space
3074 when there are no errors in the rest of the string) */
3075 repptr = PyUnicode_AS_UNICODE(repunicode);
3076 repsize = PyUnicode_GET_SIZE(repunicode);
3077 requiredsize = *outpos + repsize + insize-newpos;
3078 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 if (requiredsize<2*outsize)
3080 requiredsize = 2*outsize;
3081 if (_PyUnicode_Resize(output, requiredsize) < 0)
3082 goto onError;
3083 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003084 }
3085 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003086 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003087 Py_UNICODE_COPY(*outptr, repptr, repsize);
3088 *outptr += repsize;
3089 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003090
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 /* we made it! */
3092 res = 0;
3093
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 Py_XDECREF(restuple);
3096 return res;
3097}
3098
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003099/* --- UTF-7 Codec -------------------------------------------------------- */
3100
Antoine Pitrou244651a2009-05-04 18:56:13 +00003101/* See RFC2152 for details. We encode conservatively and decode liberally. */
3102
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass side-effect free expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (any character that is not a letter, digit or '+' maps to 63) */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3133
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3167
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as an index into
 * utf7_category; NUL and non-ASCII characters are never direct. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) > 0 && (c) < 128 && \
     ((directO && (utf7_category[(c)] == 1)) || \
      (directWS && (utf7_category[(c)] == 2)) || \
      (utf7_category[(c)] == 0)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003179
Alexander Belopolsky40018472011-02-26 01:02:56 +00003180PyObject *
3181PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003182 Py_ssize_t size,
3183 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003184{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003185 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3186}
3187
Antoine Pitrou244651a2009-05-04 18:56:13 +00003188/* The decoder. The only state we preserve is our read position,
3189 * i.e. how many characters we have consumed. So if we end in the
3190 * middle of a shift sequence we have to back off the read position
3191 * and the output to the beginning of the sequence, otherwise we lose
3192 * all the shift state (seen bits, number of bits seen, high
3193 * surrogate). */
3194
Alexander Belopolsky40018472011-02-26 01:02:56 +00003195PyObject *
3196PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003197 Py_ssize_t size,
3198 const char *errors,
3199 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003202 Py_ssize_t startinpos;
3203 Py_ssize_t endinpos;
3204 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 const char *e;
3206 PyUnicodeObject *unicode;
3207 Py_UNICODE *p;
3208 const char *errmsg = "";
3209 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003210 Py_UNICODE *shiftOutStart;
3211 unsigned int base64bits = 0;
3212 unsigned long base64buffer = 0;
3213 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003216
3217 unicode = _PyUnicode_New(size);
3218 if (!unicode)
3219 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003220 if (size == 0) {
3221 if (consumed)
3222 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003223 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003224 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003227 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003228 e = s + size;
3229
3230 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003233 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234
Antoine Pitrou244651a2009-05-04 18:56:13 +00003235 if (inShift) { /* in a base-64 section */
3236 if (IS_BASE64(ch)) { /* consume a base-64 character */
3237 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3238 base64bits += 6;
3239 s++;
3240 if (base64bits >= 16) {
3241 /* we have enough bits for a UTF-16 value */
3242 Py_UNICODE outCh = (Py_UNICODE)
3243 (base64buffer >> (base64bits-16));
3244 base64bits -= 16;
3245 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3246 if (surrogate) {
3247 /* expecting a second surrogate */
3248 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3249#ifdef Py_UNICODE_WIDE
3250 *p++ = (((surrogate & 0x3FF)<<10)
3251 | (outCh & 0x3FF)) + 0x10000;
3252#else
3253 *p++ = surrogate;
3254 *p++ = outCh;
3255#endif
3256 surrogate = 0;
3257 }
3258 else {
3259 surrogate = 0;
3260 errmsg = "second surrogate missing";
3261 goto utf7Error;
3262 }
3263 }
3264 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3265 /* first surrogate */
3266 surrogate = outCh;
3267 }
3268 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3269 errmsg = "unexpected second surrogate";
3270 goto utf7Error;
3271 }
3272 else {
3273 *p++ = outCh;
3274 }
3275 }
3276 }
3277 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 inShift = 0;
3279 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003280 if (surrogate) {
3281 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003282 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003283 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003284 if (base64bits > 0) { /* left-over bits */
3285 if (base64bits >= 6) {
3286 /* We've seen at least one base-64 character */
3287 errmsg = "partial character in shift sequence";
3288 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003289 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003290 else {
3291 /* Some bits remain; they should be zero */
3292 if (base64buffer != 0) {
3293 errmsg = "non-zero padding bits in shift sequence";
3294 goto utf7Error;
3295 }
3296 }
3297 }
3298 if (ch != '-') {
3299 /* '-' is absorbed; other terminating
3300 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003301 *p++ = ch;
3302 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003303 }
3304 }
3305 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003307 s++; /* consume '+' */
3308 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003309 s++;
3310 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003311 }
3312 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003313 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003314 shiftOutStart = p;
3315 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 }
3317 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003318 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003319 *p++ = ch;
3320 s++;
3321 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003322 else {
3323 startinpos = s-starts;
3324 s++;
3325 errmsg = "unexpected special character";
3326 goto utf7Error;
3327 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003328 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003329utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 outpos = p-PyUnicode_AS_UNICODE(unicode);
3331 endinpos = s-starts;
3332 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 errors, &errorHandler,
3334 "utf7", errmsg,
3335 &starts, &e, &startinpos, &endinpos, &exc, &s,
3336 &unicode, &outpos, &p))
3337 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338 }
3339
Antoine Pitrou244651a2009-05-04 18:56:13 +00003340 /* end of string */
3341
3342 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3343 /* if we're in an inconsistent state, that's an error */
3344 if (surrogate ||
3345 (base64bits >= 6) ||
3346 (base64bits > 0 && base64buffer != 0)) {
3347 outpos = p-PyUnicode_AS_UNICODE(unicode);
3348 endinpos = size;
3349 if (unicode_decode_call_errorhandler(
3350 errors, &errorHandler,
3351 "utf7", "unterminated shift sequence",
3352 &starts, &e, &startinpos, &endinpos, &exc, &s,
3353 &unicode, &outpos, &p))
3354 goto onError;
3355 if (s < e)
3356 goto restart;
3357 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003359
3360 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003361 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003362 if (inShift) {
3363 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003364 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003365 }
3366 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003367 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003368 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003369 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003370
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003371 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003372 goto onError;
3373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 Py_XDECREF(errorHandler);
3375 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 if (PyUnicode_READY(unicode) == -1) {
3377 Py_DECREF(unicode);
3378 return NULL;
3379 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003380 return (PyObject *)unicode;
3381
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 Py_XDECREF(errorHandler);
3384 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003385 Py_DECREF(unicode);
3386 return NULL;
3387}
3388
3389
Alexander Belopolsky40018472011-02-26 01:02:56 +00003390PyObject *
3391PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003392 Py_ssize_t size,
3393 int base64SetO,
3394 int base64WhiteSpace,
3395 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003396{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003397 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003398 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003399 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003400 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003401 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003402 unsigned int base64bits = 0;
3403 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003404 char * out;
3405 char * start;
3406
3407 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003409
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003410 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003411 return PyErr_NoMemory();
3412
Antoine Pitrou244651a2009-05-04 18:56:13 +00003413 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003414 if (v == NULL)
3415 return NULL;
3416
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003417 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003418 for (;i < size; ++i) {
3419 Py_UNICODE ch = s[i];
3420
Antoine Pitrou244651a2009-05-04 18:56:13 +00003421 if (inShift) {
3422 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3423 /* shifting out */
3424 if (base64bits) { /* output remaining bits */
3425 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3426 base64buffer = 0;
3427 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003428 }
3429 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003430 /* Characters not in the BASE64 set implicitly unshift the sequence
3431 so no '-' is required, except if the character is itself a '-' */
3432 if (IS_BASE64(ch) || ch == '-') {
3433 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003435 *out++ = (char) ch;
3436 }
3437 else {
3438 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003439 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003440 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003441 else { /* not in a shift sequence */
3442 if (ch == '+') {
3443 *out++ = '+';
3444 *out++ = '-';
3445 }
3446 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3447 *out++ = (char) ch;
3448 }
3449 else {
3450 *out++ = '+';
3451 inShift = 1;
3452 goto encode_char;
3453 }
3454 }
3455 continue;
3456encode_char:
3457#ifdef Py_UNICODE_WIDE
3458 if (ch >= 0x10000) {
3459 /* code first surrogate */
3460 base64bits += 16;
3461 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3462 while (base64bits >= 6) {
3463 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3464 base64bits -= 6;
3465 }
3466 /* prepare second surrogate */
3467 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3468 }
3469#endif
3470 base64bits += 16;
3471 base64buffer = (base64buffer << 16) | ch;
3472 while (base64bits >= 6) {
3473 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3474 base64bits -= 6;
3475 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003476 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003477 if (base64bits)
3478 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3479 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003480 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003481 if (_PyBytes_Resize(&v, out - start) < 0)
3482 return NULL;
3483 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003484}
3485
Antoine Pitrou244651a2009-05-04 18:56:13 +00003486#undef IS_BASE64
3487#undef FROM_BASE64
3488#undef TO_BASE64
3489#undef DECODE_DIRECT
3490#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003491
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492/* --- UTF-8 Codec -------------------------------------------------------- */
3493
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is read-only and indexed by the byte value (0..255). */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3515
Alexander Belopolsky40018472011-02-26 01:02:56 +00003516PyObject *
3517PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003518 Py_ssize_t size,
3519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520{
Walter Dörwald69652032004-09-07 20:24:22 +00003521 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3522}
3523
Antoine Pitrouab868312009-01-10 15:40:25 +00003524/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3525#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3526
3527/* Mask to quickly check whether a C 'long' contains a
3528 non-ASCII, UTF8-encoded char. */
3529#if (SIZEOF_LONG == 8)
3530# define ASCII_CHAR_MASK 0x8080808080808080L
3531#elif (SIZEOF_LONG == 4)
3532# define ASCII_CHAR_MASK 0x80808080L
3533#else
3534# error C 'long' size should be either 4 or 8!
3535#endif
3536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537/* Scans a UTF-8 string and returns the maximum character to be expected,
3538 the size of the decoded unicode string and if any major errors were
3539 encountered.
3540
3541 This function does check basic UTF-8 sanity, it does however NOT CHECK
3542 if the string contains surrogates, and if all continuation bytes are
3543 within the correct ranges, these checks are performed in
3544 PyUnicode_DecodeUTF8Stateful.
3545
3546 If it sets has_errors to 1, it means the value of unicode_size and max_char
3547 will be bogus and you should not rely on useful information in them.
3548 */
3549static Py_UCS4
3550utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3551 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3552 int *has_errors)
3553{
3554 Py_ssize_t n;
3555 Py_ssize_t char_count = 0;
3556 Py_UCS4 max_char = 127, new_max;
3557 Py_UCS4 upper_bound;
3558 const unsigned char *p = (const unsigned char *)s;
3559 const unsigned char *end = p + string_size;
3560 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3561 int err = 0;
3562
3563 for (; p < end && !err; ++p, ++char_count) {
3564 /* Only check value if it's not a ASCII char... */
3565 if (*p < 0x80) {
3566 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3567 an explanation. */
3568 if (!((size_t) p & LONG_PTR_MASK)) {
3569 /* Help register allocation */
3570 register const unsigned char *_p = p;
3571 while (_p < aligned_end) {
3572 unsigned long value = *(unsigned long *) _p;
3573 if (value & ASCII_CHAR_MASK)
3574 break;
3575 _p += SIZEOF_LONG;
3576 char_count += SIZEOF_LONG;
3577 }
3578 p = _p;
3579 if (p == end)
3580 break;
3581 }
3582 }
3583 if (*p >= 0x80) {
3584 n = utf8_code_length[*p];
3585 new_max = max_char;
3586 switch (n) {
3587 /* invalid start byte */
3588 case 0:
3589 err = 1;
3590 break;
3591 case 2:
3592 /* Code points between 0x00FF and 0x07FF inclusive.
3593 Approximate the upper bound of the code point,
3594 if this flips over 255 we can be sure it will be more
3595 than 255 and the string will need 2 bytes per code coint,
3596 if it stays under or equal to 255, we can be sure 1 byte
3597 is enough.
3598 ((*p & 0b00011111) << 6) | 0b00111111 */
3599 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3600 if (max_char < upper_bound)
3601 new_max = upper_bound;
3602 /* Ensure we track at least that we left ASCII space. */
3603 if (new_max < 128)
3604 new_max = 128;
3605 break;
3606 case 3:
3607 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3608 always > 255 and <= 65535 and will always need 2 bytes. */
3609 if (max_char < 65535)
3610 new_max = 65535;
3611 break;
3612 case 4:
3613 /* Code point will be above 0xFFFF for sure in this case. */
3614 new_max = 65537;
3615 break;
3616 /* Internal error, this should be caught by the first if */
3617 case 1:
3618 default:
3619 assert(0 && "Impossible case in utf8_max_char_and_size");
3620 err = 1;
3621 }
3622 /* Instead of number of overall bytes for this code point,
3623 n containts the number of following bytes: */
3624 --n;
3625 /* Check if the follow up chars are all valid continuation bytes */
3626 if (n >= 1) {
3627 const unsigned char *cont;
3628 if ((p + n) >= end) {
3629 if (consumed == 0)
3630 /* incomplete data, non-incremental decoding */
3631 err = 1;
3632 break;
3633 }
3634 for (cont = p + 1; cont < (p + n); ++cont) {
3635 if ((*cont & 0xc0) != 0x80) {
3636 err = 1;
3637 break;
3638 }
3639 }
3640 p += n;
3641 }
3642 else
3643 err = 1;
3644 max_char = new_max;
3645 }
3646 }
3647
3648 if (unicode_size)
3649 *unicode_size = char_count;
3650 if (has_errors)
3651 *has_errors = err;
3652 return max_char;
3653}
3654
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.
   `kind` is evaluated exactly once; `buf`, `index` and `value` are only
   evaluated in the branch that is selected.  Any kind other than the
   wchar, 1-byte and 2-byte kinds is treated as 4-byte storage. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        switch (k_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3669
Alexander Belopolsky40018472011-02-26 01:02:56 +00003670PyObject *
3671PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672 Py_ssize_t size,
3673 const char *errors,
3674 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003678 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003679 Py_ssize_t startinpos;
3680 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003681 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003683 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 PyObject *errorHandler = NULL;
3685 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686 Py_UCS4 maxchar = 0;
3687 Py_ssize_t unicode_size;
3688 Py_ssize_t i;
3689 int kind;
3690 void *data;
3691 int has_errors;
3692 Py_UNICODE *error_outptr;
3693#if SIZEOF_WCHAR_T == 2
3694 Py_ssize_t wchar_offset = 0;
3695#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
Walter Dörwald69652032004-09-07 20:24:22 +00003697 if (size == 0) {
3698 if (consumed)
3699 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3703 consumed, &has_errors);
3704 if (has_errors) {
3705 unicode = _PyUnicode_New(size);
3706 if (!unicode)
3707 return NULL;
3708 kind = PyUnicode_WCHAR_KIND;
3709 data = PyUnicode_AS_UNICODE(unicode);
3710 assert(data != NULL);
3711 }
3712 else {
3713 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3714 if (!unicode)
3715 return NULL;
3716 /* When the string is ASCII only, just use memcpy and return.
3717 unicode_size may be != size if there is an incomplete UTF-8
3718 sequence at the end of the ASCII block. */
3719 if (maxchar < 128 && size == unicode_size) {
3720 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3721 return (PyObject *)unicode;
3722 }
3723 kind = PyUnicode_KIND(unicode);
3724 data = PyUnicode_DATA(unicode);
3725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003729 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730
3731 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003732 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733
3734 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003735 /* Fast path for runs of ASCII characters. Given that common UTF-8
3736 input will consist of an overwhelming majority of ASCII
3737 characters, we try to optimize for this case by checking
3738 as many characters as a C 'long' can contain.
3739 First, check if we can do an aligned read, as most CPUs have
3740 a penalty for unaligned reads.
3741 */
3742 if (!((size_t) s & LONG_PTR_MASK)) {
3743 /* Help register allocation */
3744 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003745 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003746 while (_s < aligned_end) {
3747 /* Read a whole long at a time (either 4 or 8 bytes),
3748 and do a fast unrolled copy if it only contains ASCII
3749 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 unsigned long value = *(unsigned long *) _s;
3751 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003752 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3754 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3755 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3756 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003757#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3759 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3760 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3761 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003762#endif
3763 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003765 }
3766 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003768 if (s == e)
3769 break;
3770 ch = (unsigned char)*s;
3771 }
3772 }
3773
3774 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 s++;
3777 continue;
3778 }
3779
3780 n = utf8_code_length[ch];
3781
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003782 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 if (consumed)
3784 break;
3785 else {
3786 errmsg = "unexpected end of data";
3787 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003788 endinpos = startinpos+1;
3789 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3790 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 goto utf8Error;
3792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794
3795 switch (n) {
3796
3797 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003798 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 startinpos = s-starts;
3800 endinpos = startinpos+1;
3801 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802
3803 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003804 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 startinpos = s-starts;
3806 endinpos = startinpos+1;
3807 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
3809 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003810 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003811 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003813 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 goto utf8Error;
3815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003817 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 break;
3820
3821 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003822 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3823 will result in surrogates in range d800-dfff. Surrogates are
3824 not valid UTF-8 so they are rejected.
3825 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3826 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003827 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003828 (s[2] & 0xc0) != 0x80 ||
3829 ((unsigned char)s[0] == 0xE0 &&
3830 (unsigned char)s[1] < 0xA0) ||
3831 ((unsigned char)s[0] == 0xED &&
3832 (unsigned char)s[1] > 0x9F)) {
3833 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003835 endinpos = startinpos + 1;
3836
3837 /* if s[1] first two bits are 1 and 0, then the invalid
3838 continuation byte is s[2], so increment endinpos by 1,
3839 if not, s[1] is invalid and endinpos doesn't need to
3840 be incremented. */
3841 if ((s[1] & 0xC0) == 0x80)
3842 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 goto utf8Error;
3844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003846 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003848 break;
3849
3850 case 4:
3851 if ((s[1] & 0xc0) != 0x80 ||
3852 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003853 (s[3] & 0xc0) != 0x80 ||
3854 ((unsigned char)s[0] == 0xF0 &&
3855 (unsigned char)s[1] < 0x90) ||
3856 ((unsigned char)s[0] == 0xF4 &&
3857 (unsigned char)s[1] > 0x8F)) {
3858 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003860 endinpos = startinpos + 1;
3861 if ((s[1] & 0xC0) == 0x80) {
3862 endinpos++;
3863 if ((s[2] & 0xC0) == 0x80)
3864 endinpos++;
3865 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 goto utf8Error;
3867 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003868 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003869 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3870 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 /* If the string is flexible or we have native UCS-4, write
3873 directly.. */
3874 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3875 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 else {
3878 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 /* translate from 10000..10FFFF to 0..FFFF */
3881 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 /* high surrogate = top 10 bits added to D800 */
3884 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3885 (Py_UNICODE)(0xD800 + (ch >> 10)));
3886
3887 /* low surrogate = bottom 10 bits added to DC00 */
3888 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3889 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3890 }
3891#if SIZEOF_WCHAR_T == 2
3892 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003893#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 }
3896 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003898
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 /* If this is not yet a resizable string, make it one.. */
3901 if (kind != PyUnicode_WCHAR_KIND) {
3902 const Py_UNICODE *u;
3903 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3904 if (!new_unicode)
3905 goto onError;
3906 u = PyUnicode_AsUnicode((PyObject *)unicode);
3907 if (!u)
3908 goto onError;
3909#if SIZEOF_WCHAR_T == 2
3910 i += wchar_offset;
3911#endif
3912 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3913 Py_DECREF(unicode);
3914 unicode = new_unicode;
3915 kind = 0;
3916 data = PyUnicode_AS_UNICODE(new_unicode);
3917 assert(data != NULL);
3918 }
3919 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 if (unicode_decode_call_errorhandler(
3921 errors, &errorHandler,
3922 "utf8", errmsg,
3923 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 /* Update data because unicode_decode_call_errorhandler might have
3927 re-created or resized the unicode object. */
3928 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 /* Ensure the unicode_size calculation above was correct: */
3932 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3933
Walter Dörwald69652032004-09-07 20:24:22 +00003934 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003935 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 /* Adjust length and ready string when it contained errors and
3938 is of the old resizable kind. */
3939 if (kind == PyUnicode_WCHAR_KIND) {
3940 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3941 PyUnicode_READY(unicode) == -1)
3942 goto onError;
3943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 Py_XDECREF(errorHandler);
3946 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 if (PyUnicode_READY(unicode) == -1) {
3948 Py_DECREF(unicode);
3949 return NULL;
3950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 return (PyObject *)unicode;
3952
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 Py_XDECREF(errorHandler);
3955 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 Py_DECREF(unicode);
3957 return NULL;
3958}
3959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003961
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003962#ifdef __APPLE__
3963
3964/* Simplified UTF-8 decoder using surrogateescape error handler,
3965 used to decode the command line arguments on Mac OS X. */
3966
3967wchar_t*
3968_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3969{
3970 int n;
3971 const char *e;
3972 wchar_t *unicode, *p;
3973
3974 /* Note: size will always be longer than the resulting Unicode
3975 character count */
3976 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3977 PyErr_NoMemory();
3978 return NULL;
3979 }
3980 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3981 if (!unicode)
3982 return NULL;
3983
3984 /* Unpack UTF-8 encoded data */
3985 p = unicode;
3986 e = s + size;
3987 while (s < e) {
3988 Py_UCS4 ch = (unsigned char)*s;
3989
3990 if (ch < 0x80) {
3991 *p++ = (wchar_t)ch;
3992 s++;
3993 continue;
3994 }
3995
3996 n = utf8_code_length[ch];
3997 if (s + n > e) {
3998 goto surrogateescape;
3999 }
4000
4001 switch (n) {
4002 case 0:
4003 case 1:
4004 goto surrogateescape;
4005
4006 case 2:
4007 if ((s[1] & 0xc0) != 0x80)
4008 goto surrogateescape;
4009 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4010 assert ((ch > 0x007F) && (ch <= 0x07FF));
4011 *p++ = (wchar_t)ch;
4012 break;
4013
4014 case 3:
4015 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4016 will result in surrogates in range d800-dfff. Surrogates are
4017 not valid UTF-8 so they are rejected.
4018 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4019 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4020 if ((s[1] & 0xc0) != 0x80 ||
4021 (s[2] & 0xc0) != 0x80 ||
4022 ((unsigned char)s[0] == 0xE0 &&
4023 (unsigned char)s[1] < 0xA0) ||
4024 ((unsigned char)s[0] == 0xED &&
4025 (unsigned char)s[1] > 0x9F)) {
4026
4027 goto surrogateescape;
4028 }
4029 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4030 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004032 break;
4033
4034 case 4:
4035 if ((s[1] & 0xc0) != 0x80 ||
4036 (s[2] & 0xc0) != 0x80 ||
4037 (s[3] & 0xc0) != 0x80 ||
4038 ((unsigned char)s[0] == 0xF0 &&
4039 (unsigned char)s[1] < 0x90) ||
4040 ((unsigned char)s[0] == 0xF4 &&
4041 (unsigned char)s[1] > 0x8F)) {
4042 goto surrogateescape;
4043 }
4044 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4045 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4046 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4047
4048#if SIZEOF_WCHAR_T == 4
4049 *p++ = (wchar_t)ch;
4050#else
4051 /* compute and append the two surrogates: */
4052
4053 /* translate from 10000..10FFFF to 0..FFFF */
4054 ch -= 0x10000;
4055
4056 /* high surrogate = top 10 bits added to D800 */
4057 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4058
4059 /* low surrogate = bottom 10 bits added to DC00 */
4060 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4061#endif
4062 break;
4063 }
4064 s += n;
4065 continue;
4066
4067 surrogateescape:
4068 *p++ = 0xDC00 + ch;
4069 s++;
4070 }
4071 *p = L'\0';
4072 return unicode;
4073}
4074
4075#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077/* Primary internal function which creates utf8 encoded bytes objects.
4078
4079 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004080 and allocate exactly as much space needed at the end. Else allocate the
4081 maximum possible needed (4 result bytes per Unicode character), and return
4082 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004083*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004084PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086{
Tim Peters602f7402002-04-27 18:03:26 +00004087#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004088
Guido van Rossum98297ee2007-11-06 21:34:58 +00004089 Py_ssize_t i; /* index into s of next input byte */
4090 PyObject *result; /* result string object */
4091 char *p; /* next free byte in output buffer */
4092 Py_ssize_t nallocated; /* number of result bytes allocated */
4093 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004094 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004095 PyObject *errorHandler = NULL;
4096 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 int kind;
4098 void *data;
4099 Py_ssize_t size;
4100 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4101#if SIZEOF_WCHAR_T == 2
4102 Py_ssize_t wchar_offset = 0;
4103#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 if (!PyUnicode_Check(unicode)) {
4106 PyErr_BadArgument();
4107 return NULL;
4108 }
4109
4110 if (PyUnicode_READY(unicode) == -1)
4111 return NULL;
4112
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004113 if (PyUnicode_UTF8(unicode))
4114 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4115 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116
4117 kind = PyUnicode_KIND(unicode);
4118 data = PyUnicode_DATA(unicode);
4119 size = PyUnicode_GET_LENGTH(unicode);
4120
Tim Peters602f7402002-04-27 18:03:26 +00004121 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122
Tim Peters602f7402002-04-27 18:03:26 +00004123 if (size <= MAX_SHORT_UNICHARS) {
4124 /* Write into the stack buffer; nallocated can't overflow.
4125 * At the end, we'll allocate exactly as much heap space as it
4126 * turns out we need.
4127 */
4128 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004129 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004130 p = stackbuf;
4131 }
4132 else {
4133 /* Overallocate on the heap, and give the excess back at the end. */
4134 nallocated = size * 4;
4135 if (nallocated / 4 != size) /* overflow! */
4136 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004137 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004138 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004139 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004140 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004141 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004142
Tim Peters602f7402002-04-27 18:03:26 +00004143 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004145
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004146 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004147 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004149
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004151 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004152 *p++ = (char)(0xc0 | (ch >> 6));
4153 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004154 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155 Py_ssize_t newpos;
4156 PyObject *rep;
4157 Py_ssize_t repsize, k, startpos;
4158 startpos = i-1;
4159#if SIZEOF_WCHAR_T == 2
4160 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004161#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 rep = unicode_encode_call_errorhandler(
4163 errors, &errorHandler, "utf-8", "surrogates not allowed",
4164 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4165 &exc, startpos, startpos+1, &newpos);
4166 if (!rep)
4167 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 if (PyBytes_Check(rep))
4170 repsize = PyBytes_GET_SIZE(rep);
4171 else
4172 repsize = PyUnicode_GET_SIZE(rep);
4173
4174 if (repsize > 4) {
4175 Py_ssize_t offset;
4176
4177 if (result == NULL)
4178 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004179 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4183 /* integer overflow */
4184 PyErr_NoMemory();
4185 goto error;
4186 }
4187 nallocated += repsize - 4;
4188 if (result != NULL) {
4189 if (_PyBytes_Resize(&result, nallocated) < 0)
4190 goto error;
4191 } else {
4192 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004193 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 goto error;
4195 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4196 }
4197 p = PyBytes_AS_STRING(result) + offset;
4198 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004200 if (PyBytes_Check(rep)) {
4201 char *prep = PyBytes_AS_STRING(rep);
4202 for(k = repsize; k > 0; k--)
4203 *p++ = *prep++;
4204 } else /* rep is unicode */ {
4205 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4206 Py_UNICODE c;
4207
4208 for(k=0; k<repsize; k++) {
4209 c = prep[k];
4210 if (0x80 <= c) {
4211 raise_encode_exception(&exc, "utf-8",
4212 PyUnicode_AS_UNICODE(unicode),
4213 size, i-1, i,
4214 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004215 goto error;
4216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004217 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004218 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004221 } else if (ch < 0x10000) {
4222 *p++ = (char)(0xe0 | (ch >> 12));
4223 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4224 *p++ = (char)(0x80 | (ch & 0x3f));
4225 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004226 /* Encode UCS4 Unicode ordinals */
4227 *p++ = (char)(0xf0 | (ch >> 18));
4228 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4229 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4230 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231#if SIZEOF_WCHAR_T == 2
4232 wchar_offset++;
4233#endif
Tim Peters602f7402002-04-27 18:03:26 +00004234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004236
Guido van Rossum98297ee2007-11-06 21:34:58 +00004237 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004238 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004239 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004240 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004241 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004242 }
4243 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004244 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004245 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004246 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004247 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004249
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004250 Py_XDECREF(errorHandler);
4251 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004252 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004253 error:
4254 Py_XDECREF(errorHandler);
4255 Py_XDECREF(exc);
4256 Py_XDECREF(result);
4257 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004258
Tim Peters602f7402002-04-27 18:03:26 +00004259#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260}
4261
Alexander Belopolsky40018472011-02-26 01:02:56 +00004262PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4264 Py_ssize_t size,
4265 const char *errors)
4266{
4267 PyObject *v, *unicode;
4268
4269 unicode = PyUnicode_FromUnicode(s, size);
4270 if (unicode == NULL)
4271 return NULL;
4272 v = _PyUnicode_AsUTF8String(unicode, errors);
4273 Py_DECREF(unicode);
4274 return v;
4275}
4276
4277PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004278PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281}
4282
Walter Dörwald41980ca2007-08-16 21:55:45 +00004283/* --- UTF-32 Codec ------------------------------------------------------- */
4284
4285PyObject *
4286PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 Py_ssize_t size,
4288 const char *errors,
4289 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004290{
4291 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4292}
4293
4294PyObject *
4295PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 Py_ssize_t size,
4297 const char *errors,
4298 int *byteorder,
4299 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004300{
4301 const char *starts = s;
4302 Py_ssize_t startinpos;
4303 Py_ssize_t endinpos;
4304 Py_ssize_t outpos;
4305 PyUnicodeObject *unicode;
4306 Py_UNICODE *p;
4307#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004308 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004309 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310#else
4311 const int pairs = 0;
4312#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004313 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004314 int bo = 0; /* assume native ordering by default */
4315 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004316 /* Offsets from q for retrieving bytes in the right order. */
4317#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4318 int iorder[] = {0, 1, 2, 3};
4319#else
4320 int iorder[] = {3, 2, 1, 0};
4321#endif
4322 PyObject *errorHandler = NULL;
4323 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004324
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325 q = (unsigned char *)s;
4326 e = q + size;
4327
4328 if (byteorder)
4329 bo = *byteorder;
4330
4331 /* Check for BOM marks (U+FEFF) in the input and adjust current
4332 byte order setting accordingly. In native mode, the leading BOM
4333 mark is skipped, in all other modes, it is copied to the output
4334 stream as-is (giving a ZWNBSP character). */
4335 if (bo == 0) {
4336 if (size >= 4) {
4337 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004339#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 if (bom == 0x0000FEFF) {
4341 q += 4;
4342 bo = -1;
4343 }
4344 else if (bom == 0xFFFE0000) {
4345 q += 4;
4346 bo = 1;
4347 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 if (bom == 0x0000FEFF) {
4350 q += 4;
4351 bo = 1;
4352 }
4353 else if (bom == 0xFFFE0000) {
4354 q += 4;
4355 bo = -1;
4356 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004357#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359 }
4360
4361 if (bo == -1) {
4362 /* force LE */
4363 iorder[0] = 0;
4364 iorder[1] = 1;
4365 iorder[2] = 2;
4366 iorder[3] = 3;
4367 }
4368 else if (bo == 1) {
4369 /* force BE */
4370 iorder[0] = 3;
4371 iorder[1] = 2;
4372 iorder[2] = 1;
4373 iorder[3] = 0;
4374 }
4375
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004376 /* On narrow builds we split characters outside the BMP into two
4377 codepoints => count how much extra space we need. */
4378#ifndef Py_UNICODE_WIDE
4379 for (qq = q; qq < e; qq += 4)
4380 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4381 pairs++;
4382#endif
4383
4384 /* This might be one to much, because of a BOM */
4385 unicode = _PyUnicode_New((size+3)/4+pairs);
4386 if (!unicode)
4387 return NULL;
4388 if (size == 0)
4389 return (PyObject *)unicode;
4390
4391 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004393
Walter Dörwald41980ca2007-08-16 21:55:45 +00004394 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 Py_UCS4 ch;
4396 /* remaining bytes at the end? (size should be divisible by 4) */
4397 if (e-q<4) {
4398 if (consumed)
4399 break;
4400 errmsg = "truncated data";
4401 startinpos = ((const char *)q)-starts;
4402 endinpos = ((const char *)e)-starts;
4403 goto utf32Error;
4404 /* The remaining input chars are ignored if the callback
4405 chooses to skip the input */
4406 }
4407 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4408 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004409
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 if (ch >= 0x110000)
4411 {
4412 errmsg = "codepoint not in range(0x110000)";
4413 startinpos = ((const char *)q)-starts;
4414 endinpos = startinpos+4;
4415 goto utf32Error;
4416 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004417#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 if (ch >= 0x10000)
4419 {
4420 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4421 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4422 }
4423 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004424#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 *p++ = ch;
4426 q += 4;
4427 continue;
4428 utf32Error:
4429 outpos = p-PyUnicode_AS_UNICODE(unicode);
4430 if (unicode_decode_call_errorhandler(
4431 errors, &errorHandler,
4432 "utf32", errmsg,
4433 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4434 &unicode, &outpos, &p))
4435 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004436 }
4437
4438 if (byteorder)
4439 *byteorder = bo;
4440
4441 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004443
4444 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004445 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004446 goto onError;
4447
4448 Py_XDECREF(errorHandler);
4449 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004450 if (PyUnicode_READY(unicode) == -1) {
4451 Py_DECREF(unicode);
4452 return NULL;
4453 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004454 return (PyObject *)unicode;
4455
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004457 Py_DECREF(unicode);
4458 Py_XDECREF(errorHandler);
4459 Py_XDECREF(exc);
4460 return NULL;
4461}
4462
4463PyObject *
4464PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_ssize_t size,
4466 const char *errors,
4467 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004468{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004469 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004470 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004471 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004472#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004473 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004474#else
4475 const int pairs = 0;
4476#endif
4477 /* Offsets from p for storing byte pairs in the right order. */
4478#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4479 int iorder[] = {0, 1, 2, 3};
4480#else
4481 int iorder[] = {3, 2, 1, 0};
4482#endif
4483
Benjamin Peterson29060642009-01-31 22:14:21 +00004484#define STORECHAR(CH) \
4485 do { \
4486 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4487 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4488 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4489 p[iorder[0]] = (CH) & 0xff; \
4490 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004491 } while(0)
4492
4493 /* In narrow builds we can output surrogate pairs as one codepoint,
4494 so we need less space. */
4495#ifndef Py_UNICODE_WIDE
4496 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4498 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4499 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004500#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004501 nsize = (size - pairs + (byteorder == 0));
4502 bytesize = nsize * 4;
4503 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004505 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004506 if (v == NULL)
4507 return NULL;
4508
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004509 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004510 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004512 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004513 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004514
4515 if (byteorder == -1) {
4516 /* force LE */
4517 iorder[0] = 0;
4518 iorder[1] = 1;
4519 iorder[2] = 2;
4520 iorder[3] = 3;
4521 }
4522 else if (byteorder == 1) {
4523 /* force BE */
4524 iorder[0] = 3;
4525 iorder[1] = 2;
4526 iorder[2] = 1;
4527 iorder[3] = 0;
4528 }
4529
4530 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004532#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4534 Py_UCS4 ch2 = *s;
4535 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4536 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4537 s++;
4538 size--;
4539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004540 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004541#endif
4542 STORECHAR(ch);
4543 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004544
4545 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004546 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004547#undef STORECHAR
4548}
4549
Alexander Belopolsky40018472011-02-26 01:02:56 +00004550PyObject *
4551PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004552{
4553 if (!PyUnicode_Check(unicode)) {
4554 PyErr_BadArgument();
4555 return NULL;
4556 }
4557 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 PyUnicode_GET_SIZE(unicode),
4559 NULL,
4560 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004561}
4562
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563/* --- UTF-16 Codec ------------------------------------------------------- */
4564
Tim Peters772747b2001-08-09 22:21:55 +00004565PyObject *
4566PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 Py_ssize_t size,
4568 const char *errors,
4569 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570{
Walter Dörwald69652032004-09-07 20:24:22 +00004571 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4572}
4573
Antoine Pitrouab868312009-01-10 15:40:25 +00004574/* Two masks for fast checking of whether a C 'long' may contain
4575 UTF16-encoded surrogate characters. This is an efficient heuristic,
4576 assuming that non-surrogate characters with a code point >= 0x8000 are
4577 rare in most input.
4578 FAST_CHAR_MASK is used when the input is in native byte ordering,
4579 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004580*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004581#if (SIZEOF_LONG == 8)
4582# define FAST_CHAR_MASK 0x8000800080008000L
4583# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4584#elif (SIZEOF_LONG == 4)
4585# define FAST_CHAR_MASK 0x80008000L
4586# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4587#else
4588# error C 'long' size should be either 4 or 8!
4589#endif
4590
Walter Dörwald69652032004-09-07 20:24:22 +00004591PyObject *
4592PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 Py_ssize_t size,
4594 const char *errors,
4595 int *byteorder,
4596 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004597{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004599 Py_ssize_t startinpos;
4600 Py_ssize_t endinpos;
4601 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 PyUnicodeObject *unicode;
4603 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004604 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004605 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004606 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004607 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004608 /* Offsets from q for retrieving byte pairs in the right order. */
4609#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4610 int ihi = 1, ilo = 0;
4611#else
4612 int ihi = 0, ilo = 1;
4613#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 PyObject *errorHandler = NULL;
4615 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
4617 /* Note: size will always be longer than the resulting Unicode
4618 character count */
4619 unicode = _PyUnicode_New(size);
4620 if (!unicode)
4621 return NULL;
4622 if (size == 0)
4623 return (PyObject *)unicode;
4624
4625 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004627 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004628 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629
4630 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004631 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004633 /* Check for BOM marks (U+FEFF) in the input and adjust current
4634 byte order setting accordingly. In native mode, the leading BOM
4635 mark is skipped, in all other modes, it is copied to the output
4636 stream as-is (giving a ZWNBSP character). */
4637 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004638 if (size >= 2) {
4639 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004640#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 if (bom == 0xFEFF) {
4642 q += 2;
4643 bo = -1;
4644 }
4645 else if (bom == 0xFFFE) {
4646 q += 2;
4647 bo = 1;
4648 }
Tim Petersced69f82003-09-16 20:30:58 +00004649#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 if (bom == 0xFEFF) {
4651 q += 2;
4652 bo = 1;
4653 }
4654 else if (bom == 0xFFFE) {
4655 q += 2;
4656 bo = -1;
4657 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004658#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661
Tim Peters772747b2001-08-09 22:21:55 +00004662 if (bo == -1) {
4663 /* force LE */
4664 ihi = 1;
4665 ilo = 0;
4666 }
4667 else if (bo == 1) {
4668 /* force BE */
4669 ihi = 0;
4670 ilo = 1;
4671 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004672#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4673 native_ordering = ilo < ihi;
4674#else
4675 native_ordering = ilo > ihi;
4676#endif
Tim Peters772747b2001-08-09 22:21:55 +00004677
Antoine Pitrouab868312009-01-10 15:40:25 +00004678 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004679 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004681 /* First check for possible aligned read of a C 'long'. Unaligned
4682 reads are more expensive, better to defer to another iteration. */
4683 if (!((size_t) q & LONG_PTR_MASK)) {
4684 /* Fast path for runs of non-surrogate chars. */
4685 register const unsigned char *_q = q;
4686 Py_UNICODE *_p = p;
4687 if (native_ordering) {
4688 /* Native ordering is simple: as long as the input cannot
4689 possibly contain a surrogate char, do an unrolled copy
4690 of several 16-bit code points to the target object.
4691 The non-surrogate check is done on several input bytes
4692 at a time (as many as a C 'long' can contain). */
4693 while (_q < aligned_end) {
4694 unsigned long data = * (unsigned long *) _q;
4695 if (data & FAST_CHAR_MASK)
4696 break;
4697 _p[0] = ((unsigned short *) _q)[0];
4698 _p[1] = ((unsigned short *) _q)[1];
4699#if (SIZEOF_LONG == 8)
4700 _p[2] = ((unsigned short *) _q)[2];
4701 _p[3] = ((unsigned short *) _q)[3];
4702#endif
4703 _q += SIZEOF_LONG;
4704 _p += SIZEOF_LONG / 2;
4705 }
4706 }
4707 else {
4708 /* Byteswapped ordering is similar, but we must decompose
4709 the copy bytewise, and take care of zero'ing out the
4710 upper bytes if the target object is in 32-bit units
4711 (that is, in UCS-4 builds). */
4712 while (_q < aligned_end) {
4713 unsigned long data = * (unsigned long *) _q;
4714 if (data & SWAPPED_FAST_CHAR_MASK)
4715 break;
4716 /* Zero upper bytes in UCS-4 builds */
4717#if (Py_UNICODE_SIZE > 2)
4718 _p[0] = 0;
4719 _p[1] = 0;
4720#if (SIZEOF_LONG == 8)
4721 _p[2] = 0;
4722 _p[3] = 0;
4723#endif
4724#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004725 /* Issue #4916; UCS-4 builds on big endian machines must
4726 fill the two last bytes of each 4-byte unit. */
4727#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4728# define OFF 2
4729#else
4730# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004731#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004732 ((unsigned char *) _p)[OFF + 1] = _q[0];
4733 ((unsigned char *) _p)[OFF + 0] = _q[1];
4734 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4735 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4736#if (SIZEOF_LONG == 8)
4737 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4738 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4739 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4740 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4741#endif
4742#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004743 _q += SIZEOF_LONG;
4744 _p += SIZEOF_LONG / 2;
4745 }
4746 }
4747 p = _p;
4748 q = _q;
4749 if (q >= e)
4750 break;
4751 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753
Benjamin Peterson14339b62009-01-31 16:36:08 +00004754 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004755
4756 if (ch < 0xD800 || ch > 0xDFFF) {
4757 *p++ = ch;
4758 continue;
4759 }
4760
4761 /* UTF-16 code pair: */
4762 if (q > e) {
4763 errmsg = "unexpected end of data";
4764 startinpos = (((const char *)q) - 2) - starts;
4765 endinpos = ((const char *)e) + 1 - starts;
4766 goto utf16Error;
4767 }
4768 if (0xD800 <= ch && ch <= 0xDBFF) {
4769 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4770 q += 2;
4771 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004772#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 *p++ = ch;
4774 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004775#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004777#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 continue;
4779 }
4780 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004781 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 startinpos = (((const char *)q)-4)-starts;
4783 endinpos = startinpos+2;
4784 goto utf16Error;
4785 }
4786
Benjamin Peterson14339b62009-01-31 16:36:08 +00004787 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 errmsg = "illegal encoding";
4789 startinpos = (((const char *)q)-2)-starts;
4790 endinpos = startinpos+2;
4791 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004792
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 utf16Error:
4794 outpos = p - PyUnicode_AS_UNICODE(unicode);
4795 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004796 errors,
4797 &errorHandler,
4798 "utf16", errmsg,
4799 &starts,
4800 (const char **)&e,
4801 &startinpos,
4802 &endinpos,
4803 &exc,
4804 (const char **)&q,
4805 &unicode,
4806 &outpos,
4807 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004810 /* remaining byte at the end? (size should be even) */
4811 if (e == q) {
4812 if (!consumed) {
4813 errmsg = "truncated data";
4814 startinpos = ((const char *)q) - starts;
4815 endinpos = ((const char *)e) + 1 - starts;
4816 outpos = p - PyUnicode_AS_UNICODE(unicode);
4817 if (unicode_decode_call_errorhandler(
4818 errors,
4819 &errorHandler,
4820 "utf16", errmsg,
4821 &starts,
4822 (const char **)&e,
4823 &startinpos,
4824 &endinpos,
4825 &exc,
4826 (const char **)&q,
4827 &unicode,
4828 &outpos,
4829 &p))
4830 goto onError;
4831 /* The remaining input chars are ignored if the callback
4832 chooses to skip the input */
4833 }
4834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835
4836 if (byteorder)
4837 *byteorder = bo;
4838
Walter Dörwald69652032004-09-07 20:24:22 +00004839 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 goto onError;
4845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 Py_XDECREF(errorHandler);
4847 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004848 if (PyUnicode_READY(unicode) == -1) {
4849 Py_DECREF(unicode);
4850 return NULL;
4851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return (PyObject *)unicode;
4853
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 Py_XDECREF(errorHandler);
4857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 return NULL;
4859}
4860
Antoine Pitrouab868312009-01-10 15:40:25 +00004861#undef FAST_CHAR_MASK
4862#undef SWAPPED_FAST_CHAR_MASK
4863
Tim Peters772747b2001-08-09 22:21:55 +00004864PyObject *
4865PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 Py_ssize_t size,
4867 const char *errors,
4868 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004870 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004871 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004872 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004873#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004874 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004875#else
4876 const int pairs = 0;
4877#endif
Tim Peters772747b2001-08-09 22:21:55 +00004878 /* Offsets from p for storing byte pairs in the right order. */
4879#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4880 int ihi = 1, ilo = 0;
4881#else
4882 int ihi = 0, ilo = 1;
4883#endif
4884
Benjamin Peterson29060642009-01-31 22:14:21 +00004885#define STORECHAR(CH) \
4886 do { \
4887 p[ihi] = ((CH) >> 8) & 0xff; \
4888 p[ilo] = (CH) & 0xff; \
4889 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004890 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004892#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004893 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 if (s[i] >= 0x10000)
4895 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004896#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004897 /* 2 * (size + pairs + (byteorder == 0)) */
4898 if (size > PY_SSIZE_T_MAX ||
4899 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004901 nsize = size + pairs + (byteorder == 0);
4902 bytesize = nsize * 2;
4903 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004905 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 if (v == NULL)
4907 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004909 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004912 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004913 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004914
4915 if (byteorder == -1) {
4916 /* force LE */
4917 ihi = 1;
4918 ilo = 0;
4919 }
4920 else if (byteorder == 1) {
4921 /* force BE */
4922 ihi = 0;
4923 ilo = 1;
4924 }
4925
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004926 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_UNICODE ch = *s++;
4928 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004929#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 if (ch >= 0x10000) {
4931 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4932 ch = 0xD800 | ((ch-0x10000) >> 10);
4933 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004934#endif
Tim Peters772747b2001-08-09 22:21:55 +00004935 STORECHAR(ch);
4936 if (ch2)
4937 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004938 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004939
4940 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004941 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004942#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943}
4944
Alexander Belopolsky40018472011-02-26 01:02:56 +00004945PyObject *
4946PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947{
4948 if (!PyUnicode_Check(unicode)) {
4949 PyErr_BadArgument();
4950 return NULL;
4951 }
4952 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 PyUnicode_GET_SIZE(unicode),
4954 NULL,
4955 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956}
4957
4958/* --- Unicode Escape Codec ----------------------------------------------- */
4959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4961 if all the escapes in the string make it still a valid ASCII string.
4962 Returns -1 if any escapes were found which cause the string to
4963 pop out of ASCII range. Otherwise returns the length of the
4964 required buffer to hold the string.
4965 */
4966Py_ssize_t
4967length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4968{
4969 const unsigned char *p = (const unsigned char *)s;
4970 const unsigned char *end = p + size;
4971 Py_ssize_t length = 0;
4972
4973 if (size < 0)
4974 return -1;
4975
4976 for (; p < end; ++p) {
4977 if (*p > 127) {
4978 /* Non-ASCII */
4979 return -1;
4980 }
4981 else if (*p != '\\') {
4982 /* Normal character */
4983 ++length;
4984 }
4985 else {
4986 /* Backslash-escape, check next char */
4987 ++p;
4988 /* Escape sequence reaches till end of string or
4989 non-ASCII follow-up. */
4990 if (p >= end || *p > 127)
4991 return -1;
4992 switch (*p) {
4993 case '\n':
4994 /* backslash + \n result in zero characters */
4995 break;
4996 case '\\': case '\'': case '\"':
4997 case 'b': case 'f': case 't':
4998 case 'n': case 'r': case 'v': case 'a':
4999 ++length;
5000 break;
5001 case '0': case '1': case '2': case '3':
5002 case '4': case '5': case '6': case '7':
5003 case 'x': case 'u': case 'U': case 'N':
5004 /* these do not guarantee ASCII characters */
5005 return -1;
5006 default:
5007 /* count the backslash + the other character */
5008 length += 2;
5009 }
5010 }
5011 }
5012 return length;
5013}
5014
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.
   When `kind` is PyUnicode_WCHAR_KIND, `buf` is indexed as a Py_UNICODE
   array; for any other kind it is indexed as an unsigned char array.
   Statement-like: use it with a trailing semicolon. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` at buf[index], treating `buf` as a Py_UNICODE array.
   NOTE: this macro reads a variable named `kind` from the scope in which it
   is expanded and asserts that it equals PyUnicode_WCHAR_KIND.
   The expansion is a single fully parenthesised expression, so it remains
   well-formed in any expression or statement context. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5028
5029
/* Character-name lookup C API used to resolve \N{...} escapes.  Starts out
   NULL and is filled in through PyCapsule_Import(PyUnicodeData_CAPSULE_NAME,
   ...) the first time PyUnicode_DecodeUnicodeEscape meets a \N escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005030static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005031
Alexander Belopolsky40018472011-02-26 01:02:56 +00005032PyObject *
5033PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005034 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005035 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005038 Py_ssize_t startinpos;
5039 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005044 char* message;
5045 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 PyObject *errorHandler = NULL;
5047 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 Py_ssize_t ascii_length;
5049 Py_ssize_t i;
5050 int kind;
5051 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005053 ascii_length = length_of_escaped_ascii_string(s, size);
5054
5055 /* After length_of_escaped_ascii_string() there are two alternatives,
5056 either the string is pure ASCII with named escapes like \n, etc.
5057 and we determined it's exact size (common case)
5058 or it contains \x, \u, ... escape sequences. then we create a
5059 legacy wchar string and resize it at the end of this function. */
5060 if (ascii_length >= 0) {
5061 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5062 if (!v)
5063 goto onError;
5064 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5065 kind = PyUnicode_1BYTE_KIND;
5066 data = PyUnicode_DATA(v);
5067 }
5068 else {
5069 /* Escaped strings will always be longer than the resulting
5070 Unicode string, so we start with size here and then reduce the
5071 length after conversion to the true value.
5072 (but if the error callback returns a long replacement string
5073 we'll have to allocate more space) */
5074 v = _PyUnicode_New(size);
5075 if (!v)
5076 goto onError;
5077 kind = PyUnicode_WCHAR_KIND;
5078 data = PyUnicode_AS_UNICODE(v);
5079 }
5080
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081 if (size == 0)
5082 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 while (s < end) {
5087 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005088 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005089 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005091 if (kind == PyUnicode_WCHAR_KIND) {
5092 assert(i < _PyUnicode_WSTR_LENGTH(v));
5093 }
5094 else {
5095 /* The only case in which i == ascii_length is a backslash
5096 followed by a newline. */
5097 assert(i <= ascii_length);
5098 }
5099
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 /* Non-escape characters are interpreted as Unicode ordinals */
5101 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 continue;
5104 }
5105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 /* \ - Escapes */
5108 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005109 c = *s++;
5110 if (s > end)
5111 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112
5113 if (kind == PyUnicode_WCHAR_KIND) {
5114 assert(i < _PyUnicode_WSTR_LENGTH(v));
5115 }
5116 else {
5117 /* The only case in which i == ascii_length is a backslash
5118 followed by a newline. */
5119 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5120 }
5121
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005122 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5127 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5128 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5129 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5130 /* FF */
5131 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5132 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5133 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5134 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5135 /* VT */
5136 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5137 /* BEL, not classic C */
5138 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 case '0': case '1': case '2': case '3':
5142 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005143 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005144 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005145 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005146 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005147 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 break;
5151
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 /* hex escapes */
5153 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005155 digits = 2;
5156 message = "truncated \\xXX escape";
5157 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005161 digits = 4;
5162 message = "truncated \\uXXXX escape";
5163 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005166 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005167 digits = 8;
5168 message = "truncated \\UXXXXXXXX escape";
5169 hexescape:
5170 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 if (s+digits>end) {
5173 endinpos = size;
5174 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 errors, &errorHandler,
5176 "unicodeescape", "end of string in escape sequence",
5177 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005180 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005181 goto nextByte;
5182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005183 for (j = 0; j < digits; ++j) {
5184 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005185 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186 endinpos = (s+j+1)-starts;
5187 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005188 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 errors, &errorHandler,
5190 "unicodeescape", message,
5191 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005192 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005193 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005194 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005196 }
5197 chr = (chr<<4) & ~0xF;
5198 if (c >= '0' && c <= '9')
5199 chr += c - '0';
5200 else if (c >= 'a' && c <= 'f')
5201 chr += 10 + c - 'a';
5202 else
5203 chr += 10 + c - 'A';
5204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005205 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005206 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207 /* _decoding_error will have already written into the
5208 target buffer. */
5209 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005210 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005211 /* when we get here, chr is a 32-bit unicode character */
5212 if (chr <= 0xffff)
5213 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005215 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005216 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005217 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005218#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005219 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005220#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005221 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005222 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5223 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005224#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005225 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005227 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 errors, &errorHandler,
5230 "unicodeescape", "illegal Unicode character",
5231 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005232 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005233 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005234 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005235 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005236 break;
5237
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005239 case 'N':
5240 message = "malformed \\N character escape";
5241 if (ucnhash_CAPI == NULL) {
5242 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005243 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5244 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005245 if (ucnhash_CAPI == NULL)
5246 goto ucnhashError;
5247 }
5248 if (*s == '{') {
5249 const char *start = s+1;
5250 /* look for the closing brace */
5251 while (*s != '}' && s < end)
5252 s++;
5253 if (s > start && s < end && *s == '}') {
5254 /* found a name. look it up in the unicode database */
5255 message = "unknown Unicode character name";
5256 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005257 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5258 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005259 goto store;
5260 }
5261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005263 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 errors, &errorHandler,
5266 "unicodeescape", message,
5267 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005268 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005269 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005270 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005271 break;
5272
5273 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005274 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005275 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005276 message = "\\ at end of string";
5277 s--;
5278 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005279 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 errors, &errorHandler,
5282 "unicodeescape", message,
5283 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005284 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005285 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005286 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005287 }
5288 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005289 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5290 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005291 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005292 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005297 /* Ensure the length prediction worked in case of ASCII strings */
5298 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5299
5300 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5301 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005303 Py_XDECREF(errorHandler);
5304 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005306
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005308 PyErr_SetString(
5309 PyExc_UnicodeError,
5310 "\\N escapes not supported (can't load unicodedata module)"
5311 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005312 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005315 return NULL;
5316
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 return NULL;
5322}
5323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005324#undef WRITE_ASCII_OR_WSTR
5325#undef WRITE_WSTR
5326
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object holding only the escaped text; no
   surrounding quotes are added.

*/
5333
Walter Dörwald79e913e2007-05-12 11:08:06 +00005334static const char *hexdigits = "0123456789abcdef";
5335
Alexander Belopolsky40018472011-02-26 01:02:56 +00005336PyObject *
5337PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005338 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005340 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005343#ifdef Py_UNICODE_WIDE
5344 const Py_ssize_t expandsize = 10;
5345#else
5346 const Py_ssize_t expandsize = 6;
5347#endif
5348
Thomas Wouters89f507f2006-12-13 04:49:30 +00005349 /* XXX(nnorwitz): rather than over-allocating, it would be
5350 better to choose a different scheme. Perhaps scan the
5351 first N-chars of the string and allocate based on that size.
5352 */
5353 /* Initial allocation is based on the longest-possible unichr
5354 escape.
5355
5356 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5357 unichr, so in this case it's the longest unichr escape. In
5358 narrow (UTF-16) builds this is five chars per source unichr
5359 since there are two unichrs in the surrogate pair, so in narrow
5360 (UTF-16) builds it's not the longest unichr escape.
5361
5362 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5363 so in the narrow (UTF-16) build case it's the longest unichr
5364 escape.
5365 */
5366
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005367 if (size == 0)
5368 return PyBytes_FromStringAndSize(NULL, 0);
5369
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005370 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005372
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005373 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 2
5375 + expandsize*size
5376 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 if (repr == NULL)
5378 return NULL;
5379
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 while (size-- > 0) {
5383 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005384
Walter Dörwald79e913e2007-05-12 11:08:06 +00005385 /* Escape backslashes */
5386 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 *p++ = '\\';
5388 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005389 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005390 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005391
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005392#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005393 /* Map 21-bit characters to '\U00xxxxxx' */
5394 else if (ch >= 0x10000) {
5395 *p++ = '\\';
5396 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005397 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5398 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5399 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5400 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5401 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5402 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5403 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5404 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005406 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005407#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5409 else if (ch >= 0xD800 && ch < 0xDC00) {
5410 Py_UNICODE ch2;
5411 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005412
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 ch2 = *s++;
5414 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005415 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5417 *p++ = '\\';
5418 *p++ = 'U';
5419 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5420 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5421 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5422 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5423 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5424 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5425 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5426 *p++ = hexdigits[ucs & 0x0000000F];
5427 continue;
5428 }
5429 /* Fall through: isolated surrogates are copied as-is */
5430 s--;
5431 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005432 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005433#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005434
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005436 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 *p++ = '\\';
5438 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005439 *p++ = hexdigits[(ch >> 12) & 0x000F];
5440 *p++ = hexdigits[(ch >> 8) & 0x000F];
5441 *p++ = hexdigits[(ch >> 4) & 0x000F];
5442 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005444
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005445 /* Map special whitespace to '\t', \n', '\r' */
5446 else if (ch == '\t') {
5447 *p++ = '\\';
5448 *p++ = 't';
5449 }
5450 else if (ch == '\n') {
5451 *p++ = '\\';
5452 *p++ = 'n';
5453 }
5454 else if (ch == '\r') {
5455 *p++ = '\\';
5456 *p++ = 'r';
5457 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005458
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005459 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005460 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005462 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005463 *p++ = hexdigits[(ch >> 4) & 0x000F];
5464 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005465 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 /* Copy everything else as-is */
5468 else
5469 *p++ = (char) ch;
5470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 assert(p - PyBytes_AS_STRING(repr) > 0);
5473 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5474 return NULL;
5475 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476}
5477
Alexander Belopolsky40018472011-02-26 01:02:56 +00005478PyObject *
5479PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005481 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 if (!PyUnicode_Check(unicode)) {
5483 PyErr_BadArgument();
5484 return NULL;
5485 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005486 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5487 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005488 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489}
5490
5491/* --- Raw Unicode Escape Codec ------------------------------------------- */
5492
Alexander Belopolsky40018472011-02-26 01:02:56 +00005493PyObject *
5494PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005495 Py_ssize_t size,
5496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t startinpos;
5500 Py_ssize_t endinpos;
5501 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 const char *end;
5505 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 PyObject *errorHandler = NULL;
5507 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005508
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 /* Escaped strings will always be longer than the resulting
5510 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 length after conversion to the true value. (But decoding error
5512 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 v = _PyUnicode_New(size);
5514 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 end = s + size;
5520 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 unsigned char c;
5522 Py_UCS4 x;
5523 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005524 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 /* Non-escape characters are interpreted as Unicode ordinals */
5527 if (*s != '\\') {
5528 *p++ = (unsigned char)*s++;
5529 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005530 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 startinpos = s-starts;
5532
5533 /* \u-escapes are only interpreted iff the number of leading
5534 backslashes if odd */
5535 bs = s;
5536 for (;s < end;) {
5537 if (*s != '\\')
5538 break;
5539 *p++ = (unsigned char)*s++;
5540 }
5541 if (((s - bs) & 1) == 0 ||
5542 s >= end ||
5543 (*s != 'u' && *s != 'U')) {
5544 continue;
5545 }
5546 p--;
5547 count = *s=='u' ? 4 : 8;
5548 s++;
5549
5550 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5551 outpos = p-PyUnicode_AS_UNICODE(v);
5552 for (x = 0, i = 0; i < count; ++i, ++s) {
5553 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005554 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 endinpos = s-starts;
5556 if (unicode_decode_call_errorhandler(
5557 errors, &errorHandler,
5558 "rawunicodeescape", "truncated \\uXXXX",
5559 &starts, &end, &startinpos, &endinpos, &exc, &s,
5560 &v, &outpos, &p))
5561 goto onError;
5562 goto nextByte;
5563 }
5564 x = (x<<4) & ~0xF;
5565 if (c >= '0' && c <= '9')
5566 x += c - '0';
5567 else if (c >= 'a' && c <= 'f')
5568 x += 10 + c - 'a';
5569 else
5570 x += 10 + c - 'A';
5571 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005572 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 /* UCS-2 character */
5574 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005575 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* UCS-4 character. Either store directly, or as
5577 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005578#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005580#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 x -= 0x10000L;
5582 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5583 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005584#endif
5585 } else {
5586 endinpos = s-starts;
5587 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005588 if (unicode_decode_call_errorhandler(
5589 errors, &errorHandler,
5590 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 &starts, &end, &startinpos, &endinpos, &exc, &s,
5592 &v, &outpos, &p))
5593 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005594 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 nextByte:
5596 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005598 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 Py_XDECREF(errorHandler);
5601 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 if (PyUnicode_READY(v) == -1) {
5603 Py_DECREF(v);
5604 return NULL;
5605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005607
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 Py_XDECREF(errorHandler);
5611 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 return NULL;
5613}
5614
Alexander Belopolsky40018472011-02-26 01:02:56 +00005615PyObject *
5616PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005617 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005619 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 char *p;
5621 char *q;
5622
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005623#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005624 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005625#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005626 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005627#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005628
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005629 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005632 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 if (repr == NULL)
5634 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005635 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005636 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005638 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 while (size-- > 0) {
5640 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005641#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 /* Map 32-bit characters to '\Uxxxxxxxx' */
5643 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005644 *p++ = '\\';
5645 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005646 *p++ = hexdigits[(ch >> 28) & 0xf];
5647 *p++ = hexdigits[(ch >> 24) & 0xf];
5648 *p++ = hexdigits[(ch >> 20) & 0xf];
5649 *p++ = hexdigits[(ch >> 16) & 0xf];
5650 *p++ = hexdigits[(ch >> 12) & 0xf];
5651 *p++ = hexdigits[(ch >> 8) & 0xf];
5652 *p++ = hexdigits[(ch >> 4) & 0xf];
5653 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005654 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005655 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005656#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5658 if (ch >= 0xD800 && ch < 0xDC00) {
5659 Py_UNICODE ch2;
5660 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005661
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 ch2 = *s++;
5663 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005664 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5666 *p++ = '\\';
5667 *p++ = 'U';
5668 *p++ = hexdigits[(ucs >> 28) & 0xf];
5669 *p++ = hexdigits[(ucs >> 24) & 0xf];
5670 *p++ = hexdigits[(ucs >> 20) & 0xf];
5671 *p++ = hexdigits[(ucs >> 16) & 0xf];
5672 *p++ = hexdigits[(ucs >> 12) & 0xf];
5673 *p++ = hexdigits[(ucs >> 8) & 0xf];
5674 *p++ = hexdigits[(ucs >> 4) & 0xf];
5675 *p++ = hexdigits[ucs & 0xf];
5676 continue;
5677 }
5678 /* Fall through: isolated surrogates are copied as-is */
5679 s--;
5680 size++;
5681 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005682#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 /* Map 16-bit characters to '\uxxxx' */
5684 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 *p++ = '\\';
5686 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005687 *p++ = hexdigits[(ch >> 12) & 0xf];
5688 *p++ = hexdigits[(ch >> 8) & 0xf];
5689 *p++ = hexdigits[(ch >> 4) & 0xf];
5690 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 /* Copy everything else as-is */
5693 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 *p++ = (char) ch;
5695 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005696 size = p - q;
5697
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005698 assert(size > 0);
5699 if (_PyBytes_Resize(&repr, size) < 0)
5700 return NULL;
5701 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702}
5703
Alexander Belopolsky40018472011-02-26 01:02:56 +00005704PyObject *
5705PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005707 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005709 PyErr_BadArgument();
5710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005712 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5713 PyUnicode_GET_SIZE(unicode));
5714
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005715 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716}
5717
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005718/* --- Unicode Internal Codec ------------------------------------------- */
5719
Alexander Belopolsky40018472011-02-26 01:02:56 +00005720PyObject *
5721_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005722 Py_ssize_t size,
5723 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005724{
5725 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005726 Py_ssize_t startinpos;
5727 Py_ssize_t endinpos;
5728 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005729 PyUnicodeObject *v;
5730 Py_UNICODE *p;
5731 const char *end;
5732 const char *reason;
5733 PyObject *errorHandler = NULL;
5734 PyObject *exc = NULL;
5735
Neal Norwitzd43069c2006-01-08 01:12:10 +00005736#ifdef Py_UNICODE_WIDE
5737 Py_UNICODE unimax = PyUnicode_GetMax();
5738#endif
5739
Thomas Wouters89f507f2006-12-13 04:49:30 +00005740 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005741 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5742 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5745 as string was created with the old API. */
5746 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005748 p = PyUnicode_AS_UNICODE(v);
5749 end = s + size;
5750
5751 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005752 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005753 /* We have to sanity check the raw data, otherwise doom looms for
5754 some malformed UCS-4 data. */
5755 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005756#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005757 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005758#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005759 end-s < Py_UNICODE_SIZE
5760 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005762 startinpos = s - starts;
5763 if (end-s < Py_UNICODE_SIZE) {
5764 endinpos = end-starts;
5765 reason = "truncated input";
5766 }
5767 else {
5768 endinpos = s - starts + Py_UNICODE_SIZE;
5769 reason = "illegal code point (> 0x10FFFF)";
5770 }
5771 outpos = p - PyUnicode_AS_UNICODE(v);
5772 if (unicode_decode_call_errorhandler(
5773 errors, &errorHandler,
5774 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005775 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005776 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005777 goto onError;
5778 }
5779 }
5780 else {
5781 p++;
5782 s += Py_UNICODE_SIZE;
5783 }
5784 }
5785
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005786 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005787 goto onError;
5788 Py_XDECREF(errorHandler);
5789 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790 if (PyUnicode_READY(v) == -1) {
5791 Py_DECREF(v);
5792 return NULL;
5793 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005794 return (PyObject *)v;
5795
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005797 Py_XDECREF(v);
5798 Py_XDECREF(errorHandler);
5799 Py_XDECREF(exc);
5800 return NULL;
5801}
5802
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803/* --- Latin-1 Codec ------------------------------------------------------ */
5804
Alexander Belopolsky40018472011-02-26 01:02:56 +00005805PyObject *
5806PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005807 Py_ssize_t size,
5808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005811 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812}
5813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005815static void
5816make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005817 const char *encoding,
5818 const Py_UNICODE *unicode, Py_ssize_t size,
5819 Py_ssize_t startpos, Py_ssize_t endpos,
5820 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 *exceptionObject = PyUnicodeEncodeError_Create(
5824 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 }
5826 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5828 goto onError;
5829 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5830 goto onError;
5831 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5832 goto onError;
5833 return;
5834 onError:
5835 Py_DECREF(*exceptionObject);
5836 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 }
5838}
5839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005841static void
5842raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005843 const char *encoding,
5844 const Py_UNICODE *unicode, Py_ssize_t size,
5845 Py_ssize_t startpos, Py_ssize_t endpos,
5846 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847{
5848 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852}
5853
5854/* error handling callback helper:
5855 build arguments, call the callback and check the arguments,
5856 put the result into newpos and return the replacement string, which
5857 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005858static PyObject *
5859unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005860 PyObject **errorHandler,
5861 const char *encoding, const char *reason,
5862 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5863 Py_ssize_t startpos, Py_ssize_t endpos,
5864 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005866 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867
5868 PyObject *restuple;
5869 PyObject *resunicode;
5870
5871 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 }
5876
5877 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881
5882 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005887 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 Py_DECREF(restuple);
5889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005891 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 &resunicode, newpos)) {
5893 Py_DECREF(restuple);
5894 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005896 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5897 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5898 Py_DECREF(restuple);
5899 return NULL;
5900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005903 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5905 Py_DECREF(restuple);
5906 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 Py_INCREF(resunicode);
5909 Py_DECREF(restuple);
5910 return resunicode;
5911}
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913static PyObject *
5914unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005915 Py_ssize_t size,
5916 const char *errors,
5917 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918{
5919 /* output object */
5920 PyObject *res;
5921 /* pointers to the beginning and end+1 of input */
5922 const Py_UNICODE *startp = p;
5923 const Py_UNICODE *endp = p + size;
5924 /* pointer to the beginning of the unencodable characters */
5925 /* const Py_UNICODE *badp = NULL; */
5926 /* pointer into the output */
5927 char *str;
5928 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005929 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005930 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5931 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005932 PyObject *errorHandler = NULL;
5933 PyObject *exc = NULL;
5934 /* the following variable is used for caching string comparisons
5935 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5936 int known_errorHandler = -1;
5937
5938 /* allocate enough for a simple encoding without
5939 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005940 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005941 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005944 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005945 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 ressize = size;
5947
5948 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* can we encode this? */
5952 if (c<limit) {
5953 /* no overflow check, because we know that the space is enough */
5954 *str++ = (char)c;
5955 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005956 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 else {
5958 Py_ssize_t unicodepos = p-startp;
5959 Py_ssize_t requiredsize;
5960 PyObject *repunicode;
5961 Py_ssize_t repsize;
5962 Py_ssize_t newpos;
5963 Py_ssize_t respos;
5964 Py_UNICODE *uni2;
5965 /* startpos for collecting unencodable chars */
5966 const Py_UNICODE *collstart = p;
5967 const Py_UNICODE *collend = p;
5968 /* find all unecodable characters */
5969 while ((collend < endp) && ((*collend)>=limit))
5970 ++collend;
5971 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5972 if (known_errorHandler==-1) {
5973 if ((errors==NULL) || (!strcmp(errors, "strict")))
5974 known_errorHandler = 1;
5975 else if (!strcmp(errors, "replace"))
5976 known_errorHandler = 2;
5977 else if (!strcmp(errors, "ignore"))
5978 known_errorHandler = 3;
5979 else if (!strcmp(errors, "xmlcharrefreplace"))
5980 known_errorHandler = 4;
5981 else
5982 known_errorHandler = 0;
5983 }
5984 switch (known_errorHandler) {
5985 case 1: /* strict */
5986 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5987 goto onError;
5988 case 2: /* replace */
5989 while (collstart++<collend)
5990 *str++ = '?'; /* fall through */
5991 case 3: /* ignore */
5992 p = collend;
5993 break;
5994 case 4: /* xmlcharrefreplace */
5995 respos = str - PyBytes_AS_STRING(res);
5996 /* determine replacement size (temporarily (mis)uses p) */
5997 for (p = collstart, repsize = 0; p < collend; ++p) {
5998 if (*p<10)
5999 repsize += 2+1+1;
6000 else if (*p<100)
6001 repsize += 2+2+1;
6002 else if (*p<1000)
6003 repsize += 2+3+1;
6004 else if (*p<10000)
6005 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006006#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 else
6008 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006009#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 else if (*p<100000)
6011 repsize += 2+5+1;
6012 else if (*p<1000000)
6013 repsize += 2+6+1;
6014 else
6015 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006016#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 }
6018 requiredsize = respos+repsize+(endp-collend);
6019 if (requiredsize > ressize) {
6020 if (requiredsize<2*ressize)
6021 requiredsize = 2*ressize;
6022 if (_PyBytes_Resize(&res, requiredsize))
6023 goto onError;
6024 str = PyBytes_AS_STRING(res) + respos;
6025 ressize = requiredsize;
6026 }
6027 /* generate replacement (temporarily (mis)uses p) */
6028 for (p = collstart; p < collend; ++p) {
6029 str += sprintf(str, "&#%d;", (int)*p);
6030 }
6031 p = collend;
6032 break;
6033 default:
6034 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6035 encoding, reason, startp, size, &exc,
6036 collstart-startp, collend-startp, &newpos);
6037 if (repunicode == NULL)
6038 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006039 if (PyBytes_Check(repunicode)) {
6040 /* Directly copy bytes result to output. */
6041 repsize = PyBytes_Size(repunicode);
6042 if (repsize > 1) {
6043 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006044 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006045 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6046 Py_DECREF(repunicode);
6047 goto onError;
6048 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006049 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006050 ressize += repsize-1;
6051 }
6052 memcpy(str, PyBytes_AsString(repunicode), repsize);
6053 str += repsize;
6054 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006055 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006056 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 /* need more space? (at least enough for what we
6059 have+the replacement+the rest of the string, so
6060 we won't have to check space for encodable characters) */
6061 respos = str - PyBytes_AS_STRING(res);
6062 repsize = PyUnicode_GET_SIZE(repunicode);
6063 requiredsize = respos+repsize+(endp-collend);
6064 if (requiredsize > ressize) {
6065 if (requiredsize<2*ressize)
6066 requiredsize = 2*ressize;
6067 if (_PyBytes_Resize(&res, requiredsize)) {
6068 Py_DECREF(repunicode);
6069 goto onError;
6070 }
6071 str = PyBytes_AS_STRING(res) + respos;
6072 ressize = requiredsize;
6073 }
6074 /* check if there is anything unencodable in the replacement
6075 and copy it to the output */
6076 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6077 c = *uni2;
6078 if (c >= limit) {
6079 raise_encode_exception(&exc, encoding, startp, size,
6080 unicodepos, unicodepos+1, reason);
6081 Py_DECREF(repunicode);
6082 goto onError;
6083 }
6084 *str = (char)c;
6085 }
6086 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006087 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006089 }
6090 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006091 /* Resize if we allocated to much */
6092 size = str - PyBytes_AS_STRING(res);
6093 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006094 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006095 if (_PyBytes_Resize(&res, size) < 0)
6096 goto onError;
6097 }
6098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 Py_XDECREF(errorHandler);
6100 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006101 return res;
6102
6103 onError:
6104 Py_XDECREF(res);
6105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
6107 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108}
6109
Alexander Belopolsky40018472011-02-26 01:02:56 +00006110PyObject *
6111PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006112 Py_ssize_t size,
6113 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116}
6117
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006119_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
6121 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 PyErr_BadArgument();
6123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006125 if (PyUnicode_READY(unicode) == -1)
6126 return NULL;
6127 /* Fast path: if it is a one-byte string, construct
6128 bytes object directly. */
6129 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6130 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6131 PyUnicode_GET_LENGTH(unicode));
6132 /* Non-Latin-1 characters present. Defer to above function to
6133 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006136 errors);
6137}
6138
6139PyObject*
6140PyUnicode_AsLatin1String(PyObject *unicode)
6141{
6142 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143}
6144
6145/* --- 7-bit ASCII Codec -------------------------------------------------- */
6146
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147PyObject *
6148PyUnicode_DecodeASCII(const char *s,
6149 Py_ssize_t size,
6150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 PyUnicodeObject *v;
6154 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006155 Py_ssize_t startinpos;
6156 Py_ssize_t endinpos;
6157 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006159 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 PyObject *errorHandler = NULL;
6161 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006162 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006165 if (size == 1 && *(unsigned char*)s < 128)
6166 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6167
6168 /* Fast path. Assume the input actually *is* ASCII, and allocate
6169 a single-block Unicode object with that assumption. If there is
6170 an error, drop the object and start over. */
6171 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6172 if (v == NULL)
6173 goto onError;
6174 d = PyUnicode_1BYTE_DATA(v);
6175 for (i = 0; i < size; i++) {
6176 unsigned char ch = ((unsigned char*)s)[i];
6177 if (ch < 128)
6178 d[i] = ch;
6179 else
6180 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006182 if (i == size)
6183 return (PyObject*)v;
6184 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006185
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 v = _PyUnicode_New(size);
6187 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 e = s + size;
6193 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 register unsigned char c = (unsigned char)*s;
6195 if (c < 128) {
6196 *p++ = c;
6197 ++s;
6198 }
6199 else {
6200 startinpos = s-starts;
6201 endinpos = startinpos + 1;
6202 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6203 if (unicode_decode_call_errorhandler(
6204 errors, &errorHandler,
6205 "ascii", "ordinal not in range(128)",
6206 &starts, &e, &startinpos, &endinpos, &exc, &s,
6207 &v, &outpos, &p))
6208 goto onError;
6209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006211 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6213 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_XDECREF(errorHandler);
6215 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006216 if (PyUnicode_READY(v) == -1) {
6217 Py_DECREF(v);
6218 return NULL;
6219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006221
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 Py_XDECREF(errorHandler);
6225 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 return NULL;
6227}
6228
Alexander Belopolsky40018472011-02-26 01:02:56 +00006229PyObject *
6230PyUnicode_EncodeASCII(const Py_UNICODE *p,
6231 Py_ssize_t size,
6232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Alexander Belopolsky40018472011-02-26 01:02:56 +00006237PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006238_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239{
6240 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 PyErr_BadArgument();
6242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006244 if (PyUnicode_READY(unicode) == -1)
6245 return NULL;
6246 /* Fast path: if it is an ASCII-only string, construct bytes object
6247 directly. Else defer to above function to raise the exception. */
6248 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6249 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6250 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006253 errors);
6254}
6255
6256PyObject *
6257PyUnicode_AsASCIIString(PyObject *unicode)
6258{
6259 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260}
6261
Victor Stinner99b95382011-07-04 14:23:54 +02006262#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006263
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006264/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006265
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006266#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006267#define NEED_RETRY
6268#endif
6269
6270/* XXX This code is limited to "true" double-byte encodings, as
6271 a) it assumes an incomplete character consists of a single byte, and
6272 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274
/* Return 1 if the byte at s[offset] acts as a DBCS lead byte, else 0.
   A byte that merely has a lead-byte value is rejected when the character
   before it (as located by CharPrev) is a lead byte sitting one position
   earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6286
6287/*
6288 * Decode MBCS string into unicode object. If 'final' is set, converts
6289 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6290 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291static int
6292decode_mbcs(PyUnicodeObject **v,
6293 const char *s, /* MBCS string */
6294 int size, /* sizeof MBCS string */
6295 int final,
6296 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006297{
6298 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006299 Py_ssize_t n;
6300 DWORD usize;
6301 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006302
6303 assert(size >= 0);
6304
Victor Stinner554f3f02010-06-16 23:33:54 +00006305 /* check and handle 'errors' arg */
6306 if (errors==NULL || strcmp(errors, "strict")==0)
6307 flags = MB_ERR_INVALID_CHARS;
6308 else if (strcmp(errors, "ignore")==0)
6309 flags = 0;
6310 else {
6311 PyErr_Format(PyExc_ValueError,
6312 "mbcs encoding does not support errors='%s'",
6313 errors);
6314 return -1;
6315 }
6316
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006317 /* Skip trailing lead-byte unless 'final' is set */
6318 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006320
6321 /* First get the size of the result */
6322 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006323 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6324 if (usize==0)
6325 goto mbcs_decode_error;
6326 } else
6327 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006328
6329 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 /* Create unicode object */
6331 *v = _PyUnicode_New(usize);
6332 if (*v == NULL)
6333 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006334 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006335 }
6336 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* Extend unicode object */
6338 n = PyUnicode_GET_SIZE(*v);
6339 if (_PyUnicode_Resize(v, n + usize) < 0)
6340 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006341 }
6342
6343 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006344 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006346 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6347 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006349 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006351
6352mbcs_decode_error:
6353 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6354 we raise a UnicodeDecodeError - else it is a 'generic'
6355 windows error
6356 */
6357 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6358 /* Ideally, we should get reason from FormatMessage - this
6359 is the Windows 2000 English version of the message
6360 */
6361 PyObject *exc = NULL;
6362 const char *reason = "No mapping for the Unicode character exists "
6363 "in the target multi-byte code page.";
6364 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6365 if (exc != NULL) {
6366 PyCodec_StrictErrors(exc);
6367 Py_DECREF(exc);
6368 }
6369 } else {
6370 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6371 }
6372 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006373}
6374
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375PyObject *
6376PyUnicode_DecodeMBCSStateful(const char *s,
6377 Py_ssize_t size,
6378 const char *errors,
6379 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006380{
6381 PyUnicodeObject *v = NULL;
6382 int done;
6383
6384 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006386
6387#ifdef NEED_RETRY
6388 retry:
6389 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006390 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006391 else
6392#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006393 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006394
6395 if (done < 0) {
6396 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006398 }
6399
6400 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006402
6403#ifdef NEED_RETRY
6404 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 s += done;
6406 size -= done;
6407 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006408 }
6409#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006410 if (PyUnicode_READY(v) == -1) {
6411 Py_DECREF(v);
6412 return NULL;
6413 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006414 return (PyObject *)v;
6415}
6416
Alexander Belopolsky40018472011-02-26 01:02:56 +00006417PyObject *
6418PyUnicode_DecodeMBCS(const char *s,
6419 Py_ssize_t size,
6420 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006421{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006422 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6423}
6424
6425/*
6426 * Convert unicode into string object (MBCS).
6427 * Returns 0 if succeed, -1 otherwise.
6428 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429static int
6430encode_mbcs(PyObject **repr,
6431 const Py_UNICODE *p, /* unicode */
6432 int size, /* size of unicode */
6433 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006434{
Victor Stinner554f3f02010-06-16 23:33:54 +00006435 BOOL usedDefaultChar = FALSE;
6436 BOOL *pusedDefaultChar;
6437 int mbcssize;
6438 Py_ssize_t n;
6439 PyObject *exc = NULL;
6440 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006441
6442 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006443
Victor Stinner554f3f02010-06-16 23:33:54 +00006444 /* check and handle 'errors' arg */
6445 if (errors==NULL || strcmp(errors, "strict")==0) {
6446 flags = WC_NO_BEST_FIT_CHARS;
6447 pusedDefaultChar = &usedDefaultChar;
6448 } else if (strcmp(errors, "replace")==0) {
6449 flags = 0;
6450 pusedDefaultChar = NULL;
6451 } else {
6452 PyErr_Format(PyExc_ValueError,
6453 "mbcs encoding does not support errors='%s'",
6454 errors);
6455 return -1;
6456 }
6457
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006458 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006459 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006460 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6461 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 if (mbcssize == 0) {
6463 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6464 return -1;
6465 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006466 /* If we used a default char, then we failed! */
6467 if (pusedDefaultChar && *pusedDefaultChar)
6468 goto mbcs_encode_error;
6469 } else {
6470 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006471 }
6472
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006473 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 /* Create string object */
6475 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6476 if (*repr == NULL)
6477 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006478 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006479 }
6480 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* Extend string object */
6482 n = PyBytes_Size(*repr);
6483 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6484 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006485 }
6486
6487 /* Do the conversion */
6488 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006490 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6491 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6493 return -1;
6494 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006495 if (pusedDefaultChar && *pusedDefaultChar)
6496 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006497 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006498 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006499
6500mbcs_encode_error:
6501 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6502 Py_XDECREF(exc);
6503 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006504}
6505
Alexander Belopolsky40018472011-02-26 01:02:56 +00006506PyObject *
6507PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6508 Py_ssize_t size,
6509 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006510{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006511 PyObject *repr = NULL;
6512 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006514#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006516 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006517 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006518 else
6519#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006520 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006521
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006522 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 Py_XDECREF(repr);
6524 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006525 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006526
6527#ifdef NEED_RETRY
6528 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 p += INT_MAX;
6530 size -= INT_MAX;
6531 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006532 }
6533#endif
6534
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006535 return repr;
6536}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006537
Alexander Belopolsky40018472011-02-26 01:02:56 +00006538PyObject *
6539PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006540{
6541 if (!PyUnicode_Check(unicode)) {
6542 PyErr_BadArgument();
6543 return NULL;
6544 }
6545 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 PyUnicode_GET_SIZE(unicode),
6547 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006548}
6549
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006550#undef NEED_RETRY
6551
Victor Stinner99b95382011-07-04 14:23:54 +02006552#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554/* --- Character Mapping Codec -------------------------------------------- */
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556PyObject *
6557PyUnicode_DecodeCharmap(const char *s,
6558 Py_ssize_t size,
6559 PyObject *mapping,
6560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006563 Py_ssize_t startinpos;
6564 Py_ssize_t endinpos;
6565 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 PyUnicodeObject *v;
6568 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006570 PyObject *errorHandler = NULL;
6571 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006572 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 /* Default to Latin-1 */
6576 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
6579 v = _PyUnicode_New(size);
6580 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006586 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 mapstring = PyUnicode_AS_UNICODE(mapping);
6588 maplen = PyUnicode_GET_SIZE(mapping);
6589 while (s < e) {
6590 unsigned char ch = *s;
6591 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 if (ch < maplen)
6594 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 if (x == 0xfffe) {
6597 /* undefined mapping */
6598 outpos = p-PyUnicode_AS_UNICODE(v);
6599 startinpos = s-starts;
6600 endinpos = startinpos+1;
6601 if (unicode_decode_call_errorhandler(
6602 errors, &errorHandler,
6603 "charmap", "character maps to <undefined>",
6604 &starts, &e, &startinpos, &endinpos, &exc, &s,
6605 &v, &outpos, &p)) {
6606 goto onError;
6607 }
6608 continue;
6609 }
6610 *p++ = x;
6611 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006612 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006613 }
6614 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 while (s < e) {
6616 unsigned char ch = *s;
6617 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006618
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6620 w = PyLong_FromLong((long)ch);
6621 if (w == NULL)
6622 goto onError;
6623 x = PyObject_GetItem(mapping, w);
6624 Py_DECREF(w);
6625 if (x == NULL) {
6626 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6627 /* No mapping found means: mapping is undefined. */
6628 PyErr_Clear();
6629 x = Py_None;
6630 Py_INCREF(x);
6631 } else
6632 goto onError;
6633 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006634
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 /* Apply mapping */
6636 if (PyLong_Check(x)) {
6637 long value = PyLong_AS_LONG(x);
6638 if (value < 0 || value > 65535) {
6639 PyErr_SetString(PyExc_TypeError,
6640 "character mapping must be in range(65536)");
6641 Py_DECREF(x);
6642 goto onError;
6643 }
6644 *p++ = (Py_UNICODE)value;
6645 }
6646 else if (x == Py_None) {
6647 /* undefined mapping */
6648 outpos = p-PyUnicode_AS_UNICODE(v);
6649 startinpos = s-starts;
6650 endinpos = startinpos+1;
6651 if (unicode_decode_call_errorhandler(
6652 errors, &errorHandler,
6653 "charmap", "character maps to <undefined>",
6654 &starts, &e, &startinpos, &endinpos, &exc, &s,
6655 &v, &outpos, &p)) {
6656 Py_DECREF(x);
6657 goto onError;
6658 }
6659 Py_DECREF(x);
6660 continue;
6661 }
6662 else if (PyUnicode_Check(x)) {
6663 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006664
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 if (targetsize == 1)
6666 /* 1-1 mapping */
6667 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006668
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 else if (targetsize > 1) {
6670 /* 1-n mapping */
6671 if (targetsize > extrachars) {
6672 /* resize first */
6673 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6674 Py_ssize_t needed = (targetsize - extrachars) + \
6675 (targetsize << 2);
6676 extrachars += needed;
6677 /* XXX overflow detection missing */
6678 if (_PyUnicode_Resize(&v,
6679 PyUnicode_GET_SIZE(v) + needed) < 0) {
6680 Py_DECREF(x);
6681 goto onError;
6682 }
6683 p = PyUnicode_AS_UNICODE(v) + oldpos;
6684 }
6685 Py_UNICODE_COPY(p,
6686 PyUnicode_AS_UNICODE(x),
6687 targetsize);
6688 p += targetsize;
6689 extrachars -= targetsize;
6690 }
6691 /* 1-0 mapping: skip the character */
6692 }
6693 else {
6694 /* wrong return value */
6695 PyErr_SetString(PyExc_TypeError,
6696 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006697 Py_DECREF(x);
6698 goto onError;
6699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 Py_DECREF(x);
6701 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
6704 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6706 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 Py_XDECREF(errorHandler);
6708 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006709 if (PyUnicode_READY(v) == -1) {
6710 Py_DECREF(v);
6711 return NULL;
6712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006714
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 Py_XDECREF(errorHandler);
6717 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 Py_XDECREF(v);
6719 return NULL;
6720}
6721
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006722/* Charmap encoding: the lookup table */
6723
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 PyObject_HEAD
6726 unsigned char level1[32];
6727 int count2, count3;
6728 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729};
6730
6731static PyObject*
6732encoding_map_size(PyObject *obj, PyObject* args)
6733{
6734 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006735 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006737}
6738
6739static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 PyDoc_STR("Return the size (in bytes) of this object") },
6742 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006743};
6744
6745static void
6746encoding_map_dealloc(PyObject* o)
6747{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006748 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006749}
6750
6751static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006752 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 "EncodingMap", /*tp_name*/
6754 sizeof(struct encoding_map), /*tp_basicsize*/
6755 0, /*tp_itemsize*/
6756 /* methods */
6757 encoding_map_dealloc, /*tp_dealloc*/
6758 0, /*tp_print*/
6759 0, /*tp_getattr*/
6760 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006761 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 0, /*tp_repr*/
6763 0, /*tp_as_number*/
6764 0, /*tp_as_sequence*/
6765 0, /*tp_as_mapping*/
6766 0, /*tp_hash*/
6767 0, /*tp_call*/
6768 0, /*tp_str*/
6769 0, /*tp_getattro*/
6770 0, /*tp_setattro*/
6771 0, /*tp_as_buffer*/
6772 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6773 0, /*tp_doc*/
6774 0, /*tp_traverse*/
6775 0, /*tp_clear*/
6776 0, /*tp_richcompare*/
6777 0, /*tp_weaklistoffset*/
6778 0, /*tp_iter*/
6779 0, /*tp_iternext*/
6780 encoding_map_methods, /*tp_methods*/
6781 0, /*tp_members*/
6782 0, /*tp_getset*/
6783 0, /*tp_base*/
6784 0, /*tp_dict*/
6785 0, /*tp_descr_get*/
6786 0, /*tp_descr_set*/
6787 0, /*tp_dictoffset*/
6788 0, /*tp_init*/
6789 0, /*tp_alloc*/
6790 0, /*tp_new*/
6791 0, /*tp_free*/
6792 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006793};
6794
6795PyObject*
6796PyUnicode_BuildEncodingMap(PyObject* string)
6797{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006798 PyObject *result;
6799 struct encoding_map *mresult;
6800 int i;
6801 int need_dict = 0;
6802 unsigned char level1[32];
6803 unsigned char level2[512];
6804 unsigned char *mlevel1, *mlevel2, *mlevel3;
6805 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006806 int kind;
6807 void *data;
6808 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006810 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006811 PyErr_BadArgument();
6812 return NULL;
6813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006814 kind = PyUnicode_KIND(string);
6815 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006816 memset(level1, 0xFF, sizeof level1);
6817 memset(level2, 0xFF, sizeof level2);
6818
6819 /* If there isn't a one-to-one mapping of NULL to \0,
6820 or if there are non-BMP characters, we need to use
6821 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006822 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006823 need_dict = 1;
6824 for (i = 1; i < 256; i++) {
6825 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006826 ch = PyUnicode_READ(kind, data, i);
6827 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006828 need_dict = 1;
6829 break;
6830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006831 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006832 /* unmapped character */
6833 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006834 l1 = ch >> 11;
6835 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006836 if (level1[l1] == 0xFF)
6837 level1[l1] = count2++;
6838 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006839 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006840 }
6841
6842 if (count2 >= 0xFF || count3 >= 0xFF)
6843 need_dict = 1;
6844
6845 if (need_dict) {
6846 PyObject *result = PyDict_New();
6847 PyObject *key, *value;
6848 if (!result)
6849 return NULL;
6850 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006851 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006852 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006853 if (!key || !value)
6854 goto failed1;
6855 if (PyDict_SetItem(result, key, value) == -1)
6856 goto failed1;
6857 Py_DECREF(key);
6858 Py_DECREF(value);
6859 }
6860 return result;
6861 failed1:
6862 Py_XDECREF(key);
6863 Py_XDECREF(value);
6864 Py_DECREF(result);
6865 return NULL;
6866 }
6867
6868 /* Create a three-level trie */
6869 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6870 16*count2 + 128*count3 - 1);
6871 if (!result)
6872 return PyErr_NoMemory();
6873 PyObject_Init(result, &EncodingMapType);
6874 mresult = (struct encoding_map*)result;
6875 mresult->count2 = count2;
6876 mresult->count3 = count3;
6877 mlevel1 = mresult->level1;
6878 mlevel2 = mresult->level23;
6879 mlevel3 = mresult->level23 + 16*count2;
6880 memcpy(mlevel1, level1, 32);
6881 memset(mlevel2, 0xFF, 16*count2);
6882 memset(mlevel3, 0, 128*count3);
6883 count3 = 0;
6884 for (i = 1; i < 256; i++) {
6885 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006886 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006887 /* unmapped character */
6888 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006889 o1 = PyUnicode_READ(kind, data, i)>>11;
6890 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006891 i2 = 16*mlevel1[o1] + o2;
6892 if (mlevel2[i2] == 0xFF)
6893 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006894 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006895 i3 = 128*mlevel2[i2] + o3;
6896 mlevel3[i3] = i;
6897 }
6898 return result;
6899}
6900
6901static int
6902encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6903{
6904 struct encoding_map *map = (struct encoding_map*)mapping;
6905 int l1 = c>>11;
6906 int l2 = (c>>7) & 0xF;
6907 int l3 = c & 0x7F;
6908 int i;
6909
6910#ifdef Py_UNICODE_WIDE
6911 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006913 }
6914#endif
6915 if (c == 0)
6916 return 0;
6917 /* level 1*/
6918 i = map->level1[l1];
6919 if (i == 0xFF) {
6920 return -1;
6921 }
6922 /* level 2*/
6923 i = map->level23[16*i+l2];
6924 if (i == 0xFF) {
6925 return -1;
6926 }
6927 /* level 3 */
6928 i = map->level23[16*map->count2 + 128*i + l3];
6929 if (i == 0) {
6930 return -1;
6931 }
6932 return i;
6933}
6934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935/* Lookup the character ch in the mapping. If the character
6936 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006937 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006938static PyObject *
6939charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
Christian Heimes217cfd12007-12-02 14:31:20 +00006941 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 PyObject *x;
6943
6944 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 x = PyObject_GetItem(mapping, w);
6947 Py_DECREF(w);
6948 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6950 /* No mapping found means: mapping is undefined. */
6951 PyErr_Clear();
6952 x = Py_None;
6953 Py_INCREF(x);
6954 return x;
6955 } else
6956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006958 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006960 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 long value = PyLong_AS_LONG(x);
6962 if (value < 0 || value > 255) {
6963 PyErr_SetString(PyExc_TypeError,
6964 "character mapping must be in range(256)");
6965 Py_DECREF(x);
6966 return NULL;
6967 }
6968 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006970 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 /* wrong return value */
6974 PyErr_Format(PyExc_TypeError,
6975 "character mapping must return integer, bytes or None, not %.400s",
6976 x->ob_type->tp_name);
6977 Py_DECREF(x);
6978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 }
6980}
6981
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006982static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006983charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006984{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6986 /* exponentially overallocate to minimize reallocations */
6987 if (requiredsize < 2*outsize)
6988 requiredsize = 2*outsize;
6989 if (_PyBytes_Resize(outobj, requiredsize))
6990 return -1;
6991 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006992}
6993
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006997/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006998 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006999 space is available. Return a new reference to the object that
7000 was put in the output buffer, or Py_None, if the mapping was undefined
7001 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007002 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007003static charmapencode_result
7004charmapencode_output(Py_UNICODE c, PyObject *mapping,
7005 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007007 PyObject *rep;
7008 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007009 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010
Christian Heimes90aa7642007-12-19 02:45:37 +00007011 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007012 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007014 if (res == -1)
7015 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (outsize<requiredsize)
7017 if (charmapencode_resize(outobj, outpos, requiredsize))
7018 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007019 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 outstart[(*outpos)++] = (char)res;
7021 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007022 }
7023
7024 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007025 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007027 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 Py_DECREF(rep);
7029 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007030 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 if (PyLong_Check(rep)) {
7032 Py_ssize_t requiredsize = *outpos+1;
7033 if (outsize<requiredsize)
7034 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7035 Py_DECREF(rep);
7036 return enc_EXCEPTION;
7037 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007038 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007040 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 else {
7042 const char *repchars = PyBytes_AS_STRING(rep);
7043 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7044 Py_ssize_t requiredsize = *outpos+repsize;
7045 if (outsize<requiredsize)
7046 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7047 Py_DECREF(rep);
7048 return enc_EXCEPTION;
7049 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007050 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 memcpy(outstart + *outpos, repchars, repsize);
7052 *outpos += repsize;
7053 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007055 Py_DECREF(rep);
7056 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057}
7058
7059/* handle an error in PyUnicode_EncodeCharmap
7060 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007061static int
7062charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007063 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007065 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007066 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007067{
7068 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007069 Py_ssize_t repsize;
7070 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 Py_UNICODE *uni2;
7072 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007073 Py_ssize_t collstartpos = *inpos;
7074 Py_ssize_t collendpos = *inpos+1;
7075 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007076 char *encoding = "charmap";
7077 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007078 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007080 /* find all unencodable characters */
7081 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007082 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007083 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 int res = encoding_map_lookup(p[collendpos], mapping);
7085 if (res != -1)
7086 break;
7087 ++collendpos;
7088 continue;
7089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 rep = charmapencode_lookup(p[collendpos], mapping);
7092 if (rep==NULL)
7093 return -1;
7094 else if (rep!=Py_None) {
7095 Py_DECREF(rep);
7096 break;
7097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007098 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 }
7101 /* cache callback name lookup
7102 * (if not done yet, i.e. it's the first error) */
7103 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 if ((errors==NULL) || (!strcmp(errors, "strict")))
7105 *known_errorHandler = 1;
7106 else if (!strcmp(errors, "replace"))
7107 *known_errorHandler = 2;
7108 else if (!strcmp(errors, "ignore"))
7109 *known_errorHandler = 3;
7110 else if (!strcmp(errors, "xmlcharrefreplace"))
7111 *known_errorHandler = 4;
7112 else
7113 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007114 }
7115 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007116 case 1: /* strict */
7117 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7118 return -1;
7119 case 2: /* replace */
7120 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 x = charmapencode_output('?', mapping, res, respos);
7122 if (x==enc_EXCEPTION) {
7123 return -1;
7124 }
7125 else if (x==enc_FAILED) {
7126 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7127 return -1;
7128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007129 }
7130 /* fall through */
7131 case 3: /* ignore */
7132 *inpos = collendpos;
7133 break;
7134 case 4: /* xmlcharrefreplace */
7135 /* generate replacement (temporarily (mis)uses p) */
7136 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 char buffer[2+29+1+1];
7138 char *cp;
7139 sprintf(buffer, "&#%d;", (int)p[collpos]);
7140 for (cp = buffer; *cp; ++cp) {
7141 x = charmapencode_output(*cp, mapping, res, respos);
7142 if (x==enc_EXCEPTION)
7143 return -1;
7144 else if (x==enc_FAILED) {
7145 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7146 return -1;
7147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007148 }
7149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007150 *inpos = collendpos;
7151 break;
7152 default:
7153 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 encoding, reason, p, size, exceptionObject,
7155 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007156 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007158 if (PyBytes_Check(repunicode)) {
7159 /* Directly copy bytes result to output. */
7160 Py_ssize_t outsize = PyBytes_Size(*res);
7161 Py_ssize_t requiredsize;
7162 repsize = PyBytes_Size(repunicode);
7163 requiredsize = *respos + repsize;
7164 if (requiredsize > outsize)
7165 /* Make room for all additional bytes. */
7166 if (charmapencode_resize(res, respos, requiredsize)) {
7167 Py_DECREF(repunicode);
7168 return -1;
7169 }
7170 memcpy(PyBytes_AsString(*res) + *respos,
7171 PyBytes_AsString(repunicode), repsize);
7172 *respos += repsize;
7173 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007174 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007175 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007177 /* generate replacement */
7178 repsize = PyUnicode_GET_SIZE(repunicode);
7179 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 x = charmapencode_output(*uni2, mapping, res, respos);
7181 if (x==enc_EXCEPTION) {
7182 return -1;
7183 }
7184 else if (x==enc_FAILED) {
7185 Py_DECREF(repunicode);
7186 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7187 return -1;
7188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007189 }
7190 *inpos = newpos;
7191 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 }
7193 return 0;
7194}
7195
Alexander Belopolsky40018472011-02-26 01:02:56 +00007196PyObject *
7197PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7198 Py_ssize_t size,
7199 PyObject *mapping,
7200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 /* output object */
7203 PyObject *res = NULL;
7204 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007206 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208 PyObject *errorHandler = NULL;
7209 PyObject *exc = NULL;
7210 /* the following variable is used for caching string comparisons
7211 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7212 * 3=ignore, 4=xmlcharrefreplace */
7213 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215 /* Default to Latin-1 */
7216 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007219 /* allocate enough for a simple encoding without
7220 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007221 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 if (res == NULL)
7223 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007224 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 /* try to encode it */
7229 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7230 if (x==enc_EXCEPTION) /* error */
7231 goto onError;
7232 if (x==enc_FAILED) { /* unencodable character */
7233 if (charmap_encoding_error(p, size, &inpos, mapping,
7234 &exc,
7235 &known_errorHandler, &errorHandler, errors,
7236 &res, &respos)) {
7237 goto onError;
7238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 else
7241 /* done with this character => adjust input position */
7242 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007246 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007247 if (_PyBytes_Resize(&res, respos) < 0)
7248 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250 Py_XDECREF(exc);
7251 Py_XDECREF(errorHandler);
7252 return res;
7253
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255 Py_XDECREF(res);
7256 Py_XDECREF(exc);
7257 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 return NULL;
7259}
7260
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261PyObject *
7262PyUnicode_AsCharmapString(PyObject *unicode,
7263 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264{
7265 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 PyErr_BadArgument();
7267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 }
7269 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 PyUnicode_GET_SIZE(unicode),
7271 mapping,
7272 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273}
7274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007276static void
7277make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007278 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007279 Py_ssize_t startpos, Py_ssize_t endpos,
7280 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007283 *exceptionObject = _PyUnicodeTranslateError_Create(
7284 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285 }
7286 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7288 goto onError;
7289 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7290 goto onError;
7291 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7292 goto onError;
7293 return;
7294 onError:
7295 Py_DECREF(*exceptionObject);
7296 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 }
7298}
7299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007301static void
7302raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007303 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007304 Py_ssize_t startpos, Py_ssize_t endpos,
7305 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306{
7307 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311}
7312
7313/* error handling callback helper:
7314 build arguments, call the callback and check the arguments,
7315 put the result into newpos and return the replacement string, which
7316 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007317static PyObject *
7318unicode_translate_call_errorhandler(const char *errors,
7319 PyObject **errorHandler,
7320 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007321 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007322 Py_ssize_t startpos, Py_ssize_t endpos,
7323 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007325 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007326
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007327 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007328 PyObject *restuple;
7329 PyObject *resunicode;
7330
7331 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007333 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335 }
7336
7337 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007338 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341
7342 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007344 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007346 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007347 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 Py_DECREF(restuple);
7349 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350 }
7351 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 &resunicode, &i_newpos)) {
7353 Py_DECREF(restuple);
7354 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007356 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007357 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007358 else
7359 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007360 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7362 Py_DECREF(restuple);
7363 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007364 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007365 Py_INCREF(resunicode);
7366 Py_DECREF(restuple);
7367 return resunicode;
7368}
7369
7370/* Lookup the character ch in the mapping and put the result in result,
7371 which must be decrefed by the caller.
7372 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007375{
Christian Heimes217cfd12007-12-02 14:31:20 +00007376 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 PyObject *x;
7378
7379 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007381 x = PyObject_GetItem(mapping, w);
7382 Py_DECREF(w);
7383 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7385 /* No mapping found means: use 1:1 mapping. */
7386 PyErr_Clear();
7387 *result = NULL;
7388 return 0;
7389 } else
7390 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 }
7392 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 *result = x;
7394 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007395 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007396 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 long value = PyLong_AS_LONG(x);
7398 long max = PyUnicode_GetMax();
7399 if (value < 0 || value > max) {
7400 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007401 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 Py_DECREF(x);
7403 return -1;
7404 }
7405 *result = x;
7406 return 0;
7407 }
7408 else if (PyUnicode_Check(x)) {
7409 *result = x;
7410 return 0;
7411 }
7412 else {
7413 /* wrong return value */
7414 PyErr_SetString(PyExc_TypeError,
7415 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007416 Py_DECREF(x);
7417 return -1;
7418 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419}
7420/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 if not reallocate and adjust various state variables.
7422 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007424charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007427 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007428 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 /* exponentially overallocate to minimize reallocations */
7430 if (requiredsize < 2 * oldsize)
7431 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7433 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007435 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436 }
7437 return 0;
7438}
7439/* lookup the character, put the result in the output string and adjust
7440 various state variables. Return a new reference to the object that
7441 was put in the output buffer in *result, or Py_None, if the mapping was
7442 undefined (in which case no character was written).
7443 The called must decref result.
7444 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007445static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007446charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7447 PyObject *mapping, Py_UCS4 **output,
7448 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7452 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007454 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007457 }
7458 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007460 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007462 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007463 }
7464 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 Py_ssize_t repsize;
7466 if (PyUnicode_READY(*res) == -1)
7467 return -1;
7468 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 if (repsize==1) {
7470 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 }
7473 else if (repsize!=0) {
7474 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475 Py_ssize_t requiredsize = *opos +
7476 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 Py_ssize_t i;
7479 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007481 for(i = 0; i < repsize; i++)
7482 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007484 }
7485 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007487 return 0;
7488}
7489
Alexander Belopolsky40018472011-02-26 01:02:56 +00007490PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491_PyUnicode_TranslateCharmap(PyObject *input,
7492 PyObject *mapping,
7493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 /* input object */
7496 char *idata;
7497 Py_ssize_t size, i;
7498 int kind;
7499 /* output buffer */
7500 Py_UCS4 *output = NULL;
7501 Py_ssize_t osize;
7502 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007505 char *reason = "character maps to <undefined>";
7506 PyObject *errorHandler = NULL;
7507 PyObject *exc = NULL;
7508 /* the following variable is used for caching string comparisons
7509 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7510 * 3=ignore, 4=xmlcharrefreplace */
7511 int known_errorHandler = -1;
7512
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 PyErr_BadArgument();
7515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007518 if (PyUnicode_READY(input) == -1)
7519 return NULL;
7520 idata = (char*)PyUnicode_DATA(input);
7521 kind = PyUnicode_KIND(input);
7522 size = PyUnicode_GET_LENGTH(input);
7523 i = 0;
7524
7525 if (size == 0) {
7526 Py_INCREF(input);
7527 return input;
7528 }
7529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 /* allocate enough for a simple 1:1 translation without
7531 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532 osize = size;
7533 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7534 opos = 0;
7535 if (output == NULL) {
7536 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007540 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 /* try to encode it */
7542 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007543 if (charmaptranslate_output(input, i, mapping,
7544 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 Py_XDECREF(x);
7546 goto onError;
7547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007548 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007550 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 else { /* untranslatable character */
7552 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7553 Py_ssize_t repsize;
7554 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007557 Py_ssize_t collstart = i;
7558 Py_ssize_t collend = i+1;
7559 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 while (collend < size) {
7563 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 goto onError;
7565 Py_XDECREF(x);
7566 if (x!=Py_None)
7567 break;
7568 ++collend;
7569 }
7570 /* cache callback name lookup
7571 * (if not done yet, i.e. it's the first error) */
7572 if (known_errorHandler==-1) {
7573 if ((errors==NULL) || (!strcmp(errors, "strict")))
7574 known_errorHandler = 1;
7575 else if (!strcmp(errors, "replace"))
7576 known_errorHandler = 2;
7577 else if (!strcmp(errors, "ignore"))
7578 known_errorHandler = 3;
7579 else if (!strcmp(errors, "xmlcharrefreplace"))
7580 known_errorHandler = 4;
7581 else
7582 known_errorHandler = 0;
7583 }
7584 switch (known_errorHandler) {
7585 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007586 raise_translate_exception(&exc, input, collstart,
7587 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007588 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 case 2: /* replace */
7590 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007591 for (coll = collstart; coll<collend; coll++)
7592 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 /* fall through */
7594 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007595 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 break;
7597 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007598 /* generate replacement (temporarily (mis)uses i) */
7599 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 char buffer[2+29+1+1];
7601 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007602 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7603 if (charmaptranslate_makespace(&output, &osize,
7604 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 goto onError;
7606 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007607 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 break;
7611 default:
7612 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007613 reason, input, &exc,
7614 collstart, collend, &newpos);
7615 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 goto onError;
7617 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007618 repsize = PyUnicode_GET_LENGTH(repunicode);
7619 if (charmaptranslate_makespace(&output, &osize,
7620 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 Py_DECREF(repunicode);
7622 goto onError;
7623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007624 for (uni2 = 0; repsize-->0; ++uni2)
7625 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7626 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 }
7630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7632 if (!res)
7633 goto onError;
7634 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635 Py_XDECREF(exc);
7636 Py_XDECREF(errorHandler);
7637 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007640 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 Py_XDECREF(exc);
7642 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 return NULL;
7644}
7645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007646/* Deprecated. Use PyUnicode_Translate instead. */
7647PyObject *
7648PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7649 Py_ssize_t size,
7650 PyObject *mapping,
7651 const char *errors)
7652{
7653 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7654 if (!unicode)
7655 return NULL;
7656 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7657}
7658
Alexander Belopolsky40018472011-02-26 01:02:56 +00007659PyObject *
7660PyUnicode_Translate(PyObject *str,
7661 PyObject *mapping,
7662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663{
7664 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 str = PyUnicode_FromObject(str);
7667 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007669 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 Py_DECREF(str);
7671 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007672
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 Py_XDECREF(str);
7675 return NULL;
7676}
Tim Petersced69f82003-09-16 20:30:58 +00007677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007678static Py_UCS4
7679fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7680{
7681 /* No need to call PyUnicode_READY(self) because this function is only
7682 called as a callback from fixup() which does it already. */
7683 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7684 const int kind = PyUnicode_KIND(self);
7685 void *data = PyUnicode_DATA(self);
7686 Py_UCS4 maxchar = 0, ch, fixed;
7687 Py_ssize_t i;
7688
7689 for (i = 0; i < len; ++i) {
7690 ch = PyUnicode_READ(kind, data, i);
7691 fixed = 0;
7692 if (ch > 127) {
7693 if (Py_UNICODE_ISSPACE(ch))
7694 fixed = ' ';
7695 else {
7696 const int decimal = Py_UNICODE_TODECIMAL(ch);
7697 if (decimal >= 0)
7698 fixed = '0' + decimal;
7699 }
7700 if (fixed != 0) {
7701 if (fixed > maxchar)
7702 maxchar = fixed;
7703 PyUnicode_WRITE(kind, data, i, fixed);
7704 }
7705 else if (ch > maxchar)
7706 maxchar = ch;
7707 }
7708 else if (ch > maxchar)
7709 maxchar = ch;
7710 }
7711
7712 return maxchar;
7713}
7714
7715PyObject *
7716_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7717{
7718 if (!PyUnicode_Check(unicode)) {
7719 PyErr_BadInternalCall();
7720 return NULL;
7721 }
7722 if (PyUnicode_READY(unicode) == -1)
7723 return NULL;
7724 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7725 /* If the string is already ASCII, just return the same string */
7726 Py_INCREF(unicode);
7727 return unicode;
7728 }
7729 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7730}
7731
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007732PyObject *
7733PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7734 Py_ssize_t length)
7735{
7736 PyObject *result;
7737 Py_UNICODE *p; /* write pointer into result */
7738 Py_ssize_t i;
7739 /* Copy to a new string */
7740 result = (PyObject *)_PyUnicode_New(length);
7741 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7742 if (result == NULL)
7743 return result;
7744 p = PyUnicode_AS_UNICODE(result);
7745 /* Iterate over code points */
7746 for (i = 0; i < length; i++) {
7747 Py_UNICODE ch =s[i];
7748 if (ch > 127) {
7749 int decimal = Py_UNICODE_TODECIMAL(ch);
7750 if (decimal >= 0)
7751 p[i] = '0' + decimal;
7752 }
7753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7755 Py_DECREF(result);
7756 return NULL;
7757 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007758 return result;
7759}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007760/* --- Decimal Encoder ---------------------------------------------------- */
7761
Alexander Belopolsky40018472011-02-26 01:02:56 +00007762int
7763PyUnicode_EncodeDecimal(Py_UNICODE *s,
7764 Py_ssize_t length,
7765 char *output,
7766 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007767{
7768 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769 PyObject *errorHandler = NULL;
7770 PyObject *exc = NULL;
7771 const char *encoding = "decimal";
7772 const char *reason = "invalid decimal Unicode string";
7773 /* the following variable is used for caching string comparisons
7774 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7775 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007776
7777 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 PyErr_BadArgument();
7779 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007780 }
7781
7782 p = s;
7783 end = s + length;
7784 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 register Py_UNICODE ch = *p;
7786 int decimal;
7787 PyObject *repunicode;
7788 Py_ssize_t repsize;
7789 Py_ssize_t newpos;
7790 Py_UNICODE *uni2;
7791 Py_UNICODE *collstart;
7792 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007793
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007795 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 ++p;
7797 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007798 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 decimal = Py_UNICODE_TODECIMAL(ch);
7800 if (decimal >= 0) {
7801 *output++ = '0' + decimal;
7802 ++p;
7803 continue;
7804 }
7805 if (0 < ch && ch < 256) {
7806 *output++ = (char)ch;
7807 ++p;
7808 continue;
7809 }
7810 /* All other characters are considered unencodable */
7811 collstart = p;
7812 collend = p+1;
7813 while (collend < end) {
7814 if ((0 < *collend && *collend < 256) ||
7815 !Py_UNICODE_ISSPACE(*collend) ||
7816 Py_UNICODE_TODECIMAL(*collend))
7817 break;
7818 }
7819 /* cache callback name lookup
7820 * (if not done yet, i.e. it's the first error) */
7821 if (known_errorHandler==-1) {
7822 if ((errors==NULL) || (!strcmp(errors, "strict")))
7823 known_errorHandler = 1;
7824 else if (!strcmp(errors, "replace"))
7825 known_errorHandler = 2;
7826 else if (!strcmp(errors, "ignore"))
7827 known_errorHandler = 3;
7828 else if (!strcmp(errors, "xmlcharrefreplace"))
7829 known_errorHandler = 4;
7830 else
7831 known_errorHandler = 0;
7832 }
7833 switch (known_errorHandler) {
7834 case 1: /* strict */
7835 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7836 goto onError;
7837 case 2: /* replace */
7838 for (p = collstart; p < collend; ++p)
7839 *output++ = '?';
7840 /* fall through */
7841 case 3: /* ignore */
7842 p = collend;
7843 break;
7844 case 4: /* xmlcharrefreplace */
7845 /* generate replacement (temporarily (mis)uses p) */
7846 for (p = collstart; p < collend; ++p)
7847 output += sprintf(output, "&#%d;", (int)*p);
7848 p = collend;
7849 break;
7850 default:
7851 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7852 encoding, reason, s, length, &exc,
7853 collstart-s, collend-s, &newpos);
7854 if (repunicode == NULL)
7855 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007856 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007857 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007858 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7859 Py_DECREF(repunicode);
7860 goto onError;
7861 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 /* generate replacement */
7863 repsize = PyUnicode_GET_SIZE(repunicode);
7864 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7865 Py_UNICODE ch = *uni2;
7866 if (Py_UNICODE_ISSPACE(ch))
7867 *output++ = ' ';
7868 else {
7869 decimal = Py_UNICODE_TODECIMAL(ch);
7870 if (decimal >= 0)
7871 *output++ = '0' + decimal;
7872 else if (0 < ch && ch < 256)
7873 *output++ = (char)ch;
7874 else {
7875 Py_DECREF(repunicode);
7876 raise_encode_exception(&exc, encoding,
7877 s, length, collstart-s, collend-s, reason);
7878 goto onError;
7879 }
7880 }
7881 }
7882 p = s + newpos;
7883 Py_DECREF(repunicode);
7884 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007885 }
7886 /* 0-terminate the output string */
7887 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 Py_XDECREF(exc);
7889 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007890 return 0;
7891
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 Py_XDECREF(exc);
7894 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007895 return -1;
7896}
7897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898/* --- Helpers ------------------------------------------------------------ */
7899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900#include "stringlib/ucs1lib.h"
7901#include "stringlib/fastsearch.h"
7902#include "stringlib/partition.h"
7903#include "stringlib/split.h"
7904#include "stringlib/count.h"
7905#include "stringlib/find.h"
7906#include "stringlib/localeutil.h"
7907#include "stringlib/undef.h"
7908
7909#include "stringlib/ucs2lib.h"
7910#include "stringlib/fastsearch.h"
7911#include "stringlib/partition.h"
7912#include "stringlib/split.h"
7913#include "stringlib/count.h"
7914#include "stringlib/find.h"
7915#include "stringlib/localeutil.h"
7916#include "stringlib/undef.h"
7917
7918#include "stringlib/ucs4lib.h"
7919#include "stringlib/fastsearch.h"
7920#include "stringlib/partition.h"
7921#include "stringlib/split.h"
7922#include "stringlib/count.h"
7923#include "stringlib/find.h"
7924#include "stringlib/localeutil.h"
7925#include "stringlib/undef.h"
7926
7927static Py_ssize_t
7928any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7929 const Py_UCS1*, Py_ssize_t,
7930 Py_ssize_t, Py_ssize_t),
7931 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7932 const Py_UCS2*, Py_ssize_t,
7933 Py_ssize_t, Py_ssize_t),
7934 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7935 const Py_UCS4*, Py_ssize_t,
7936 Py_ssize_t, Py_ssize_t),
7937 PyObject* s1, PyObject* s2,
7938 Py_ssize_t start,
7939 Py_ssize_t end)
7940{
7941 int kind1, kind2, kind;
7942 void *buf1, *buf2;
7943 Py_ssize_t len1, len2, result;
7944
7945 kind1 = PyUnicode_KIND(s1);
7946 kind2 = PyUnicode_KIND(s2);
7947 kind = kind1 > kind2 ? kind1 : kind2;
7948 buf1 = PyUnicode_DATA(s1);
7949 buf2 = PyUnicode_DATA(s2);
7950 if (kind1 != kind)
7951 buf1 = _PyUnicode_AsKind(s1, kind);
7952 if (!buf1)
7953 return -2;
7954 if (kind2 != kind)
7955 buf2 = _PyUnicode_AsKind(s2, kind);
7956 if (!buf2) {
7957 if (kind1 != kind) PyMem_Free(buf1);
7958 return -2;
7959 }
7960 len1 = PyUnicode_GET_LENGTH(s1);
7961 len2 = PyUnicode_GET_LENGTH(s2);
7962
7963 switch(kind) {
7964 case PyUnicode_1BYTE_KIND:
7965 result = ucs1(buf1, len1, buf2, len2, start, end);
7966 break;
7967 case PyUnicode_2BYTE_KIND:
7968 result = ucs2(buf1, len1, buf2, len2, start, end);
7969 break;
7970 case PyUnicode_4BYTE_KIND:
7971 result = ucs4(buf1, len1, buf2, len2, start, end);
7972 break;
7973 default:
7974 assert(0); result = -2;
7975 }
7976
7977 if (kind1 != kind)
7978 PyMem_Free(buf1);
7979 if (kind2 != kind)
7980 PyMem_Free(buf2);
7981
7982 return result;
7983}
7984
7985Py_ssize_t
7986_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7987 Py_ssize_t n_buffer,
7988 void *digits, Py_ssize_t n_digits,
7989 Py_ssize_t min_width,
7990 const char *grouping,
7991 const char *thousands_sep)
7992{
7993 switch(kind) {
7994 case PyUnicode_1BYTE_KIND:
7995 return _PyUnicode_ucs1_InsertThousandsGrouping(
7996 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7997 min_width, grouping, thousands_sep);
7998 case PyUnicode_2BYTE_KIND:
7999 return _PyUnicode_ucs2_InsertThousandsGrouping(
8000 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8001 min_width, grouping, thousands_sep);
8002 case PyUnicode_4BYTE_KIND:
8003 return _PyUnicode_ucs4_InsertThousandsGrouping(
8004 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8005 min_width, grouping, thousands_sep);
8006 }
8007 assert(0);
8008 return -1;
8009}
8010
8011
Eric Smith8c663262007-08-25 02:26:07 +00008012#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008013#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008014
Thomas Wouters477c8d52006-05-27 19:21:47 +00008015#include "stringlib/count.h"
8016#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008017
/* helper macro to fixup start/end slice values: clamp `end` to `len`,
   make negative `start`/`end` relative to `len`, and clamp what is still
   negative to 0.  `start` and `end` must be modifiable lvalues; every
   argument is evaluated more than once, so pass side-effect-free
   expressions.  Expands to a single statement: follow it with ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008032
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033Py_ssize_t
8034PyUnicode_Count(PyObject *str,
8035 PyObject *substr,
8036 Py_ssize_t start,
8037 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008039 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008040 PyUnicodeObject* str_obj;
8041 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008042 int kind1, kind2, kind;
8043 void *buf1 = NULL, *buf2 = NULL;
8044 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008045
Thomas Wouters477c8d52006-05-27 19:21:47 +00008046 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008049 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008050 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 Py_DECREF(str_obj);
8052 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 }
Tim Petersced69f82003-09-16 20:30:58 +00008054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 kind1 = PyUnicode_KIND(str_obj);
8056 kind2 = PyUnicode_KIND(sub_obj);
8057 kind = kind1 > kind2 ? kind1 : kind2;
8058 buf1 = PyUnicode_DATA(str_obj);
8059 if (kind1 != kind)
8060 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8061 if (!buf1)
8062 goto onError;
8063 buf2 = PyUnicode_DATA(sub_obj);
8064 if (kind2 != kind)
8065 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8066 if (!buf2)
8067 goto onError;
8068 len1 = PyUnicode_GET_LENGTH(str_obj);
8069 len2 = PyUnicode_GET_LENGTH(sub_obj);
8070
8071 ADJUST_INDICES(start, end, len1);
8072 switch(kind) {
8073 case PyUnicode_1BYTE_KIND:
8074 result = ucs1lib_count(
8075 ((Py_UCS1*)buf1) + start, end - start,
8076 buf2, len2, PY_SSIZE_T_MAX
8077 );
8078 break;
8079 case PyUnicode_2BYTE_KIND:
8080 result = ucs2lib_count(
8081 ((Py_UCS2*)buf1) + start, end - start,
8082 buf2, len2, PY_SSIZE_T_MAX
8083 );
8084 break;
8085 case PyUnicode_4BYTE_KIND:
8086 result = ucs4lib_count(
8087 ((Py_UCS4*)buf1) + start, end - start,
8088 buf2, len2, PY_SSIZE_T_MAX
8089 );
8090 break;
8091 default:
8092 assert(0); result = 0;
8093 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008094
8095 Py_DECREF(sub_obj);
8096 Py_DECREF(str_obj);
8097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 if (kind1 != kind)
8099 PyMem_Free(buf1);
8100 if (kind2 != kind)
8101 PyMem_Free(buf2);
8102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104 onError:
8105 Py_DECREF(sub_obj);
8106 Py_DECREF(str_obj);
8107 if (kind1 != kind && buf1)
8108 PyMem_Free(buf1);
8109 if (kind2 != kind && buf2)
8110 PyMem_Free(buf2);
8111 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112}
8113
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114Py_ssize_t
8115PyUnicode_Find(PyObject *str,
8116 PyObject *sub,
8117 Py_ssize_t start,
8118 Py_ssize_t end,
8119 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008121 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008126 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 Py_DECREF(str);
8129 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130 }
Tim Petersced69f82003-09-16 20:30:58 +00008131
Thomas Wouters477c8d52006-05-27 19:21:47 +00008132 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 result = any_find_slice(
8134 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8135 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008136 );
8137 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 result = any_find_slice(
8139 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8140 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008141 );
8142
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008144 Py_DECREF(sub);
8145
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 return result;
8147}
8148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149Py_ssize_t
8150PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8151 Py_ssize_t start, Py_ssize_t end,
8152 int direction)
8153{
8154 char *result;
8155 int kind;
8156 if (PyUnicode_READY(str) == -1)
8157 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008158 if (start < 0 || end < 0) {
8159 PyErr_SetString(PyExc_IndexError, "string index out of range");
8160 return -2;
8161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008162 if (end > PyUnicode_GET_LENGTH(str))
8163 end = PyUnicode_GET_LENGTH(str);
8164 kind = PyUnicode_KIND(str);
8165 result = findchar(PyUnicode_1BYTE_DATA(str)
8166 + PyUnicode_KIND_SIZE(kind, start),
8167 kind,
8168 end-start, ch, direction);
8169 if (!result)
8170 return -1;
8171 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8172}
8173
Alexander Belopolsky40018472011-02-26 01:02:56 +00008174static int
8175tailmatch(PyUnicodeObject *self,
8176 PyUnicodeObject *substring,
8177 Py_ssize_t start,
8178 Py_ssize_t end,
8179 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 int kind_self;
8182 int kind_sub;
8183 void *data_self;
8184 void *data_sub;
8185 Py_ssize_t offset;
8186 Py_ssize_t i;
8187 Py_ssize_t end_sub;
8188
8189 if (PyUnicode_READY(self) == -1 ||
8190 PyUnicode_READY(substring) == -1)
8191 return 0;
8192
8193 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 return 1;
8195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8197 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201 kind_self = PyUnicode_KIND(self);
8202 data_self = PyUnicode_DATA(self);
8203 kind_sub = PyUnicode_KIND(substring);
8204 data_sub = PyUnicode_DATA(substring);
8205 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8206
8207 if (direction > 0)
8208 offset = end;
8209 else
8210 offset = start;
8211
8212 if (PyUnicode_READ(kind_self, data_self, offset) ==
8213 PyUnicode_READ(kind_sub, data_sub, 0) &&
8214 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8215 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8216 /* If both are of the same kind, memcmp is sufficient */
8217 if (kind_self == kind_sub) {
8218 return ! memcmp((char *)data_self +
8219 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8220 data_sub,
8221 PyUnicode_GET_LENGTH(substring) *
8222 PyUnicode_CHARACTER_SIZE(substring));
8223 }
8224 /* otherwise we have to compare each character by first accesing it */
8225 else {
8226 /* We do not need to compare 0 and len(substring)-1 because
8227 the if statement above ensured already that they are equal
8228 when we end up here. */
8229 // TODO: honor direction and do a forward or backwards search
8230 for (i = 1; i < end_sub; ++i) {
8231 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8232 PyUnicode_READ(kind_sub, data_sub, i))
8233 return 0;
8234 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 }
8238
8239 return 0;
8240}
8241
Alexander Belopolsky40018472011-02-26 01:02:56 +00008242Py_ssize_t
8243PyUnicode_Tailmatch(PyObject *str,
8244 PyObject *substr,
8245 Py_ssize_t start,
8246 Py_ssize_t end,
8247 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008249 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008250
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251 str = PyUnicode_FromObject(str);
8252 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 substr = PyUnicode_FromObject(substr);
8255 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 Py_DECREF(str);
8257 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 }
Tim Petersced69f82003-09-16 20:30:58 +00008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 (PyUnicodeObject *)substr,
8262 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 Py_DECREF(str);
8264 Py_DECREF(substr);
8265 return result;
8266}
8267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268/* Apply fixfct filter to the Unicode object self and return a
8269 reference to the modified object */
8270
Alexander Belopolsky40018472011-02-26 01:02:56 +00008271static PyObject *
8272fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 PyObject *u;
8276 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 if (PyUnicode_READY(self) == -1)
8279 return NULL;
8280 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8281 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8282 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8287 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 /* fix functions return the new maximum character in a string,
8290 if the kind of the resulting unicode object does not change,
8291 everything is fine. Otherwise we need to change the string kind
8292 and re-run the fix function. */
8293 maxchar_new = fixfct((PyUnicodeObject*)u);
8294 if (maxchar_new == 0)
8295 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8296 else if (maxchar_new <= 127)
8297 maxchar_new = 127;
8298 else if (maxchar_new <= 255)
8299 maxchar_new = 255;
8300 else if (maxchar_new <= 65535)
8301 maxchar_new = 65535;
8302 else
8303 maxchar_new = 1114111; /* 0x10ffff */
8304
8305 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 /* fixfct should return TRUE if it modified the buffer. If
8307 FALSE, return a reference to the original buffer instead
8308 (to save space, not time) */
8309 Py_INCREF(self);
8310 Py_DECREF(u);
8311 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 else if (maxchar_new == maxchar_old) {
8314 return u;
8315 }
8316 else {
8317 /* In case the maximum character changed, we need to
8318 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008319 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (v == NULL) {
8321 Py_DECREF(u);
8322 return NULL;
8323 }
8324 if (maxchar_new > maxchar_old) {
8325 /* If the maxchar increased so that the kind changed, not all
8326 characters are representable anymore and we need to fix the
8327 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008328 if (PyUnicode_CopyCharacters(v, 0,
8329 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008330 PyUnicode_GET_LENGTH(self)) < 0)
8331 {
8332 Py_DECREF(u);
8333 return NULL;
8334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 maxchar_old = fixfct((PyUnicodeObject*)v);
8336 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8337 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008338 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008339 if (PyUnicode_CopyCharacters(v, 0,
8340 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008341 PyUnicode_GET_LENGTH(self)) < 0)
8342 {
8343 Py_DECREF(u);
8344 return NULL;
8345 }
8346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347
8348 Py_DECREF(u);
8349 return v;
8350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351}
8352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 /* No need to call PyUnicode_READY(self) because this function is only
8357 called as a callback from fixup() which does it already. */
8358 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8359 const int kind = PyUnicode_KIND(self);
8360 void *data = PyUnicode_DATA(self);
8361 int touched = 0;
8362 Py_UCS4 maxchar = 0;
8363 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 for (i = 0; i < len; ++i) {
8366 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8367 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8368 if (up != ch) {
8369 if (up > maxchar)
8370 maxchar = up;
8371 PyUnicode_WRITE(kind, data, i, up);
8372 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 else if (ch > maxchar)
8375 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 }
8377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 if (touched)
8379 return maxchar;
8380 else
8381 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382}
8383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008385fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8388 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8389 const int kind = PyUnicode_KIND(self);
8390 void *data = PyUnicode_DATA(self);
8391 int touched = 0;
8392 Py_UCS4 maxchar = 0;
8393 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 for(i = 0; i < len; ++i) {
8396 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8397 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8398 if (lo != ch) {
8399 if (lo > maxchar)
8400 maxchar = lo;
8401 PyUnicode_WRITE(kind, data, i, lo);
8402 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 else if (ch > maxchar)
8405 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 }
8407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 if (touched)
8409 return maxchar;
8410 else
8411 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412}
8413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008415fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8418 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8419 const int kind = PyUnicode_KIND(self);
8420 void *data = PyUnicode_DATA(self);
8421 int touched = 0;
8422 Py_UCS4 maxchar = 0;
8423 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 for(i = 0; i < len; ++i) {
8426 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8427 Py_UCS4 nu = 0;
8428
8429 if (Py_UNICODE_ISUPPER(ch))
8430 nu = Py_UNICODE_TOLOWER(ch);
8431 else if (Py_UNICODE_ISLOWER(ch))
8432 nu = Py_UNICODE_TOUPPER(ch);
8433
8434 if (nu != 0) {
8435 if (nu > maxchar)
8436 maxchar = nu;
8437 PyUnicode_WRITE(kind, data, i, nu);
8438 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 else if (ch > maxchar)
8441 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 }
8443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 if (touched)
8445 return maxchar;
8446 else
8447 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448}
8449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8454 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8455 const int kind = PyUnicode_KIND(self);
8456 void *data = PyUnicode_DATA(self);
8457 int touched = 0;
8458 Py_UCS4 maxchar = 0;
8459 Py_ssize_t i = 0;
8460 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008461
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008462 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464
8465 ch = PyUnicode_READ(kind, data, i);
8466 if (!Py_UNICODE_ISUPPER(ch)) {
8467 maxchar = Py_UNICODE_TOUPPER(ch);
8468 PyUnicode_WRITE(kind, data, i, maxchar);
8469 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 ++i;
8472 for(; i < len; ++i) {
8473 ch = PyUnicode_READ(kind, data, i);
8474 if (!Py_UNICODE_ISLOWER(ch)) {
8475 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8476 if (lo > maxchar)
8477 maxchar = lo;
8478 PyUnicode_WRITE(kind, data, i, lo);
8479 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 else if (ch > maxchar)
8482 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484
8485 if (touched)
8486 return maxchar;
8487 else
8488 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489}
8490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008492fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8495 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8496 const int kind = PyUnicode_KIND(self);
8497 void *data = PyUnicode_DATA(self);
8498 Py_UCS4 maxchar = 0;
8499 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 int previous_is_cased;
8501
8502 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 if (len == 1) {
8504 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8505 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8506 if (ti != ch) {
8507 PyUnicode_WRITE(kind, data, i, ti);
8508 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 }
8510 else
8511 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 for(; i < len; ++i) {
8515 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8516 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008517
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 nu = Py_UNICODE_TOTITLE(ch);
8522
8523 if (nu > maxchar)
8524 maxchar = nu;
8525 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008526
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 if (Py_UNICODE_ISLOWER(ch) ||
8528 Py_UNICODE_ISUPPER(ch) ||
8529 Py_UNICODE_ISTITLE(ch))
8530 previous_is_cased = 1;
8531 else
8532 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535}
8536
Tim Peters8ce9f162004-08-27 01:49:32 +00008537PyObject *
8538PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008541 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008543 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008544 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8545 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008546 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 Py_ssize_t sz, i, res_offset;
8548 Py_UCS4 maxchar = 0;
8549 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Tim Peters05eba1f2004-08-27 21:32:02 +00008551 fseq = PySequence_Fast(seq, "");
8552 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008553 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008554 }
8555
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008556 /* NOTE: the following code can't call back into Python code,
8557 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008558 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008559
Tim Peters05eba1f2004-08-27 21:32:02 +00008560 seqlen = PySequence_Fast_GET_SIZE(fseq);
8561 /* If empty sequence, return u"". */
8562 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008565 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008566 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008567 /* If singleton sequence with an exact Unicode, return that. */
8568 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 item = items[0];
8570 if (PyUnicode_CheckExact(item)) {
8571 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 goto Done;
8574 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008575 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008576 else {
8577 /* Set up sep and seplen */
8578 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 /* fall back to a blank space separator */
8580 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008581 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008583 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008584 else {
8585 if (!PyUnicode_Check(separator)) {
8586 PyErr_Format(PyExc_TypeError,
8587 "separator: expected str instance,"
8588 " %.80s found",
8589 Py_TYPE(separator)->tp_name);
8590 goto onError;
8591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 if (PyUnicode_READY(separator) == -1)
8593 goto onError;
8594 sep = separator;
8595 seplen = PyUnicode_GET_LENGTH(separator);
8596 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8597 /* inc refcount to keep this code path symetric with the
8598 above case of a blank separator */
8599 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008600 }
8601 }
8602
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008603 /* There are at least two things to join, or else we have a subclass
8604 * of str in the sequence.
8605 * Do a pre-pass to figure out the total amount of space we'll
8606 * need (sz), and see whether all argument are strings.
8607 */
8608 sz = 0;
8609 for (i = 0; i < seqlen; i++) {
8610 const Py_ssize_t old_sz = sz;
8611 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 if (!PyUnicode_Check(item)) {
8613 PyErr_Format(PyExc_TypeError,
8614 "sequence item %zd: expected str instance,"
8615 " %.80s found",
8616 i, Py_TYPE(item)->tp_name);
8617 goto onError;
8618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 if (PyUnicode_READY(item) == -1)
8620 goto onError;
8621 sz += PyUnicode_GET_LENGTH(item);
8622 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8623 if (item_maxchar > maxchar)
8624 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008625 if (i != 0)
8626 sz += seplen;
8627 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8628 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008630 goto onError;
8631 }
8632 }
Tim Petersced69f82003-09-16 20:30:58 +00008633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008635 if (res == NULL)
8636 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008637
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008638 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008640 Py_ssize_t itemlen;
8641 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 /* Copy item, and maybe the separator. */
8644 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008645 if (PyUnicode_CopyCharacters(res, res_offset,
8646 sep, 0, seplen) < 0)
8647 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008650 if (PyUnicode_CopyCharacters(res, res_offset,
8651 item, 0, itemlen) < 0)
8652 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008656
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008658 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 Py_XDECREF(sep);
8660 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008663 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008665 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 return NULL;
8667}
8668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669#define FILL(kind, data, value, start, length) \
8670 do { \
8671 Py_ssize_t i_ = 0; \
8672 assert(kind != PyUnicode_WCHAR_KIND); \
8673 switch ((kind)) { \
8674 case PyUnicode_1BYTE_KIND: { \
8675 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8676 memset(to_, (unsigned char)value, length); \
8677 break; \
8678 } \
8679 case PyUnicode_2BYTE_KIND: { \
8680 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8681 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8682 break; \
8683 } \
8684 default: { \
8685 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8686 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8687 break; \
8688 } \
8689 } \
8690 } while (0)
8691
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692static PyUnicodeObject *
8693pad(PyUnicodeObject *self,
8694 Py_ssize_t left,
8695 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 PyObject *u;
8699 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008700 int kind;
8701 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702
8703 if (left < 0)
8704 left = 0;
8705 if (right < 0)
8706 right = 0;
8707
Tim Peters7a29bd52001-09-12 03:03:31 +00008708 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 Py_INCREF(self);
8710 return self;
8711 }
8712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8714 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008715 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8716 return NULL;
8717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8719 if (fill > maxchar)
8720 maxchar = fill;
8721 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008722 if (!u)
8723 return NULL;
8724
8725 kind = PyUnicode_KIND(u);
8726 data = PyUnicode_DATA(u);
8727 if (left)
8728 FILL(kind, data, fill, 0, left);
8729 if (right)
8730 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008731 if (PyUnicode_CopyCharacters(u, left,
8732 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008733 _PyUnicode_LENGTH(self)) < 0)
8734 {
8735 Py_DECREF(u);
8736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 }
8738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743PyObject *
8744PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
8748 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 switch(PyUnicode_KIND(string)) {
8753 case PyUnicode_1BYTE_KIND:
8754 list = ucs1lib_splitlines(
8755 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8756 PyUnicode_GET_LENGTH(string), keepends);
8757 break;
8758 case PyUnicode_2BYTE_KIND:
8759 list = ucs2lib_splitlines(
8760 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8761 PyUnicode_GET_LENGTH(string), keepends);
8762 break;
8763 case PyUnicode_4BYTE_KIND:
8764 list = ucs4lib_splitlines(
8765 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8766 PyUnicode_GET_LENGTH(string), keepends);
8767 break;
8768 default:
8769 assert(0);
8770 list = 0;
8771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 Py_DECREF(string);
8773 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774}
8775
Alexander Belopolsky40018472011-02-26 01:02:56 +00008776static PyObject *
8777split(PyUnicodeObject *self,
8778 PyUnicodeObject *substring,
8779 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 int kind1, kind2, kind;
8782 void *buf1, *buf2;
8783 Py_ssize_t len1, len2;
8784 PyObject* out;
8785
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008787 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 if (PyUnicode_READY(self) == -1)
8790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 if (substring == NULL)
8793 switch(PyUnicode_KIND(self)) {
8794 case PyUnicode_1BYTE_KIND:
8795 return ucs1lib_split_whitespace(
8796 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8797 PyUnicode_GET_LENGTH(self), maxcount
8798 );
8799 case PyUnicode_2BYTE_KIND:
8800 return ucs2lib_split_whitespace(
8801 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8802 PyUnicode_GET_LENGTH(self), maxcount
8803 );
8804 case PyUnicode_4BYTE_KIND:
8805 return ucs4lib_split_whitespace(
8806 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8807 PyUnicode_GET_LENGTH(self), maxcount
8808 );
8809 default:
8810 assert(0);
8811 return NULL;
8812 }
8813
8814 if (PyUnicode_READY(substring) == -1)
8815 return NULL;
8816
8817 kind1 = PyUnicode_KIND(self);
8818 kind2 = PyUnicode_KIND(substring);
8819 kind = kind1 > kind2 ? kind1 : kind2;
8820 buf1 = PyUnicode_DATA(self);
8821 buf2 = PyUnicode_DATA(substring);
8822 if (kind1 != kind)
8823 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8824 if (!buf1)
8825 return NULL;
8826 if (kind2 != kind)
8827 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8828 if (!buf2) {
8829 if (kind1 != kind) PyMem_Free(buf1);
8830 return NULL;
8831 }
8832 len1 = PyUnicode_GET_LENGTH(self);
8833 len2 = PyUnicode_GET_LENGTH(substring);
8834
8835 switch(kind) {
8836 case PyUnicode_1BYTE_KIND:
8837 out = ucs1lib_split(
8838 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8839 break;
8840 case PyUnicode_2BYTE_KIND:
8841 out = ucs2lib_split(
8842 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8843 break;
8844 case PyUnicode_4BYTE_KIND:
8845 out = ucs4lib_split(
8846 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8847 break;
8848 default:
8849 out = NULL;
8850 }
8851 if (kind1 != kind)
8852 PyMem_Free(buf1);
8853 if (kind2 != kind)
8854 PyMem_Free(buf2);
8855 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856}
8857
Alexander Belopolsky40018472011-02-26 01:02:56 +00008858static PyObject *
8859rsplit(PyUnicodeObject *self,
8860 PyUnicodeObject *substring,
8861 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 int kind1, kind2, kind;
8864 void *buf1, *buf2;
8865 Py_ssize_t len1, len2;
8866 PyObject* out;
8867
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008868 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008869 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 if (PyUnicode_READY(self) == -1)
8872 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 if (substring == NULL)
8875 switch(PyUnicode_KIND(self)) {
8876 case PyUnicode_1BYTE_KIND:
8877 return ucs1lib_rsplit_whitespace(
8878 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8879 PyUnicode_GET_LENGTH(self), maxcount
8880 );
8881 case PyUnicode_2BYTE_KIND:
8882 return ucs2lib_rsplit_whitespace(
8883 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8884 PyUnicode_GET_LENGTH(self), maxcount
8885 );
8886 case PyUnicode_4BYTE_KIND:
8887 return ucs4lib_rsplit_whitespace(
8888 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8889 PyUnicode_GET_LENGTH(self), maxcount
8890 );
8891 default:
8892 assert(0);
8893 return NULL;
8894 }
8895
8896 if (PyUnicode_READY(substring) == -1)
8897 return NULL;
8898
8899 kind1 = PyUnicode_KIND(self);
8900 kind2 = PyUnicode_KIND(substring);
8901 kind = kind1 > kind2 ? kind1 : kind2;
8902 buf1 = PyUnicode_DATA(self);
8903 buf2 = PyUnicode_DATA(substring);
8904 if (kind1 != kind)
8905 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8906 if (!buf1)
8907 return NULL;
8908 if (kind2 != kind)
8909 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8910 if (!buf2) {
8911 if (kind1 != kind) PyMem_Free(buf1);
8912 return NULL;
8913 }
8914 len1 = PyUnicode_GET_LENGTH(self);
8915 len2 = PyUnicode_GET_LENGTH(substring);
8916
8917 switch(kind) {
8918 case PyUnicode_1BYTE_KIND:
8919 out = ucs1lib_rsplit(
8920 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8921 break;
8922 case PyUnicode_2BYTE_KIND:
8923 out = ucs2lib_rsplit(
8924 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8925 break;
8926 case PyUnicode_4BYTE_KIND:
8927 out = ucs4lib_rsplit(
8928 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8929 break;
8930 default:
8931 out = NULL;
8932 }
8933 if (kind1 != kind)
8934 PyMem_Free(buf1);
8935 if (kind2 != kind)
8936 PyMem_Free(buf2);
8937 return out;
8938}
8939
8940static Py_ssize_t
8941anylib_find(int kind, void *buf1, Py_ssize_t len1,
8942 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8943{
8944 switch(kind) {
8945 case PyUnicode_1BYTE_KIND:
8946 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8947 case PyUnicode_2BYTE_KIND:
8948 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8949 case PyUnicode_4BYTE_KIND:
8950 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8951 }
8952 assert(0);
8953 return -1;
8954}
8955
8956static Py_ssize_t
8957anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8958 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8959{
8960 switch(kind) {
8961 case PyUnicode_1BYTE_KIND:
8962 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8963 case PyUnicode_2BYTE_KIND:
8964 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8965 case PyUnicode_4BYTE_KIND:
8966 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8967 }
8968 assert(0);
8969 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008970}
8971
Alexander Belopolsky40018472011-02-26 01:02:56 +00008972static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973replace(PyObject *self, PyObject *str1,
8974 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 PyObject *u;
8977 char *sbuf = PyUnicode_DATA(self);
8978 char *buf1 = PyUnicode_DATA(str1);
8979 char *buf2 = PyUnicode_DATA(str2);
8980 int srelease = 0, release1 = 0, release2 = 0;
8981 int skind = PyUnicode_KIND(self);
8982 int kind1 = PyUnicode_KIND(str1);
8983 int kind2 = PyUnicode_KIND(str2);
8984 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8985 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8986 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
8988 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008991 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 if (skind < kind1)
8994 /* substring too wide to be present */
8995 goto nothing;
8996
8997 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008998 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008999 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009001 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009003 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 Py_UCS4 u1, u2, maxchar;
9005 int mayshrink, rkind;
9006 u1 = PyUnicode_READ_CHAR(str1, 0);
9007 if (!findchar(sbuf, PyUnicode_KIND(self),
9008 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 u2 = PyUnicode_READ_CHAR(str2, 0);
9011 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9012 /* Replacing u1 with u2 may cause a maxchar reduction in the
9013 result string. */
9014 mayshrink = maxchar > 127;
9015 if (u2 > maxchar) {
9016 maxchar = u2;
9017 mayshrink = 0;
9018 }
9019 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009020 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009022 if (PyUnicode_CopyCharacters(u, 0,
9023 (PyObject*)self, 0, slen) < 0)
9024 {
9025 Py_DECREF(u);
9026 return NULL;
9027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 rkind = PyUnicode_KIND(u);
9029 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9030 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009031 if (--maxcount < 0)
9032 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 if (mayshrink) {
9036 PyObject *tmp = u;
9037 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9038 PyUnicode_GET_LENGTH(tmp));
9039 Py_DECREF(tmp);
9040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 int rkind = skind;
9043 char *res;
9044 if (kind1 < rkind) {
9045 /* widen substring */
9046 buf1 = _PyUnicode_AsKind(str1, rkind);
9047 if (!buf1) goto error;
9048 release1 = 1;
9049 }
9050 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 if (i < 0)
9052 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (rkind > kind2) {
9054 /* widen replacement */
9055 buf2 = _PyUnicode_AsKind(str2, rkind);
9056 if (!buf2) goto error;
9057 release2 = 1;
9058 }
9059 else if (rkind < kind2) {
9060 /* widen self and buf1 */
9061 rkind = kind2;
9062 if (release1) PyMem_Free(buf1);
9063 sbuf = _PyUnicode_AsKind(self, rkind);
9064 if (!sbuf) goto error;
9065 srelease = 1;
9066 buf1 = _PyUnicode_AsKind(str1, rkind);
9067 if (!buf1) goto error;
9068 release1 = 1;
9069 }
9070 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9071 if (!res) {
9072 PyErr_NoMemory();
9073 goto error;
9074 }
9075 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009076 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9078 buf2,
9079 PyUnicode_KIND_SIZE(rkind, len2));
9080 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009081
9082 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9084 slen-i,
9085 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009086 if (i == -1)
9087 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9089 buf2,
9090 PyUnicode_KIND_SIZE(rkind, len2));
9091 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093
9094 u = PyUnicode_FromKindAndData(rkind, res, slen);
9095 PyMem_Free(res);
9096 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 Py_ssize_t n, i, j, ires;
9101 Py_ssize_t product, new_size;
9102 int rkind = skind;
9103 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 if (kind1 < rkind) {
9106 buf1 = _PyUnicode_AsKind(str1, rkind);
9107 if (!buf1) goto error;
9108 release1 = 1;
9109 }
9110 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009111 if (n == 0)
9112 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 if (kind2 < rkind) {
9114 buf2 = _PyUnicode_AsKind(str2, rkind);
9115 if (!buf2) goto error;
9116 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 else if (kind2 > rkind) {
9119 rkind = kind2;
9120 sbuf = _PyUnicode_AsKind(self, rkind);
9121 if (!sbuf) goto error;
9122 srelease = 1;
9123 if (release1) PyMem_Free(buf1);
9124 buf1 = _PyUnicode_AsKind(str1, rkind);
9125 if (!buf1) goto error;
9126 release1 = 1;
9127 }
9128 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9129 PyUnicode_GET_LENGTH(str1))); */
9130 product = n * (len2-len1);
9131 if ((product / (len2-len1)) != n) {
9132 PyErr_SetString(PyExc_OverflowError,
9133 "replace string is too long");
9134 goto error;
9135 }
9136 new_size = slen + product;
9137 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9138 PyErr_SetString(PyExc_OverflowError,
9139 "replace string is too long");
9140 goto error;
9141 }
9142 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9143 if (!res)
9144 goto error;
9145 ires = i = 0;
9146 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009147 while (n-- > 0) {
9148 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 j = anylib_find(rkind,
9150 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9151 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009152 if (j == -1)
9153 break;
9154 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009155 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9157 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9158 PyUnicode_KIND_SIZE(rkind, j-i));
9159 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009160 }
9161 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 if (len2 > 0) {
9163 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9164 buf2,
9165 PyUnicode_KIND_SIZE(rkind, len2));
9166 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009171 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9173 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9174 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009175 } else {
9176 /* interleave */
9177 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9179 buf2,
9180 PyUnicode_KIND_SIZE(rkind, len2));
9181 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009182 if (--n <= 0)
9183 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9185 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9186 PyUnicode_KIND_SIZE(rkind, 1));
9187 ires++;
9188 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9191 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9192 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009195 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (srelease)
9198 PyMem_FREE(sbuf);
9199 if (release1)
9200 PyMem_FREE(buf1);
9201 if (release2)
9202 PyMem_FREE(buf2);
9203 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009204
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009206 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 if (srelease)
9208 PyMem_FREE(sbuf);
9209 if (release1)
9210 PyMem_FREE(buf1);
9211 if (release2)
9212 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213 if (PyUnicode_CheckExact(self)) {
9214 Py_INCREF(self);
9215 return (PyObject *) self;
9216 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009217 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 error:
9219 if (srelease && sbuf)
9220 PyMem_FREE(sbuf);
9221 if (release1 && buf1)
9222 PyMem_FREE(buf1);
9223 if (release2 && buf2)
9224 PyMem_FREE(buf2);
9225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226}
9227
9228/* --- Unicode Object Methods --------------------------------------------- */
9229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009230PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232\n\
9233Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009234characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235
9236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009237unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 return fixup(self, fixtitle);
9240}
9241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009242PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244\n\
9245Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009246have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247
9248static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009249unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 return fixup(self, fixcapitalize);
9252}
9253
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out (#if 0).  Splits self with split(self, NULL, -1), replaces
   every list entry by its fixcapitalize'd form and joins the list with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (result == NULL)
            goto done;
        /* swap the capitalized word into the slot of the original one */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9291
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009292/* Argument converter. Coerces to a single unicode character */
9293
9294static int
9295convert_uc(PyObject *obj, void *addr)
9296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009298 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009299
Benjamin Peterson14339b62009-01-31 16:36:08 +00009300 uniobj = PyUnicode_FromObject(obj);
9301 if (uniobj == NULL) {
9302 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009304 return 0;
9305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009307 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009309 Py_DECREF(uniobj);
9310 return 0;
9311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009313 Py_DECREF(uniobj);
9314 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009315}
9316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009317PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009320Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009321done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322
9323static PyObject *
9324unicode_center(PyUnicodeObject *self, PyObject *args)
9325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009326 Py_ssize_t marg, left;
9327 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 Py_UCS4 fillchar = ' ';
9329
Victor Stinnere9a29352011-10-01 02:14:59 +02009330 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332
Victor Stinnere9a29352011-10-01 02:14:59 +02009333 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 return NULL;
9335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337 Py_INCREF(self);
9338 return (PyObject*) self;
9339 }
9340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 left = marg / 2 + (marg & width & 1);
9343
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009344 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345}
9346
Marc-André Lemburge5034372000-08-08 08:04:29 +00009347#if 0
9348
9349/* This code should go into some future Unicode collation support
9350 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009351 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009352
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009353/* speedy UTF-16 code point order comparison */
9354/* gleaned from: */
9355/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9356
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009357static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009358{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009359 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009360 0, 0, 0, 0, 0, 0, 0, 0,
9361 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009362 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009363};
9364
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365static int
9366unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009368 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009369
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 Py_UNICODE *s1 = str1->str;
9371 Py_UNICODE *s2 = str2->str;
9372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 len1 = str1->_base._base.length;
9374 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009377 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009378
9379 c1 = *s1++;
9380 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009381
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 if (c1 > (1<<11) * 26)
9383 c1 += utf16Fixup[c1>>11];
9384 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009385 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009386 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009387
9388 if (c1 != c2)
9389 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009390
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009391 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
9393
9394 return (len1 < len2) ? -1 : (len1 != len2);
9395}
9396
Marc-André Lemburge5034372000-08-08 08:04:29 +00009397#else
9398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399/* This function assumes that str1 and str2 are readied by the caller. */
9400
Marc-André Lemburge5034372000-08-08 08:04:29 +00009401static int
9402unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 int kind1, kind2;
9405 void *data1, *data2;
9406 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 kind1 = PyUnicode_KIND(str1);
9409 kind2 = PyUnicode_KIND(str2);
9410 data1 = PyUnicode_DATA(str1);
9411 data2 = PyUnicode_DATA(str2);
9412 len1 = PyUnicode_GET_LENGTH(str1);
9413 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 for (i = 0; i < len1 && i < len2; ++i) {
9416 Py_UCS4 c1, c2;
9417 c1 = PyUnicode_READ(kind1, data1, i);
9418 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009419
9420 if (c1 != c2)
9421 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009422 }
9423
9424 return (len1 < len2) ? -1 : (len1 != len2);
9425}
9426
9427#endif
9428
Alexander Belopolsky40018472011-02-26 01:02:56 +00009429int
9430PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9433 if (PyUnicode_READY(left) == -1 ||
9434 PyUnicode_READY(right) == -1)
9435 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009436 return unicode_compare((PyUnicodeObject *)left,
9437 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009439 PyErr_Format(PyExc_TypeError,
9440 "Can't compare %.100s and %.100s",
9441 left->ob_type->tp_name,
9442 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 return -1;
9444}
9445
Martin v. Löwis5b222132007-06-10 09:51:05 +00009446int
9447PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 Py_ssize_t i;
9450 int kind;
9451 void *data;
9452 Py_UCS4 chr;
9453
Martin v. Löwis5b222132007-06-10 09:51:05 +00009454 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (PyUnicode_READY(uni) == -1)
9456 return -1;
9457 kind = PyUnicode_KIND(uni);
9458 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009459 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9461 if (chr != str[i])
9462 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009463 /* This check keeps Python strings that end in '\0' from comparing equal
9464 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009467 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009469 return 0;
9470}
9471
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009472
Benjamin Peterson29060642009-01-31 22:14:21 +00009473#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009474 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009475
Alexander Belopolsky40018472011-02-26 01:02:56 +00009476PyObject *
9477PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009478{
9479 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009480
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009481 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9482 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 if (PyUnicode_READY(left) == -1 ||
9484 PyUnicode_READY(right) == -1)
9485 return NULL;
9486 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9487 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009488 if (op == Py_EQ) {
9489 Py_INCREF(Py_False);
9490 return Py_False;
9491 }
9492 if (op == Py_NE) {
9493 Py_INCREF(Py_True);
9494 return Py_True;
9495 }
9496 }
9497 if (left == right)
9498 result = 0;
9499 else
9500 result = unicode_compare((PyUnicodeObject *)left,
9501 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009503 /* Convert the return value to a Boolean */
9504 switch (op) {
9505 case Py_EQ:
9506 v = TEST_COND(result == 0);
9507 break;
9508 case Py_NE:
9509 v = TEST_COND(result != 0);
9510 break;
9511 case Py_LE:
9512 v = TEST_COND(result <= 0);
9513 break;
9514 case Py_GE:
9515 v = TEST_COND(result >= 0);
9516 break;
9517 case Py_LT:
9518 v = TEST_COND(result == -1);
9519 break;
9520 case Py_GT:
9521 v = TEST_COND(result == 1);
9522 break;
9523 default:
9524 PyErr_BadArgument();
9525 return NULL;
9526 }
9527 Py_INCREF(v);
9528 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530
Brian Curtindfc80e32011-08-10 20:28:54 -05009531 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009532}
9533
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534int
9535PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009536{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009537 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 int kind1, kind2, kind;
9539 void *buf1, *buf2;
9540 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009541 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009542
9543 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009544 sub = PyUnicode_FromObject(element);
9545 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 PyErr_Format(PyExc_TypeError,
9547 "'in <string>' requires string as left operand, not %s",
9548 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009549 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 if (PyUnicode_READY(sub) == -1)
9552 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009553
Thomas Wouters477c8d52006-05-27 19:21:47 +00009554 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009555 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009556 Py_DECREF(sub);
9557 return -1;
9558 }
9559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 kind1 = PyUnicode_KIND(str);
9561 kind2 = PyUnicode_KIND(sub);
9562 kind = kind1 > kind2 ? kind1 : kind2;
9563 buf1 = PyUnicode_DATA(str);
9564 buf2 = PyUnicode_DATA(sub);
9565 if (kind1 != kind)
9566 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9567 if (!buf1) {
9568 Py_DECREF(sub);
9569 return -1;
9570 }
9571 if (kind2 != kind)
9572 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9573 if (!buf2) {
9574 Py_DECREF(sub);
9575 if (kind1 != kind) PyMem_Free(buf1);
9576 return -1;
9577 }
9578 len1 = PyUnicode_GET_LENGTH(str);
9579 len2 = PyUnicode_GET_LENGTH(sub);
9580
9581 switch(kind) {
9582 case PyUnicode_1BYTE_KIND:
9583 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9584 break;
9585 case PyUnicode_2BYTE_KIND:
9586 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9587 break;
9588 case PyUnicode_4BYTE_KIND:
9589 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9590 break;
9591 default:
9592 result = -1;
9593 assert(0);
9594 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009595
9596 Py_DECREF(str);
9597 Py_DECREF(sub);
9598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 if (kind1 != kind)
9600 PyMem_Free(buf1);
9601 if (kind2 != kind)
9602 PyMem_Free(buf2);
9603
Guido van Rossum403d68b2000-03-13 15:55:09 +00009604 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009605}
9606
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607/* Concat to string or Unicode object giving a new Unicode object. */
9608
Alexander Belopolsky40018472011-02-26 01:02:56 +00009609PyObject *
9610PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 PyObject *u = NULL, *v = NULL, *w;
9613 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614
9615 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
9623 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009624 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009628 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 }
9632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009634 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 w = PyUnicode_New(
9638 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9639 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009642 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9643 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009644 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009645 v, 0,
9646 PyUnicode_GET_LENGTH(v)) < 0)
9647 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 Py_DECREF(u);
9649 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 Py_XDECREF(u);
9654 Py_XDECREF(v);
9655 return NULL;
9656}
9657
Walter Dörwald1ab83302007-05-18 17:15:44 +00009658void
9659PyUnicode_Append(PyObject **pleft, PyObject *right)
9660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009661 PyObject *new;
9662 if (*pleft == NULL)
9663 return;
9664 if (right == NULL || !PyUnicode_Check(*pleft)) {
9665 Py_DECREF(*pleft);
9666 *pleft = NULL;
9667 return;
9668 }
9669 new = PyUnicode_Concat(*pleft, right);
9670 Py_DECREF(*pleft);
9671 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009672}
9673
9674void
9675PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9676{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 PyUnicode_Append(pleft, right);
9678 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009679}
9680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009681PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009685string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009686interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687
9688static PyObject *
9689unicode_count(PyUnicodeObject *self, PyObject *args)
9690{
9691 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009692 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009693 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 int kind1, kind2, kind;
9696 void *buf1, *buf2;
9697 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
Jesus Ceaac451502011-04-20 17:09:23 +02009699 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9700 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 kind1 = PyUnicode_KIND(self);
9704 kind2 = PyUnicode_KIND(substring);
9705 kind = kind1 > kind2 ? kind1 : kind2;
9706 buf1 = PyUnicode_DATA(self);
9707 buf2 = PyUnicode_DATA(substring);
9708 if (kind1 != kind)
9709 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9710 if (!buf1) {
9711 Py_DECREF(substring);
9712 return NULL;
9713 }
9714 if (kind2 != kind)
9715 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9716 if (!buf2) {
9717 Py_DECREF(substring);
9718 if (kind1 != kind) PyMem_Free(buf1);
9719 return NULL;
9720 }
9721 len1 = PyUnicode_GET_LENGTH(self);
9722 len2 = PyUnicode_GET_LENGTH(substring);
9723
9724 ADJUST_INDICES(start, end, len1);
9725 switch(kind) {
9726 case PyUnicode_1BYTE_KIND:
9727 iresult = ucs1lib_count(
9728 ((Py_UCS1*)buf1) + start, end - start,
9729 buf2, len2, PY_SSIZE_T_MAX
9730 );
9731 break;
9732 case PyUnicode_2BYTE_KIND:
9733 iresult = ucs2lib_count(
9734 ((Py_UCS2*)buf1) + start, end - start,
9735 buf2, len2, PY_SSIZE_T_MAX
9736 );
9737 break;
9738 case PyUnicode_4BYTE_KIND:
9739 iresult = ucs4lib_count(
9740 ((Py_UCS4*)buf1) + start, end - start,
9741 buf2, len2, PY_SSIZE_T_MAX
9742 );
9743 break;
9744 default:
9745 assert(0); iresult = 0;
9746 }
9747
9748 result = PyLong_FromSsize_t(iresult);
9749
9750 if (kind1 != kind)
9751 PyMem_Free(buf1);
9752 if (kind2 != kind)
9753 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754
9755 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009756
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 return result;
9758}
9759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009760PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009761 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009763Encode S using the codec registered for encoding. Default encoding\n\
9764is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009765handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009766a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9767'xmlcharrefreplace' as well as any other name registered with\n\
9768codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
9770static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009771unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009773 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774 char *encoding = NULL;
9775 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009776
Benjamin Peterson308d6372009-09-18 21:42:35 +00009777 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9778 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009780 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009781}
9782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009783PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785\n\
9786Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009787If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788
9789static PyObject*
9790unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9791{
9792 Py_UNICODE *e;
9793 Py_UNICODE *p;
9794 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009795 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797 PyUnicodeObject *u;
9798 int tabsize = 8;
9799
9800 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9804 return NULL;
9805
Thomas Wouters7e474022000-07-16 12:04:32 +00009806 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009807 i = 0; /* chars up to and including most recent \n or \r */
9808 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9810 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 if (tabsize > 0) {
9813 incr = tabsize - (j % tabsize); /* cannot overflow */
9814 if (j > PY_SSIZE_T_MAX - incr)
9815 goto overflow1;
9816 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009817 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009820 if (j > PY_SSIZE_T_MAX - 1)
9821 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822 j++;
9823 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009824 if (i > PY_SSIZE_T_MAX - j)
9825 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009827 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 }
9829 }
9830
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009831 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009833
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834 /* Second pass: create output string and fill it */
9835 u = _PyUnicode_New(i + j);
9836 if (!u)
9837 return NULL;
9838
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009839 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 q = _PyUnicode_WSTR(u); /* next output char */
9841 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 if (tabsize > 0) {
9846 i = tabsize - (j % tabsize);
9847 j += i;
9848 while (i--) {
9849 if (q >= qe)
9850 goto overflow2;
9851 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 else {
9856 if (q >= qe)
9857 goto overflow2;
9858 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009859 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860 if (*p == '\n' || *p == '\r')
9861 j = 0;
9862 }
9863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (PyUnicode_READY(u) == -1) {
9865 Py_DECREF(u);
9866 return NULL;
9867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009869
9870 overflow2:
9871 Py_DECREF(u);
9872 overflow1:
9873 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875}
9876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009877PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009878 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879\n\
9880Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009881such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882arguments start and end are interpreted as in slice notation.\n\
9883\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009884Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
9886static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Jesus Ceaac451502011-04-20 17:09:23 +02009889 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009890 Py_ssize_t start;
9891 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009892 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893
Jesus Ceaac451502011-04-20 17:09:23 +02009894 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9895 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (PyUnicode_READY(self) == -1)
9899 return NULL;
9900 if (PyUnicode_READY(substring) == -1)
9901 return NULL;
9902
9903 result = any_find_slice(
9904 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9905 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
9908 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (result == -2)
9911 return NULL;
9912
Christian Heimes217cfd12007-12-02 14:31:20 +00009913 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914}
9915
9916static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009917unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02009919 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
9920 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923}
9924
Guido van Rossumc2504932007-09-18 19:42:40 +00009925/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009926 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009927static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009928unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929{
Guido van Rossumc2504932007-09-18 19:42:40 +00009930 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009931 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 if (_PyUnicode_HASH(self) != -1)
9934 return _PyUnicode_HASH(self);
9935 if (PyUnicode_READY(self) == -1)
9936 return -1;
9937 len = PyUnicode_GET_LENGTH(self);
9938
9939 /* The hash function as a macro, gets expanded three times below. */
9940#define HASH(P) \
9941 x = (Py_uhash_t)*P << 7; \
9942 while (--len >= 0) \
9943 x = (1000003*x) ^ (Py_uhash_t)*P++;
9944
9945 switch (PyUnicode_KIND(self)) {
9946 case PyUnicode_1BYTE_KIND: {
9947 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9948 HASH(c);
9949 break;
9950 }
9951 case PyUnicode_2BYTE_KIND: {
9952 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9953 HASH(s);
9954 break;
9955 }
9956 default: {
9957 Py_UCS4 *l;
9958 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9959 "Impossible switch case in unicode_hash");
9960 l = PyUnicode_4BYTE_DATA(self);
9961 HASH(l);
9962 break;
9963 }
9964 }
9965 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9966
Guido van Rossumc2504932007-09-18 19:42:40 +00009967 if (x == -1)
9968 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009970 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009974PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009977Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
9979static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009982 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009983 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009984 Py_ssize_t start;
9985 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
Jesus Ceaac451502011-04-20 17:09:23 +02009987 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9988 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 if (PyUnicode_READY(self) == -1)
9992 return NULL;
9993 if (PyUnicode_READY(substring) == -1)
9994 return NULL;
9995
9996 result = any_find_slice(
9997 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9998 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009999 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
10001 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (result == -2)
10004 return NULL;
10005
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 if (result < 0) {
10007 PyErr_SetString(PyExc_ValueError, "substring not found");
10008 return NULL;
10009 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010010
Christian Heimes217cfd12007-12-02 14:31:20 +000010011 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012}
10013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010014PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010017Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010018at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
10020static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010021unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 Py_ssize_t i, length;
10024 int kind;
10025 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026 int cased;
10027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 if (PyUnicode_READY(self) == -1)
10029 return NULL;
10030 length = PyUnicode_GET_LENGTH(self);
10031 kind = PyUnicode_KIND(self);
10032 data = PyUnicode_DATA(self);
10033
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (length == 1)
10036 return PyBool_FromLong(
10037 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010039 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010042
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 for (i = 0; i < length; i++) {
10045 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010046
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10048 return PyBool_FromLong(0);
10049 else if (!cased && Py_UNICODE_ISLOWER(ch))
10050 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010052 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053}
10054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010055PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010058Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010059at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
10061static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010062unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 Py_ssize_t i, length;
10065 int kind;
10066 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067 int cased;
10068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (PyUnicode_READY(self) == -1)
10070 return NULL;
10071 length = PyUnicode_GET_LENGTH(self);
10072 kind = PyUnicode_KIND(self);
10073 data = PyUnicode_DATA(self);
10074
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (length == 1)
10077 return PyBool_FromLong(
10078 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010080 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010082 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010083
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 for (i = 0; i < length; i++) {
10086 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010087
Benjamin Peterson29060642009-01-31 22:14:21 +000010088 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10089 return PyBool_FromLong(0);
10090 else if (!cased && Py_UNICODE_ISUPPER(ch))
10091 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010093 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094}
10095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010096PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010099Return True if S is a titlecased string and there is at least one\n\
10100character in S, i.e. upper- and titlecase characters may only\n\
10101follow uncased characters and lowercase characters only cased ones.\n\
10102Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103
10104static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010105unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 Py_ssize_t i, length;
10108 int kind;
10109 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 int cased, previous_is_cased;
10111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (PyUnicode_READY(self) == -1)
10113 return NULL;
10114 length = PyUnicode_GET_LENGTH(self);
10115 kind = PyUnicode_KIND(self);
10116 data = PyUnicode_DATA(self);
10117
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (length == 1) {
10120 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10121 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10122 (Py_UNICODE_ISUPPER(ch) != 0));
10123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010125 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010127 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010128
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129 cased = 0;
10130 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 for (i = 0; i < length; i++) {
10132 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010133
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10135 if (previous_is_cased)
10136 return PyBool_FromLong(0);
10137 previous_is_cased = 1;
10138 cased = 1;
10139 }
10140 else if (Py_UNICODE_ISLOWER(ch)) {
10141 if (!previous_is_cased)
10142 return PyBool_FromLong(0);
10143 previous_is_cased = 1;
10144 cased = 1;
10145 }
10146 else
10147 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010149 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150}
10151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010152PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010153 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010155Return True if all characters in S are whitespace\n\
10156and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
10158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010159unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 Py_ssize_t i, length;
10162 int kind;
10163 void *data;
10164
10165 if (PyUnicode_READY(self) == -1)
10166 return NULL;
10167 length = PyUnicode_GET_LENGTH(self);
10168 kind = PyUnicode_KIND(self);
10169 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (length == 1)
10173 return PyBool_FromLong(
10174 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010176 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 for (i = 0; i < length; i++) {
10181 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010182 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010185 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186}
10187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010188PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010190\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010191Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010192and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010193
10194static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010195unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 Py_ssize_t i, length;
10198 int kind;
10199 void *data;
10200
10201 if (PyUnicode_READY(self) == -1)
10202 return NULL;
10203 length = PyUnicode_GET_LENGTH(self);
10204 kind = PyUnicode_KIND(self);
10205 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010206
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 if (length == 1)
10209 return PyBool_FromLong(
10210 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010211
10212 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 for (i = 0; i < length; i++) {
10217 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010218 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010219 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010220 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010221}
10222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010223PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010225\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010226Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010227and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010228
10229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010230unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 int kind;
10233 void *data;
10234 Py_ssize_t len, i;
10235
10236 if (PyUnicode_READY(self) == -1)
10237 return NULL;
10238
10239 kind = PyUnicode_KIND(self);
10240 data = PyUnicode_DATA(self);
10241 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010242
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010243 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (len == 1) {
10245 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10246 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10247 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010248
10249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010251 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 for (i = 0; i < len; i++) {
10254 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010255 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010258 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010259}
10260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010261PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010264Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010265False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
10267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010268unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 Py_ssize_t i, length;
10271 int kind;
10272 void *data;
10273
10274 if (PyUnicode_READY(self) == -1)
10275 return NULL;
10276 length = PyUnicode_GET_LENGTH(self);
10277 kind = PyUnicode_KIND(self);
10278 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (length == 1)
10282 return PyBool_FromLong(
10283 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 for (i = 0; i < length; i++) {
10290 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010293 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294}
10295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010296PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010299Return True if all characters in S are digits\n\
10300and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010303unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 Py_ssize_t i, length;
10306 int kind;
10307 void *data;
10308
10309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
10311 length = PyUnicode_GET_LENGTH(self);
10312 kind = PyUnicode_KIND(self);
10313 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (length == 1) {
10317 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10318 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010323 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 for (i = 0; i < length; i++) {
10326 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010327 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010329 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330}
10331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010332PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010333 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010335Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010336False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337
10338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010339unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 Py_ssize_t i, length;
10342 int kind;
10343 void *data;
10344
10345 if (PyUnicode_READY(self) == -1)
10346 return NULL;
10347 length = PyUnicode_GET_LENGTH(self);
10348 kind = PyUnicode_KIND(self);
10349 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 if (length == 1)
10353 return PyBool_FromLong(
10354 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010356 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 for (i = 0; i < length; i++) {
10361 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010364 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365}
10366
Martin v. Löwis47383402007-08-15 07:32:56 +000010367int
10368PyUnicode_IsIdentifier(PyObject *self)
10369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 int kind;
10371 void *data;
10372 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010373 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (PyUnicode_READY(self) == -1) {
10376 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010377 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 }
10379
10380 /* Special case for empty strings */
10381 if (PyUnicode_GET_LENGTH(self) == 0)
10382 return 0;
10383 kind = PyUnicode_KIND(self);
10384 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010385
10386 /* PEP 3131 says that the first character must be in
10387 XID_Start and subsequent characters in XID_Continue,
10388 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010389 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010390 letters, digits, underscore). However, given the current
10391 definition of XID_Start and XID_Continue, it is sufficient
10392 to check just for these, except that _ must be allowed
10393 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010395 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010396 return 0;
10397
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010398 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010400 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010401 return 1;
10402}
10403
10404PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010405 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010406\n\
10407Return True if S is a valid identifier according\n\
10408to the language definition.");
10409
10410static PyObject*
10411unicode_isidentifier(PyObject *self)
10412{
10413 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10414}
10415
Georg Brandl559e5d72008-06-11 18:37:52 +000010416PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010418\n\
10419Return True if all characters in S are considered\n\
10420printable in repr() or S is empty, False otherwise.");
10421
10422static PyObject*
10423unicode_isprintable(PyObject *self)
10424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 Py_ssize_t i, length;
10426 int kind;
10427 void *data;
10428
10429 if (PyUnicode_READY(self) == -1)
10430 return NULL;
10431 length = PyUnicode_GET_LENGTH(self);
10432 kind = PyUnicode_KIND(self);
10433 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010434
10435 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (length == 1)
10437 return PyBool_FromLong(
10438 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 for (i = 0; i < length; i++) {
10441 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010442 Py_RETURN_FALSE;
10443 }
10444 }
10445 Py_RETURN_TRUE;
10446}
10447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010448PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010449 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450\n\
10451Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010452iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453
10454static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010455unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010457 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458}
10459
Martin v. Löwis18e16552006-02-15 17:27:45 +000010460static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461unicode_length(PyUnicodeObject *self)
10462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 if (PyUnicode_READY(self) == -1)
10464 return -1;
10465 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466}
10467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010468PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010471Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010472done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
10474static PyObject *
10475unicode_ljust(PyUnicodeObject *self, PyObject *args)
10476{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010477 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 Py_UCS4 fillchar = ' ';
10479
10480 if (PyUnicode_READY(self) == -1)
10481 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010482
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010483 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 return NULL;
10485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 Py_INCREF(self);
10488 return (PyObject*) self;
10489 }
10490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492}
10493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010494PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010497Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
10499static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010500unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 return fixup(self, fixlower);
10503}
10504
/* Which end(s) of the string a strip operation works on */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: its format string minus the leading "|O:" */
#define STRIPNAME(i) (stripformat[i]+3)
10513
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010514/* externally visible for str.strip(unicode) */
10515PyObject *
10516_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 void *data;
10519 int kind;
10520 Py_ssize_t i, j, len;
10521 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10524 return NULL;
10525
10526 kind = PyUnicode_KIND(self);
10527 data = PyUnicode_DATA(self);
10528 len = PyUnicode_GET_LENGTH(self);
10529 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10530 PyUnicode_DATA(sepobj),
10531 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532
Benjamin Peterson14339b62009-01-31 16:36:08 +000010533 i = 0;
10534 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 while (i < len &&
10536 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 i++;
10538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 j = len;
10542 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 do {
10544 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 } while (j >= i &&
10546 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010547 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010548 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010549
Victor Stinner12bab6d2011-10-01 01:53:49 +020010550 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551}
10552
10553PyObject*
10554PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10555{
10556 unsigned char *data;
10557 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010558 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559
Victor Stinnerde636f32011-10-01 03:55:54 +020010560 if (PyUnicode_READY(self) == -1)
10561 return NULL;
10562
10563 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10564
Victor Stinner12bab6d2011-10-01 01:53:49 +020010565 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010567 if (PyUnicode_CheckExact(self)) {
10568 Py_INCREF(self);
10569 return self;
10570 }
10571 else
10572 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 }
10574
Victor Stinner12bab6d2011-10-01 01:53:49 +020010575 length = end - start;
10576 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010577 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578
Victor Stinnerde636f32011-10-01 03:55:54 +020010579 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010580 PyErr_SetString(PyExc_IndexError, "string index out of range");
10581 return NULL;
10582 }
10583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 kind = PyUnicode_KIND(self);
10585 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010586 return PyUnicode_FromKindAndData(kind,
10587 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010588 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
10591static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010592do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 int kind;
10595 void *data;
10596 Py_ssize_t len, i, j;
10597
10598 if (PyUnicode_READY(self) == -1)
10599 return NULL;
10600
10601 kind = PyUnicode_KIND(self);
10602 data = PyUnicode_DATA(self);
10603 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010604
Benjamin Peterson14339b62009-01-31 16:36:08 +000010605 i = 0;
10606 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010608 i++;
10609 }
10610 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010611
Benjamin Peterson14339b62009-01-31 16:36:08 +000010612 j = len;
10613 if (striptype != LEFTSTRIP) {
10614 do {
10615 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010617 j++;
10618 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010619
Victor Stinner12bab6d2011-10-01 01:53:49 +020010620 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621}
10622
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010623
10624static PyObject *
10625do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10626{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010627 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010628
Benjamin Peterson14339b62009-01-31 16:36:08 +000010629 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10630 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010631
Benjamin Peterson14339b62009-01-31 16:36:08 +000010632 if (sep != NULL && sep != Py_None) {
10633 if (PyUnicode_Check(sep))
10634 return _PyUnicode_XStrip(self, striptype, sep);
10635 else {
10636 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010637 "%s arg must be None or str",
10638 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010639 return NULL;
10640 }
10641 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010642
Benjamin Peterson14339b62009-01-31 16:36:08 +000010643 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010644}
10645
10646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010647PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010649\n\
10650Return a copy of the string S with leading and trailing\n\
10651whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010652If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010653
10654static PyObject *
10655unicode_strip(PyUnicodeObject *self, PyObject *args)
10656{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010657 if (PyTuple_GET_SIZE(args) == 0)
10658 return do_strip(self, BOTHSTRIP); /* Common case */
10659 else
10660 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010661}
10662
10663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010664PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010666\n\
10667Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010668If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010669
10670static PyObject *
10671unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10672{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010673 if (PyTuple_GET_SIZE(args) == 0)
10674 return do_strip(self, LEFTSTRIP); /* Common case */
10675 else
10676 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010677}
10678
10679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010680PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010682\n\
10683Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010684If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010685
10686static PyObject *
10687unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10688{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010689 if (PyTuple_GET_SIZE(args) == 0)
10690 return do_strip(self, RIGHTSTRIP); /* Common case */
10691 else
10692 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010693}
10694
10695
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010697unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698{
10699 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701
Georg Brandl222de0f2009-04-12 12:01:50 +000010702 if (len < 1) {
10703 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010704 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
Tim Peters7a29bd52001-09-12 03:03:31 +000010707 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708 /* no repeat, return original string */
10709 Py_INCREF(str);
10710 return (PyObject*) str;
10711 }
Tim Peters8f422462000-09-09 06:13:41 +000010712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (PyUnicode_READY(str) == -1)
10714 return NULL;
10715
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010716 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010717 PyErr_SetString(PyExc_OverflowError,
10718 "repeated string is too long");
10719 return NULL;
10720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 if (!u)
10725 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010726 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (PyUnicode_GET_LENGTH(str) == 1) {
10729 const int kind = PyUnicode_KIND(str);
10730 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10731 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010732 if (kind == PyUnicode_1BYTE_KIND)
10733 memset(to, (unsigned char)fill_char, len);
10734 else {
10735 for (n = 0; n < len; ++n)
10736 PyUnicode_WRITE(kind, to, n, fill_char);
10737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 }
10739 else {
10740 /* number of characters copied this far */
10741 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10742 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10743 char *to = (char *) PyUnicode_DATA(u);
10744 Py_MEMCPY(to, PyUnicode_DATA(str),
10745 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 n = (done <= nchars-done) ? done : nchars-done;
10748 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010749 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 }
10752
10753 return (PyObject*) u;
10754}
10755
Alexander Belopolsky40018472011-02-26 01:02:56 +000010756PyObject *
10757PyUnicode_Replace(PyObject *obj,
10758 PyObject *subobj,
10759 PyObject *replobj,
10760 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761{
10762 PyObject *self;
10763 PyObject *str1;
10764 PyObject *str2;
10765 PyObject *result;
10766
10767 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010768 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010771 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010772 Py_DECREF(self);
10773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 }
10775 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010776 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 Py_DECREF(self);
10778 Py_DECREF(str1);
10779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 Py_DECREF(self);
10783 Py_DECREF(str1);
10784 Py_DECREF(str2);
10785 return result;
10786}
10787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010788PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010789 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790\n\
10791Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010792old replaced by new. If the optional argument count is\n\
10793given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794
10795static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 PyObject *str1;
10799 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010800 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 PyObject *result;
10802
Martin v. Löwis18e16552006-02-15 17:27:45 +000010803 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010806 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 str1 = PyUnicode_FromObject(str1);
10808 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10809 return NULL;
10810 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010811 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 Py_DECREF(str1);
10813 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815
10816 result = replace(self, str1, str2, maxcount);
10817
10818 Py_DECREF(str1);
10819 Py_DECREF(str2);
10820 return result;
10821}
10822
Alexander Belopolsky40018472011-02-26 01:02:56 +000010823static PyObject *
10824unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010826 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 Py_ssize_t isize;
10828 Py_ssize_t osize, squote, dquote, i, o;
10829 Py_UCS4 max, quote;
10830 int ikind, okind;
10831 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010834 return NULL;
10835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 isize = PyUnicode_GET_LENGTH(unicode);
10837 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 /* Compute length of output, quote characters, and
10840 maximum character */
10841 osize = 2; /* quotes */
10842 max = 127;
10843 squote = dquote = 0;
10844 ikind = PyUnicode_KIND(unicode);
10845 for (i = 0; i < isize; i++) {
10846 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10847 switch (ch) {
10848 case '\'': squote++; osize++; break;
10849 case '"': dquote++; osize++; break;
10850 case '\\': case '\t': case '\r': case '\n':
10851 osize += 2; break;
10852 default:
10853 /* Fast-path ASCII */
10854 if (ch < ' ' || ch == 0x7f)
10855 osize += 4; /* \xHH */
10856 else if (ch < 0x7f)
10857 osize++;
10858 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10859 osize++;
10860 max = ch > max ? ch : max;
10861 }
10862 else if (ch < 0x100)
10863 osize += 4; /* \xHH */
10864 else if (ch < 0x10000)
10865 osize += 6; /* \uHHHH */
10866 else
10867 osize += 10; /* \uHHHHHHHH */
10868 }
10869 }
10870
10871 quote = '\'';
10872 if (squote) {
10873 if (dquote)
10874 /* Both squote and dquote present. Use squote,
10875 and escape them */
10876 osize += squote;
10877 else
10878 quote = '"';
10879 }
10880
10881 repr = PyUnicode_New(osize, max);
10882 if (repr == NULL)
10883 return NULL;
10884 okind = PyUnicode_KIND(repr);
10885 odata = PyUnicode_DATA(repr);
10886
10887 PyUnicode_WRITE(okind, odata, 0, quote);
10888 PyUnicode_WRITE(okind, odata, osize-1, quote);
10889
10890 for (i = 0, o = 1; i < isize; i++) {
10891 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010892
10893 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if ((ch == quote) || (ch == '\\')) {
10895 PyUnicode_WRITE(okind, odata, o++, '\\');
10896 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010897 continue;
10898 }
10899
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010901 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 PyUnicode_WRITE(okind, odata, o++, '\\');
10903 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010904 }
10905 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 PyUnicode_WRITE(okind, odata, o++, '\\');
10907 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010908 }
10909 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 PyUnicode_WRITE(okind, odata, o++, '\\');
10911 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010912 }
10913
10914 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010915 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 PyUnicode_WRITE(okind, odata, o++, '\\');
10917 PyUnicode_WRITE(okind, odata, o++, 'x');
10918 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10919 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010920 }
10921
Georg Brandl559e5d72008-06-11 18:37:52 +000010922 /* Copy ASCII characters as-is */
10923 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010925 }
10926
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010928 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010929 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010930 (categories Z* and C* except ASCII space)
10931 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010933 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (ch <= 0xff) {
10935 PyUnicode_WRITE(okind, odata, o++, '\\');
10936 PyUnicode_WRITE(okind, odata, o++, 'x');
10937 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10938 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010939 }
10940 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 else if (ch >= 0x10000) {
10942 PyUnicode_WRITE(okind, odata, o++, '\\');
10943 PyUnicode_WRITE(okind, odata, o++, 'U');
10944 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10945 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10946 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10947 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10948 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10949 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10950 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10951 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010952 }
10953 /* Map 16-bit characters to '\uxxxx' */
10954 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 PyUnicode_WRITE(okind, odata, o++, '\\');
10956 PyUnicode_WRITE(okind, odata, o++, 'u');
10957 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10958 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10959 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10960 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010961 }
10962 }
10963 /* Copy characters as-is */
10964 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010966 }
10967 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010970 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971}
10972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010973PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975\n\
10976Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010977such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978arguments start and end are interpreted as in slice notation.\n\
10979\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010980Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981
10982static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984{
Jesus Ceaac451502011-04-20 17:09:23 +020010985 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010986 Py_ssize_t start;
10987 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010988 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
Jesus Ceaac451502011-04-20 17:09:23 +020010990 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10991 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 if (PyUnicode_READY(self) == -1)
10995 return NULL;
10996 if (PyUnicode_READY(substring) == -1)
10997 return NULL;
10998
10999 result = any_find_slice(
11000 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11001 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011002 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (result == -2)
11007 return NULL;
11008
Christian Heimes217cfd12007-12-02 14:31:20 +000011009 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010}
11011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011012PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011015Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016
11017static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019{
Jesus Ceaac451502011-04-20 17:09:23 +020011020 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011021 Py_ssize_t start;
11022 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
Jesus Ceaac451502011-04-20 17:09:23 +020011025 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11026 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (PyUnicode_READY(self) == -1)
11030 return NULL;
11031 if (PyUnicode_READY(substring) == -1)
11032 return NULL;
11033
11034 result = any_find_slice(
11035 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11036 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011037 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038
11039 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (result == -2)
11042 return NULL;
11043
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044 if (result < 0) {
11045 PyErr_SetString(PyExc_ValueError, "substring not found");
11046 return NULL;
11047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048
Christian Heimes217cfd12007-12-02 14:31:20 +000011049 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050}
11051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011055Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011056done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
11058static PyObject *
11059unicode_rjust(PyUnicodeObject *self, PyObject *args)
11060{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011061 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 Py_UCS4 fillchar = ' ';
11063
Victor Stinnere9a29352011-10-01 02:14:59 +020011064 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011066
Victor Stinnere9a29352011-10-01 02:14:59 +020011067 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 return NULL;
11069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 Py_INCREF(self);
11072 return (PyObject*) self;
11073 }
11074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076}
11077
Alexander Belopolsky40018472011-02-26 01:02:56 +000011078PyObject *
11079PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080{
11081 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011082
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083 s = PyUnicode_FromObject(s);
11084 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011085 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011086 if (sep != NULL) {
11087 sep = PyUnicode_FromObject(sep);
11088 if (sep == NULL) {
11089 Py_DECREF(s);
11090 return NULL;
11091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092 }
11093
11094 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11095
11096 Py_DECREF(s);
11097 Py_XDECREF(sep);
11098 return result;
11099}
11100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
11104Return a list of the words in S, using sep as the\n\
11105delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011106splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011107whitespace string is a separator and empty strings are\n\
11108removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110static PyObject*
11111unicode_split(PyUnicodeObject *self, PyObject *args)
11112{
11113 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011114 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115
Martin v. Löwis18e16552006-02-15 17:27:45 +000011116 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 return NULL;
11118
11119 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011124 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125}
11126
Thomas Wouters477c8d52006-05-27 19:21:47 +000011127PyObject *
11128PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11129{
11130 PyObject* str_obj;
11131 PyObject* sep_obj;
11132 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 int kind1, kind2, kind;
11134 void *buf1 = NULL, *buf2 = NULL;
11135 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011136
11137 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011138 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011140 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011142 Py_DECREF(str_obj);
11143 return NULL;
11144 }
11145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 kind1 = PyUnicode_KIND(str_in);
11147 kind2 = PyUnicode_KIND(sep_obj);
11148 kind = kind1 > kind2 ? kind1 : kind2;
11149 buf1 = PyUnicode_DATA(str_in);
11150 if (kind1 != kind)
11151 buf1 = _PyUnicode_AsKind(str_in, kind);
11152 if (!buf1)
11153 goto onError;
11154 buf2 = PyUnicode_DATA(sep_obj);
11155 if (kind2 != kind)
11156 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11157 if (!buf2)
11158 goto onError;
11159 len1 = PyUnicode_GET_LENGTH(str_obj);
11160 len2 = PyUnicode_GET_LENGTH(sep_obj);
11161
11162 switch(PyUnicode_KIND(str_in)) {
11163 case PyUnicode_1BYTE_KIND:
11164 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11165 break;
11166 case PyUnicode_2BYTE_KIND:
11167 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11168 break;
11169 case PyUnicode_4BYTE_KIND:
11170 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11171 break;
11172 default:
11173 assert(0);
11174 out = 0;
11175 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011176
11177 Py_DECREF(sep_obj);
11178 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (kind1 != kind)
11180 PyMem_Free(buf1);
11181 if (kind2 != kind)
11182 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011183
11184 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 onError:
11186 Py_DECREF(sep_obj);
11187 Py_DECREF(str_obj);
11188 if (kind1 != kind && buf1)
11189 PyMem_Free(buf1);
11190 if (kind2 != kind && buf2)
11191 PyMem_Free(buf2);
11192 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193}
11194
11195
11196PyObject *
11197PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11198{
11199 PyObject* str_obj;
11200 PyObject* sep_obj;
11201 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 int kind1, kind2, kind;
11203 void *buf1 = NULL, *buf2 = NULL;
11204 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011205
11206 str_obj = PyUnicode_FromObject(str_in);
11207 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209 sep_obj = PyUnicode_FromObject(sep_in);
11210 if (!sep_obj) {
11211 Py_DECREF(str_obj);
11212 return NULL;
11213 }
11214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 kind1 = PyUnicode_KIND(str_in);
11216 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011217 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 buf1 = PyUnicode_DATA(str_in);
11219 if (kind1 != kind)
11220 buf1 = _PyUnicode_AsKind(str_in, kind);
11221 if (!buf1)
11222 goto onError;
11223 buf2 = PyUnicode_DATA(sep_obj);
11224 if (kind2 != kind)
11225 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11226 if (!buf2)
11227 goto onError;
11228 len1 = PyUnicode_GET_LENGTH(str_obj);
11229 len2 = PyUnicode_GET_LENGTH(sep_obj);
11230
11231 switch(PyUnicode_KIND(str_in)) {
11232 case PyUnicode_1BYTE_KIND:
11233 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11234 break;
11235 case PyUnicode_2BYTE_KIND:
11236 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11237 break;
11238 case PyUnicode_4BYTE_KIND:
11239 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11240 break;
11241 default:
11242 assert(0);
11243 out = 0;
11244 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011245
11246 Py_DECREF(sep_obj);
11247 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (kind1 != kind)
11249 PyMem_Free(buf1);
11250 if (kind2 != kind)
11251 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252
11253 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 onError:
11255 Py_DECREF(sep_obj);
11256 Py_DECREF(str_obj);
11257 if (kind1 != kind && buf1)
11258 PyMem_Free(buf1);
11259 if (kind2 != kind && buf2)
11260 PyMem_Free(buf2);
11261 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011262}
11263
11264PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011266\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011267Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011268the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011269found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011270
11271static PyObject*
11272unicode_partition(PyUnicodeObject *self, PyObject *separator)
11273{
11274 return PyUnicode_Partition((PyObject *)self, separator);
11275}
11276
11277PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011278 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011279\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011280Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011281the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011282separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011283
11284static PyObject*
11285unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11286{
11287 return PyUnicode_RPartition((PyObject *)self, separator);
11288}
11289
Alexander Belopolsky40018472011-02-26 01:02:56 +000011290PyObject *
11291PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011292{
11293 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011294
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011295 s = PyUnicode_FromObject(s);
11296 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011297 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 if (sep != NULL) {
11299 sep = PyUnicode_FromObject(sep);
11300 if (sep == NULL) {
11301 Py_DECREF(s);
11302 return NULL;
11303 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011304 }
11305
11306 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11307
11308 Py_DECREF(s);
11309 Py_XDECREF(sep);
11310 return result;
11311}
11312
11313PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011315\n\
11316Return a list of the words in S, using sep as the\n\
11317delimiter string, starting at the end of the string and\n\
11318working to the front. If maxsplit is given, at most maxsplit\n\
11319splits are done. If sep is not specified, any whitespace string\n\
11320is a separator.");
11321
11322static PyObject*
11323unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11324{
11325 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011326 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011327
Martin v. Löwis18e16552006-02-15 17:27:45 +000011328 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011329 return NULL;
11330
11331 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011333 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011335 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011337}
11338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341\n\
11342Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011343Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
11346static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011347unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011349 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011350 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011352 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11353 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354 return NULL;
11355
Guido van Rossum86662912000-04-11 15:38:46 +000011356 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357}
11358
11359static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011360PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Walter Dörwald346737f2007-05-31 10:44:43 +000011362 if (PyUnicode_CheckExact(self)) {
11363 Py_INCREF(self);
11364 return self;
11365 } else
11366 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011367 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368}
11369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011370PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372\n\
11373Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
11376static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011377unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379 return fixup(self, fixswapcase);
11380}
11381
Georg Brandlceee0772007-11-27 23:48:05 +000011382PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011384\n\
11385Return a translation table usable for str.translate().\n\
11386If there is only one argument, it must be a dictionary mapping Unicode\n\
11387ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011388Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011389If there are two arguments, they must be strings of equal length, and\n\
11390in the resulting dictionary, each character in x will be mapped to the\n\
11391character at the same position in y. If there is a third argument, it\n\
11392must be a string, whose characters will be mapped to None in the result.");
11393
11394static PyObject*
11395unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11396{
11397 PyObject *x, *y = NULL, *z = NULL;
11398 PyObject *new = NULL, *key, *value;
11399 Py_ssize_t i = 0;
11400 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401
Georg Brandlceee0772007-11-27 23:48:05 +000011402 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11403 return NULL;
11404 new = PyDict_New();
11405 if (!new)
11406 return NULL;
11407 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 int x_kind, y_kind, z_kind;
11409 void *x_data, *y_data, *z_data;
11410
Georg Brandlceee0772007-11-27 23:48:05 +000011411 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011412 if (!PyUnicode_Check(x)) {
11413 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11414 "be a string if there is a second argument");
11415 goto err;
11416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011418 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11419 "arguments must have equal length");
11420 goto err;
11421 }
11422 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 x_kind = PyUnicode_KIND(x);
11424 y_kind = PyUnicode_KIND(y);
11425 x_data = PyUnicode_DATA(x);
11426 y_data = PyUnicode_DATA(y);
11427 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11428 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11429 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011430 if (!key || !value)
11431 goto err;
11432 res = PyDict_SetItem(new, key, value);
11433 Py_DECREF(key);
11434 Py_DECREF(value);
11435 if (res < 0)
11436 goto err;
11437 }
11438 /* create entries for deleting chars in z */
11439 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 z_kind = PyUnicode_KIND(z);
11441 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011442 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011444 if (!key)
11445 goto err;
11446 res = PyDict_SetItem(new, key, Py_None);
11447 Py_DECREF(key);
11448 if (res < 0)
11449 goto err;
11450 }
11451 }
11452 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 int kind;
11454 void *data;
11455
Georg Brandlceee0772007-11-27 23:48:05 +000011456 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011457 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011458 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11459 "to maketrans it must be a dict");
11460 goto err;
11461 }
11462 /* copy entries into the new dict, converting string keys to int keys */
11463 while (PyDict_Next(x, &i, &key, &value)) {
11464 if (PyUnicode_Check(key)) {
11465 /* convert string keys to integer keys */
11466 PyObject *newkey;
11467 if (PyUnicode_GET_SIZE(key) != 1) {
11468 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11469 "table must be of length 1");
11470 goto err;
11471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 kind = PyUnicode_KIND(key);
11473 data = PyUnicode_DATA(key);
11474 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011475 if (!newkey)
11476 goto err;
11477 res = PyDict_SetItem(new, newkey, value);
11478 Py_DECREF(newkey);
11479 if (res < 0)
11480 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011481 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011482 /* just keep integer keys */
11483 if (PyDict_SetItem(new, key, value) < 0)
11484 goto err;
11485 } else {
11486 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11487 "be strings or integers");
11488 goto err;
11489 }
11490 }
11491 }
11492 return new;
11493 err:
11494 Py_DECREF(new);
11495 return NULL;
11496}
11497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500\n\
11501Return a copy of the string S, where all characters have been mapped\n\
11502through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011503Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011504Unmapped characters are left untouched. Characters mapped to None\n\
11505are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511}
11512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
11518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011519unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 return fixup(self, fixupper);
11522}
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011527Pad a numeric string S with zeros on the left, to fill a field\n\
11528of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
11530static PyObject *
11531unicode_zfill(PyUnicodeObject *self, PyObject *args)
11532{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011533 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011535 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 int kind;
11537 void *data;
11538 Py_UCS4 chr;
11539
11540 if (PyUnicode_READY(self) == -1)
11541 return NULL;
11542
Martin v. Löwis18e16552006-02-15 17:27:45 +000011543 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 return NULL;
11545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011547 if (PyUnicode_CheckExact(self)) {
11548 Py_INCREF(self);
11549 return (PyObject*) self;
11550 }
11551 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011552 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 }
11554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
11557 u = pad(self, fill, 0, '0');
11558
Walter Dörwald068325e2002-04-15 13:36:47 +000011559 if (u == NULL)
11560 return NULL;
11561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 kind = PyUnicode_KIND(u);
11563 data = PyUnicode_DATA(u);
11564 chr = PyUnicode_READ(kind, data, fill);
11565
11566 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 PyUnicode_WRITE(kind, data, 0, chr);
11569 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 }
11571
11572 return (PyObject*) u;
11573}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574
11575#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011576static PyObject *
11577unicode__decimal2ascii(PyObject *self)
11578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011580}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581#endif
11582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011586Return True if S starts with the specified prefix, False otherwise.\n\
11587With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011588With optional end, stop comparing S at that position.\n\
11589prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
11591static PyObject *
11592unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011595 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011597 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011598 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011599 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Jesus Ceaac451502011-04-20 17:09:23 +020011601 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011603 if (PyTuple_Check(subobj)) {
11604 Py_ssize_t i;
11605 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11606 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011608 if (substring == NULL)
11609 return NULL;
11610 result = tailmatch(self, substring, start, end, -1);
11611 Py_DECREF(substring);
11612 if (result) {
11613 Py_RETURN_TRUE;
11614 }
11615 }
11616 /* nothing matched */
11617 Py_RETURN_FALSE;
11618 }
11619 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011620 if (substring == NULL) {
11621 if (PyErr_ExceptionMatches(PyExc_TypeError))
11622 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11623 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011625 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011626 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011628 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629}
11630
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011635Return True if S ends with the specified suffix, False otherwise.\n\
11636With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011637With optional end, stop comparing S at that position.\n\
11638suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639
11640static PyObject *
11641unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011644 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011646 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011647 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011648 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Jesus Ceaac451502011-04-20 17:09:23 +020011650 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011652 if (PyTuple_Check(subobj)) {
11653 Py_ssize_t i;
11654 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11655 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011657 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011659 result = tailmatch(self, substring, start, end, +1);
11660 Py_DECREF(substring);
11661 if (result) {
11662 Py_RETURN_TRUE;
11663 }
11664 }
11665 Py_RETURN_FALSE;
11666 }
11667 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011668 if (substring == NULL) {
11669 if (PyErr_ExceptionMatches(PyExc_TypeError))
11670 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11671 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011673 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011674 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011676 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677}
11678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011680
11681PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011683\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011684Return a formatted version of S, using substitutions from args and kwargs.\n\
11685The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011686
Eric Smith27bbca62010-11-04 17:06:58 +000011687PyDoc_STRVAR(format_map__doc__,
11688 "S.format_map(mapping) -> str\n\
11689\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011690Return a formatted version of S, using substitutions from mapping.\n\
11691The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011692
Eric Smith4a7d76d2008-05-30 18:10:19 +000011693static PyObject *
11694unicode__format__(PyObject* self, PyObject* args)
11695{
11696 PyObject *format_spec;
11697
11698 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11699 return NULL;
11700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11702 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011703}
11704
Eric Smith8c663262007-08-25 02:26:07 +000011705PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011707\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011708Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011709
11710static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011711unicode__sizeof__(PyUnicodeObject *v)
11712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 Py_ssize_t size;
11714
11715 /* If it's a compact object, account for base structure +
11716 character data. */
11717 if (PyUnicode_IS_COMPACT_ASCII(v))
11718 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11719 else if (PyUnicode_IS_COMPACT(v))
11720 size = sizeof(PyCompactUnicodeObject) +
11721 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11722 else {
11723 /* If it is a two-block object, account for base object, and
11724 for character block if present. */
11725 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011726 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 size += (PyUnicode_GET_LENGTH(v) + 1) *
11728 PyUnicode_CHARACTER_SIZE(v);
11729 }
11730 /* If the wstr pointer is present, account for it unless it is shared
11731 with the data pointer. Since PyUnicode_DATA will crash if the object
11732 is not ready, check whether it's either not ready (in which case the
11733 data is entirely in wstr) or if the data is not shared. */
11734 if (_PyUnicode_WSTR(v) &&
11735 (!PyUnicode_IS_READY(v) ||
11736 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11737 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011738 if (!PyUnicode_IS_COMPACT_ASCII(v)
11739 && _PyUnicode_UTF8(v)
11740 && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11741 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742
11743 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011744}
11745
11746PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011748
11749static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011750unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011751{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011752 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (!copy)
11754 return NULL;
11755 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011756}
11757
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758static PyMethodDef unicode_methods[] = {
11759
11760 /* Order is according to common usage: often used methods should
11761 appear first, since lookup is done sequentially. */
11762
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011763 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011764 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11765 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011766 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011767 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11768 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11769 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11770 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11771 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11772 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11773 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011774 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011775 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11776 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11777 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011779 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11780 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11781 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011783 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011784 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011785 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011786 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11787 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11788 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11789 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11790 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11791 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11792 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11793 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11794 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11795 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11796 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11797 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11798 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11799 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011800 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011801 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011802 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011803 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011804 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011805 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011806 {"maketrans", (PyCFunction) unicode_maketrans,
11807 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011808 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011809#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011810 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811#endif
11812
11813#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011814 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011815 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816#endif
11817
Benjamin Peterson14339b62009-01-31 16:36:08 +000011818 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 {NULL, NULL}
11820};
11821
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011822static PyObject *
11823unicode_mod(PyObject *v, PyObject *w)
11824{
Brian Curtindfc80e32011-08-10 20:28:54 -050011825 if (!PyUnicode_Check(v))
11826 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011828}
11829
11830static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011831 0, /*nb_add*/
11832 0, /*nb_subtract*/
11833 0, /*nb_multiply*/
11834 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011835};
11836
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 (lenfunc) unicode_length, /* sq_length */
11839 PyUnicode_Concat, /* sq_concat */
11840 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11841 (ssizeargfunc) unicode_getitem, /* sq_item */
11842 0, /* sq_slice */
11843 0, /* sq_ass_item */
11844 0, /* sq_ass_slice */
11845 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846};
11847
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011848static PyObject*
11849unicode_subscript(PyUnicodeObject* self, PyObject* item)
11850{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 if (PyUnicode_READY(self) == -1)
11852 return NULL;
11853
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011854 if (PyIndex_Check(item)) {
11855 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011856 if (i == -1 && PyErr_Occurred())
11857 return NULL;
11858 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011860 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011861 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011862 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011864 Py_UNICODE* result_buf;
11865 PyObject* result;
11866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011869 return NULL;
11870 }
11871
11872 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 return PyUnicode_New(0, 0);
11874 } else if (start == 0 && step == 1 &&
11875 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011876 PyUnicode_CheckExact(self)) {
11877 Py_INCREF(self);
11878 return (PyObject *)self;
11879 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011880 return PyUnicode_Substring((PyObject*)self,
11881 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011882 } else {
11883 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011884 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11885 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011886
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 if (result_buf == NULL)
11888 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011889
11890 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11891 result_buf[i] = source_buf[cur];
11892 }
Tim Petersced69f82003-09-16 20:30:58 +000011893
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011894 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011895 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011896 return result;
11897 }
11898 } else {
11899 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11900 return NULL;
11901 }
11902}
11903
11904static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011905 (lenfunc)unicode_length, /* mp_length */
11906 (binaryfunc)unicode_subscript, /* mp_subscript */
11907 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011908};
11909
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911/* Helpers for PyUnicode_Format() */
11912
11913static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011914getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011916 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 (*p_argidx)++;
11919 if (arglen < 0)
11920 return args;
11921 else
11922 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 }
11924 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 return NULL;
11927}
11928
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011929/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011931static PyObject *
11932formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011934 char *p;
11935 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011937
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 x = PyFloat_AsDouble(v);
11939 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011940 return NULL;
11941
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011944
Eric Smith0923d1d2009-04-16 20:16:10 +000011945 p = PyOS_double_to_string(x, type, prec,
11946 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011947 if (p == NULL)
11948 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011950 PyMem_Free(p);
11951 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
Tim Peters38fd5b62000-09-21 05:43:11 +000011954static PyObject*
11955formatlong(PyObject *val, int flags, int prec, int type)
11956{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011957 char *buf;
11958 int len;
11959 PyObject *str; /* temporary string object. */
11960 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011961
Benjamin Peterson14339b62009-01-31 16:36:08 +000011962 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11963 if (!str)
11964 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 Py_DECREF(str);
11967 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011968}
11969
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011972 size_t buflen,
11973 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011975 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011976 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (PyUnicode_GET_LENGTH(v) == 1) {
11978 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 buf[1] = '\0';
11980 return 1;
11981 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 goto onError;
11983 }
11984 else {
11985 /* Integer input truncated to a character */
11986 long x;
11987 x = PyLong_AsLong(v);
11988 if (x == -1 && PyErr_Occurred())
11989 goto onError;
11990
11991 if (x < 0 || x > 0x10ffff) {
11992 PyErr_SetString(PyExc_OverflowError,
11993 "%c arg not in range(0x110000)");
11994 return -1;
11995 }
11996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 buf[1] = '\0';
11999 return 1;
12000 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012001
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012003 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012005 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006}
12007
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it stays a single operand
   wherever the macro is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012012
Alexander Belopolsky40018472011-02-26 01:02:56 +000012013PyObject *
12014PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 void *fmt;
12017 int fmtkind;
12018 PyObject *result;
12019 Py_UCS4 *res, *res0;
12020 Py_UCS4 max;
12021 int kind;
12022 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012026
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 PyErr_BadInternalCall();
12029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12032 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 fmt = PyUnicode_DATA(uformat);
12035 fmtkind = PyUnicode_KIND(uformat);
12036 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12037 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
12039 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12041 if (res0 == NULL) {
12042 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
12046 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 arglen = PyTuple_Size(args);
12048 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
12050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 arglen = -1;
12052 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012054 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012055 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057
12058 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 if (--rescnt < 0) {
12061 rescnt = fmtcnt + 100;
12062 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12064 if (res0 == NULL){
12065 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 }
12068 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012072 }
12073 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 /* Got a format specifier */
12075 int flags = 0;
12076 Py_ssize_t width = -1;
12077 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 Py_UCS4 c = '\0';
12079 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 int isnumok;
12081 PyObject *v = NULL;
12082 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 void *pbuf;
12084 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 Py_ssize_t len, len1;
12087 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 fmtpos++;
12090 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12091 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 Py_ssize_t keylen;
12093 PyObject *key;
12094 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012095
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 if (dict == NULL) {
12097 PyErr_SetString(PyExc_TypeError,
12098 "format requires a mapping");
12099 goto onError;
12100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 /* Skip over balanced parentheses */
12105 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 if (fmtcnt < 0 || pcount > 0) {
12114 PyErr_SetString(PyExc_ValueError,
12115 "incomplete format key");
12116 goto onError;
12117 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012118 key = PyUnicode_Substring((PyObject*)uformat,
12119 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 if (key == NULL)
12121 goto onError;
12122 if (args_owned) {
12123 Py_DECREF(args);
12124 args_owned = 0;
12125 }
12126 args = PyObject_GetItem(dict, key);
12127 Py_DECREF(key);
12128 if (args == NULL) {
12129 goto onError;
12130 }
12131 args_owned = 1;
12132 arglen = -1;
12133 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 case '-': flags |= F_LJUST; continue;
12138 case '+': flags |= F_SIGN; continue;
12139 case ' ': flags |= F_BLANK; continue;
12140 case '#': flags |= F_ALT; continue;
12141 case '0': flags |= F_ZERO; continue;
12142 }
12143 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 if (c == '*') {
12146 v = getnextarg(args, arglen, &argidx);
12147 if (v == NULL)
12148 goto onError;
12149 if (!PyLong_Check(v)) {
12150 PyErr_SetString(PyExc_TypeError,
12151 "* wants int");
12152 goto onError;
12153 }
12154 width = PyLong_AsLong(v);
12155 if (width == -1 && PyErr_Occurred())
12156 goto onError;
12157 if (width < 0) {
12158 flags |= F_LJUST;
12159 width = -width;
12160 }
12161 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 }
12164 else if (c >= '0' && c <= '9') {
12165 width = c - '0';
12166 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 if (c < '0' || c > '9')
12169 break;
12170 if ((width*10) / 10 != width) {
12171 PyErr_SetString(PyExc_ValueError,
12172 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012173 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 }
12175 width = width*10 + (c - '0');
12176 }
12177 }
12178 if (c == '.') {
12179 prec = 0;
12180 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 if (c == '*') {
12183 v = getnextarg(args, arglen, &argidx);
12184 if (v == NULL)
12185 goto onError;
12186 if (!PyLong_Check(v)) {
12187 PyErr_SetString(PyExc_TypeError,
12188 "* wants int");
12189 goto onError;
12190 }
12191 prec = PyLong_AsLong(v);
12192 if (prec == -1 && PyErr_Occurred())
12193 goto onError;
12194 if (prec < 0)
12195 prec = 0;
12196 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012198 }
12199 else if (c >= '0' && c <= '9') {
12200 prec = c - '0';
12201 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 if (c < '0' || c > '9')
12204 break;
12205 if ((prec*10) / 10 != prec) {
12206 PyErr_SetString(PyExc_ValueError,
12207 "prec too big");
12208 goto onError;
12209 }
12210 prec = prec*10 + (c - '0');
12211 }
12212 }
12213 } /* prec */
12214 if (fmtcnt >= 0) {
12215 if (c == 'h' || c == 'l' || c == 'L') {
12216 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 }
12219 }
12220 if (fmtcnt < 0) {
12221 PyErr_SetString(PyExc_ValueError,
12222 "incomplete format");
12223 goto onError;
12224 }
12225 if (c != '%') {
12226 v = getnextarg(args, arglen, &argidx);
12227 if (v == NULL)
12228 goto onError;
12229 }
12230 sign = 0;
12231 fill = ' ';
12232 switch (c) {
12233
12234 case '%':
12235 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 len = 1;
12240 break;
12241
12242 case 's':
12243 case 'r':
12244 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012245 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 temp = v;
12247 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012248 }
12249 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 if (c == 's')
12251 temp = PyObject_Str(v);
12252 else if (c == 'r')
12253 temp = PyObject_Repr(v);
12254 else
12255 temp = PyObject_ASCII(v);
12256 if (temp == NULL)
12257 goto onError;
12258 if (PyUnicode_Check(temp))
12259 /* nothing to do */;
12260 else {
12261 Py_DECREF(temp);
12262 PyErr_SetString(PyExc_TypeError,
12263 "%s argument has non-string str()");
12264 goto onError;
12265 }
12266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 if (PyUnicode_READY(temp) == -1) {
12268 Py_CLEAR(temp);
12269 goto onError;
12270 }
12271 pbuf = PyUnicode_DATA(temp);
12272 kind = PyUnicode_KIND(temp);
12273 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 if (prec >= 0 && len > prec)
12275 len = prec;
12276 break;
12277
12278 case 'i':
12279 case 'd':
12280 case 'u':
12281 case 'o':
12282 case 'x':
12283 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 isnumok = 0;
12285 if (PyNumber_Check(v)) {
12286 PyObject *iobj=NULL;
12287
12288 if (PyLong_Check(v)) {
12289 iobj = v;
12290 Py_INCREF(iobj);
12291 }
12292 else {
12293 iobj = PyNumber_Long(v);
12294 }
12295 if (iobj!=NULL) {
12296 if (PyLong_Check(iobj)) {
12297 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012298 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 Py_DECREF(iobj);
12300 if (!temp)
12301 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (PyUnicode_READY(temp) == -1) {
12303 Py_CLEAR(temp);
12304 goto onError;
12305 }
12306 pbuf = PyUnicode_DATA(temp);
12307 kind = PyUnicode_KIND(temp);
12308 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 sign = 1;
12310 }
12311 else {
12312 Py_DECREF(iobj);
12313 }
12314 }
12315 }
12316 if (!isnumok) {
12317 PyErr_Format(PyExc_TypeError,
12318 "%%%c format: a number is required, "
12319 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12320 goto onError;
12321 }
12322 if (flags & F_ZERO)
12323 fill = '0';
12324 break;
12325
12326 case 'e':
12327 case 'E':
12328 case 'f':
12329 case 'F':
12330 case 'g':
12331 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012332 temp = formatfloat(v, flags, prec, c);
12333 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 if (PyUnicode_READY(temp) == -1) {
12336 Py_CLEAR(temp);
12337 goto onError;
12338 }
12339 pbuf = PyUnicode_DATA(temp);
12340 kind = PyUnicode_KIND(temp);
12341 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 sign = 1;
12343 if (flags & F_ZERO)
12344 fill = '0';
12345 break;
12346
12347 case 'c':
12348 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012350 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 if (len < 0)
12352 goto onError;
12353 break;
12354
12355 default:
12356 PyErr_Format(PyExc_ValueError,
12357 "unsupported format character '%c' (0x%x) "
12358 "at index %zd",
12359 (31<=c && c<=126) ? (char)c : '?',
12360 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 goto onError;
12363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 /* pbuf is initialized here. */
12365 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12368 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12369 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 len--;
12371 }
12372 else if (flags & F_SIGN)
12373 sign = '+';
12374 else if (flags & F_BLANK)
12375 sign = ' ';
12376 else
12377 sign = 0;
12378 }
12379 if (width < len)
12380 width = len;
12381 if (rescnt - (sign != 0) < width) {
12382 reslen -= rescnt;
12383 rescnt = width + fmtcnt + 100;
12384 reslen += rescnt;
12385 if (reslen < 0) {
12386 Py_XDECREF(temp);
12387 PyErr_NoMemory();
12388 goto onError;
12389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12391 if (res0 == 0) {
12392 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 Py_XDECREF(temp);
12394 goto onError;
12395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 }
12398 if (sign) {
12399 if (fill != ' ')
12400 *res++ = sign;
12401 rescnt--;
12402 if (width > len)
12403 width--;
12404 }
12405 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12407 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12410 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 }
12412 rescnt -= 2;
12413 width -= 2;
12414 if (width < 0)
12415 width = 0;
12416 len -= 2;
12417 }
12418 if (width > len && !(flags & F_LJUST)) {
12419 do {
12420 --rescnt;
12421 *res++ = fill;
12422 } while (--width > len);
12423 }
12424 if (fill == ' ') {
12425 if (sign)
12426 *res++ = sign;
12427 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12429 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12430 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12431 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 }
12433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 /* Copy all characters, preserving len */
12435 len1 = len;
12436 while (len1--) {
12437 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12438 rescnt--;
12439 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 while (--width >= len) {
12441 --rescnt;
12442 *res++ = ' ';
12443 }
12444 if (dict && (argidx < arglen) && c != '%') {
12445 PyErr_SetString(PyExc_TypeError,
12446 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012447 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 goto onError;
12449 }
12450 Py_XDECREF(temp);
12451 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452 } /* until end */
12453 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 PyErr_SetString(PyExc_TypeError,
12455 "not all arguments converted during string formatting");
12456 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457 }
12458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459
12460 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12461 if (*res > max)
12462 max = *res;
12463 result = PyUnicode_New(reslen - rescnt, max);
12464 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012465 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 kind = PyUnicode_KIND(result);
12467 for (res = res0; res < res0+reslen-rescnt; res++)
12468 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12469 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472 }
12473 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474 return (PyObject *)result;
12475
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478 Py_DECREF(uformat);
12479 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481 }
12482 return NULL;
12483}
12484
Jeremy Hylton938ace62002-07-17 16:30:39 +000012485static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012486unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12487
Tim Peters6d6c1a32001-08-02 04:15:00 +000012488static PyObject *
12489unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12490{
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 static char *kwlist[] = {"object", "encoding", "errors", 0};
12493 char *encoding = NULL;
12494 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012495
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 if (type != &PyUnicode_Type)
12497 return unicode_subtype_new(type, args, kwds);
12498 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 return NULL;
12501 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012503 if (encoding == NULL && errors == NULL)
12504 return PyObject_Str(x);
12505 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012507}
12508
Guido van Rossume023fe02001-08-30 03:12:59 +000012509static PyObject *
12510unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12511{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012512 PyUnicodeObject *unicode, *self;
12513 Py_ssize_t length, char_size;
12514 int share_wstr, share_utf8;
12515 unsigned int kind;
12516 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012517
Benjamin Peterson14339b62009-01-31 16:36:08 +000012518 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012519
12520 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12521 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012522 return NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012523 assert(PyUnicode_Check(unicode));
12524 if (PyUnicode_READY(unicode))
12525 return NULL;
12526
12527 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12528 if (self == NULL) {
12529 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530 return NULL;
12531 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012532 kind = PyUnicode_KIND(unicode);
12533 length = PyUnicode_GET_LENGTH(unicode);
12534
12535 _PyUnicode_LENGTH(self) = length;
12536 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12537 _PyUnicode_STATE(self).interned = 0;
12538 _PyUnicode_STATE(self).kind = kind;
12539 _PyUnicode_STATE(self).compact = 0;
12540 _PyUnicode_STATE(self).ascii = 0;
12541 _PyUnicode_STATE(self).ready = 1;
12542 _PyUnicode_WSTR(self) = NULL;
12543 _PyUnicode_UTF8_LENGTH(self) = 0;
12544 _PyUnicode_UTF8(self) = NULL;
12545 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012546 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012547
12548 share_utf8 = 0;
12549 share_wstr = 0;
12550 if (kind == PyUnicode_1BYTE_KIND) {
12551 char_size = 1;
12552 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12553 share_utf8 = 1;
12554 }
12555 else if (kind == PyUnicode_2BYTE_KIND) {
12556 char_size = 2;
12557 if (sizeof(wchar_t) == 2)
12558 share_wstr = 1;
12559 }
12560 else {
12561 assert(kind == PyUnicode_4BYTE_KIND);
12562 char_size = 4;
12563 if (sizeof(wchar_t) == 4)
12564 share_wstr = 1;
12565 }
12566
12567 /* Ensure we won't overflow the length. */
12568 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12569 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012571 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012572 data = PyObject_MALLOC((length + 1) * char_size);
12573 if (data == NULL) {
12574 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 goto onError;
12576 }
12577
Victor Stinnerc3c74152011-10-02 20:39:55 +020012578 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012579 if (share_utf8) {
12580 _PyUnicode_UTF8_LENGTH(self) = length;
12581 _PyUnicode_UTF8(self) = data;
12582 }
12583 if (share_wstr) {
12584 _PyUnicode_WSTR_LENGTH(self) = length;
12585 _PyUnicode_WSTR(self) = (wchar_t *)data;
12586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012588 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12589 PyUnicode_KIND_SIZE(kind, length + 1));
12590 Py_DECREF(unicode);
12591 return (PyObject *)self;
12592
12593onError:
12594 Py_DECREF(unicode);
12595 Py_DECREF(self);
12596 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012597}
12598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012599PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012601\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012602Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012603encoding defaults to the current default string encoding.\n\
12604errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012605
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012606static PyObject *unicode_iter(PyObject *seq);
12607
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012609 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012610 "str", /* tp_name */
12611 sizeof(PyUnicodeObject), /* tp_size */
12612 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 (destructor)unicode_dealloc, /* tp_dealloc */
12615 0, /* tp_print */
12616 0, /* tp_getattr */
12617 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012618 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012619 unicode_repr, /* tp_repr */
12620 &unicode_as_number, /* tp_as_number */
12621 &unicode_as_sequence, /* tp_as_sequence */
12622 &unicode_as_mapping, /* tp_as_mapping */
12623 (hashfunc) unicode_hash, /* tp_hash*/
12624 0, /* tp_call*/
12625 (reprfunc) unicode_str, /* tp_str */
12626 PyObject_GenericGetAttr, /* tp_getattro */
12627 0, /* tp_setattro */
12628 0, /* tp_as_buffer */
12629 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012631 unicode_doc, /* tp_doc */
12632 0, /* tp_traverse */
12633 0, /* tp_clear */
12634 PyUnicode_RichCompare, /* tp_richcompare */
12635 0, /* tp_weaklistoffset */
12636 unicode_iter, /* tp_iter */
12637 0, /* tp_iternext */
12638 unicode_methods, /* tp_methods */
12639 0, /* tp_members */
12640 0, /* tp_getset */
12641 &PyBaseObject_Type, /* tp_base */
12642 0, /* tp_dict */
12643 0, /* tp_descr_get */
12644 0, /* tp_descr_set */
12645 0, /* tp_dictoffset */
12646 0, /* tp_init */
12647 0, /* tp_alloc */
12648 unicode_new, /* tp_new */
12649 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650};
12651
12652/* Initialize the Unicode implementation */
12653
Thomas Wouters78890102000-07-22 19:25:51 +000012654void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012656 int i;
12657
Thomas Wouters477c8d52006-05-27 19:21:47 +000012658 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660 0x000A, /* LINE FEED */
12661 0x000D, /* CARRIAGE RETURN */
12662 0x001C, /* FILE SEPARATOR */
12663 0x001D, /* GROUP SEPARATOR */
12664 0x001E, /* RECORD SEPARATOR */
12665 0x0085, /* NEXT LINE */
12666 0x2028, /* LINE SEPARATOR */
12667 0x2029, /* PARAGRAPH SEPARATOR */
12668 };
12669
Fred Drakee4315f52000-05-09 19:53:39 +000012670 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012671 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012672 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012674
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012675 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012677 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012679
12680 /* initialize the linebreak bloom filter */
12681 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012683 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012684
12685 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686}
12687
12688/* Finalize the Unicode implementation */
12689
/* Report how many cached objects were released.  This implementation
   releases nothing and always reports zero. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12695
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696void
Thomas Wouters78890102000-07-22 19:25:51 +000012697_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012699 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012701 Py_XDECREF(unicode_empty);
12702 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012703
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012704 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 if (unicode_latin1[i]) {
12706 Py_DECREF(unicode_latin1[i]);
12707 unicode_latin1[i] = NULL;
12708 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012709 }
Christian Heimesa156e092008-02-16 07:38:31 +000012710 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012712
Walter Dörwald16807132007-05-25 13:52:07 +000012713void
12714PyUnicode_InternInPlace(PyObject **p)
12715{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012716 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12717 PyObject *t;
12718 if (s == NULL || !PyUnicode_Check(s))
12719 Py_FatalError(
12720 "PyUnicode_InternInPlace: unicode strings only please!");
12721 /* If it's a subclass, we don't really know what putting
12722 it in the interned dict might do. */
12723 if (!PyUnicode_CheckExact(s))
12724 return;
12725 if (PyUnicode_CHECK_INTERNED(s))
12726 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 if (PyUnicode_READY(s) == -1) {
12728 assert(0 && "ready fail in intern...");
12729 return;
12730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 if (interned == NULL) {
12732 interned = PyDict_New();
12733 if (interned == NULL) {
12734 PyErr_Clear(); /* Don't leave an exception */
12735 return;
12736 }
12737 }
12738 /* It might be that the GetItem call fails even
12739 though the key is present in the dictionary,
12740 namely when this happens during a stack overflow. */
12741 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012744
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 if (t) {
12746 Py_INCREF(t);
12747 Py_DECREF(*p);
12748 *p = t;
12749 return;
12750 }
Walter Dörwald16807132007-05-25 13:52:07 +000012751
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 PyThreadState_GET()->recursion_critical = 1;
12753 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12754 PyErr_Clear();
12755 PyThreadState_GET()->recursion_critical = 0;
12756 return;
12757 }
12758 PyThreadState_GET()->recursion_critical = 0;
12759 /* The two references in interned are not counted by refcnt.
12760 The deallocator will take care of this */
12761 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012763}
12764
12765void
12766PyUnicode_InternImmortal(PyObject **p)
12767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12769
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 PyUnicode_InternInPlace(p);
12771 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012773 Py_INCREF(*p);
12774 }
Walter Dörwald16807132007-05-25 13:52:07 +000012775}
12776
12777PyObject *
12778PyUnicode_InternFromString(const char *cp)
12779{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 PyObject *s = PyUnicode_FromString(cp);
12781 if (s == NULL)
12782 return NULL;
12783 PyUnicode_InternInPlace(&s);
12784 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012785}
12786
Alexander Belopolsky40018472011-02-26 01:02:56 +000012787void
12788_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012789{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 PyObject *keys;
12791 PyUnicodeObject *s;
12792 Py_ssize_t i, n;
12793 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012794
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 if (interned == NULL || !PyDict_Check(interned))
12796 return;
12797 keys = PyDict_Keys(interned);
12798 if (keys == NULL || !PyList_Check(keys)) {
12799 PyErr_Clear();
12800 return;
12801 }
Walter Dörwald16807132007-05-25 13:52:07 +000012802
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12804 detector, interned unicode strings are not forcibly deallocated;
12805 rather, we give them their stolen references back, and then clear
12806 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012807
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 n = PyList_GET_SIZE(keys);
12809 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012811 for (i = 0; i < n; i++) {
12812 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 if (PyUnicode_READY(s) == -1)
12814 fprintf(stderr, "could not ready string\n");
12815 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816 case SSTATE_NOT_INTERNED:
12817 /* XXX Shouldn't happen */
12818 break;
12819 case SSTATE_INTERNED_IMMORTAL:
12820 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 break;
12823 case SSTATE_INTERNED_MORTAL:
12824 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 break;
12827 default:
12828 Py_FatalError("Inconsistent interned string state.");
12829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 }
12832 fprintf(stderr, "total size of all interned strings: "
12833 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12834 "mortal/immortal\n", mortal_size, immortal_size);
12835 Py_DECREF(keys);
12836 PyDict_Clear(interned);
12837 Py_DECREF(interned);
12838 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012839}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012840
12841
12842/********************* Unicode Iterator **************************/
12843
12844typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 PyObject_HEAD
12846 Py_ssize_t it_index;
12847 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012848} unicodeiterobject;
12849
12850static void
12851unicodeiter_dealloc(unicodeiterobject *it)
12852{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 _PyObject_GC_UNTRACK(it);
12854 Py_XDECREF(it->it_seq);
12855 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012856}
12857
12858static int
12859unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12860{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012861 Py_VISIT(it->it_seq);
12862 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012863}
12864
12865static PyObject *
12866unicodeiter_next(unicodeiterobject *it)
12867{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012868 PyUnicodeObject *seq;
12869 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012870
Benjamin Peterson14339b62009-01-31 16:36:08 +000012871 assert(it != NULL);
12872 seq = it->it_seq;
12873 if (seq == NULL)
12874 return NULL;
12875 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12878 int kind = PyUnicode_KIND(seq);
12879 void *data = PyUnicode_DATA(seq);
12880 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12881 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 if (item != NULL)
12883 ++it->it_index;
12884 return item;
12885 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012886
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 Py_DECREF(seq);
12888 it->it_seq = NULL;
12889 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012890}
12891
12892static PyObject *
12893unicodeiter_len(unicodeiterobject *it)
12894{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012895 Py_ssize_t len = 0;
12896 if (it->it_seq)
12897 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12898 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012899}
12900
12901PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12902
12903static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012904 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012906 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012907};
12908
12909PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012910 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12911 "str_iterator", /* tp_name */
12912 sizeof(unicodeiterobject), /* tp_basicsize */
12913 0, /* tp_itemsize */
12914 /* methods */
12915 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12916 0, /* tp_print */
12917 0, /* tp_getattr */
12918 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012919 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012920 0, /* tp_repr */
12921 0, /* tp_as_number */
12922 0, /* tp_as_sequence */
12923 0, /* tp_as_mapping */
12924 0, /* tp_hash */
12925 0, /* tp_call */
12926 0, /* tp_str */
12927 PyObject_GenericGetAttr, /* tp_getattro */
12928 0, /* tp_setattro */
12929 0, /* tp_as_buffer */
12930 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12931 0, /* tp_doc */
12932 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12933 0, /* tp_clear */
12934 0, /* tp_richcompare */
12935 0, /* tp_weaklistoffset */
12936 PyObject_SelfIter, /* tp_iter */
12937 (iternextfunc)unicodeiter_next, /* tp_iternext */
12938 unicodeiter_methods, /* tp_methods */
12939 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012940};
12941
12942static PyObject *
12943unicode_iter(PyObject *seq)
12944{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012945 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012946
Benjamin Peterson14339b62009-01-31 16:36:08 +000012947 if (!PyUnicode_Check(seq)) {
12948 PyErr_BadInternalCall();
12949 return NULL;
12950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 if (PyUnicode_READY(seq) == -1)
12952 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012953 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12954 if (it == NULL)
12955 return NULL;
12956 it->it_index = 0;
12957 Py_INCREF(seq);
12958 it->it_seq = (PyUnicodeObject *)seq;
12959 _PyObject_GC_TRACK(it);
12960 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012961}
12962
/* Include "uniops.h" twice, once per character type.  Before each
   inclusion UNIOP(x) is defined to paste a type-specific prefix onto x
   (Py_UNICODE_##x first, Py_UCS4_##x second) and UNIOP_t names the
   matching element type; both macros are #undef'd after each inclusion
   so they can be redefined for the next one.
   NOTE(review): presumably uniops.h uses UNIOP()/UNIOP_t to name and
   type the helpers it defines -- confirm against that header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012973
Victor Stinner71133ff2010-09-01 23:43:53 +000012974Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012975PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012976{
12977 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12978 Py_UNICODE *copy;
12979 Py_ssize_t size;
12980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 if (!PyUnicode_Check(unicode)) {
12982 PyErr_BadArgument();
12983 return NULL;
12984 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012985 /* Ensure we won't overflow the size. */
12986 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12987 PyErr_NoMemory();
12988 return NULL;
12989 }
12990 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12991 size *= sizeof(Py_UNICODE);
12992 copy = PyMem_Malloc(size);
12993 if (copy == NULL) {
12994 PyErr_NoMemory();
12995 return NULL;
12996 }
12997 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12998 return copy;
12999}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013000
Georg Brandl66c221e2010-10-14 07:04:07 +000013001/* A _string module, to export formatter_parser and formatter_field_name_split
13002 to the string.Formatter class implemented in Python. */
13003
13004static PyMethodDef _string_methods[] = {
13005 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13006 METH_O, PyDoc_STR("split the argument as a field name")},
13007 {"formatter_parser", (PyCFunction) formatter_parser,
13008 METH_O, PyDoc_STR("parse the argument as a format string")},
13009 {NULL, NULL}
13010};
13011
13012static struct PyModuleDef _string_module = {
13013 PyModuleDef_HEAD_INIT,
13014 "_string",
13015 PyDoc_STR("string helper module"),
13016 0,
13017 _string_methods,
13018 NULL,
13019 NULL,
13020 NULL,
13021 NULL
13022};
13023
13024PyMODINIT_FUNC
13025PyInit__string(void)
13026{
13027 return PyModule_Create(&_string_module);
13028}
13029
13030
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013031#ifdef __cplusplus
13032}
13033#endif