blob: 74fb7ce8ffd6e0398f02dcdb292dca3e66e993c8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters
   (end is exclusive).  to is a pointer to the destination buffer; it is
   cast to "to_type *" and must have room for (end - begin) characters.

   Every source character is converted with a plain cast to to_type, so
   values that do not fit into to_type are truncated.

   Each of begin, end and to is evaluated exactly once, which makes it safe
   to pass expressions that are expensive or that have side effects. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *conv_src_ = (begin);                        \
        const from_type *const conv_end_ = (end);                    \
        to_type *conv_dst_ = (to_type *)(to);                        \
        for (; conv_src_ < conv_end_; ++conv_src_, ++conv_dst_) {    \
            *conv_dst_ = (to_type)*conv_src_;                        \
        }                                                            \
    } while (0)
107
/* Direct accessors for the header fields of a unicode object.  Each one
   casts op to the structure type that declares the field and expands to an
   lvalue, so it can be used for reading as well as for assignment. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked variants: these assert that op passes PyUnicode_Check() before
   reading the field. */
#define _PyUnicode_KIND(op)                     \
    (assert(PyUnicode_Check(op)),               \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(PyUnicode_Check(op)),               \
     ((PyASCIIObject *)(op))->length)
119
120
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters.

   The table has one entry per ASCII code point (128 entries, 16 per row);
   an entry is 1 when the code point is whitespace and 0 otherwise.  The
   whitespace code points are:

     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Fast detection of the ASCII line break characters.

   The table has one entry per ASCII code point (128 entries, 16 per row);
   an entry is 1 when the code point is a line break and 0 otherwise.  The
   line break code points are:

     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN

   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
225/* stuff to implement simple "bloom filters" for Unicode characters.
226 to keep things simple, we use a single bitmask, using the least 5
227 bits from each unicode characters as the bit index. */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241#define BLOOM_MASK unsigned long
242
243static BLOOM_MASK bloom_linebreak;
244
Antoine Pitrouf068f942010-01-13 14:19:12 +0000245#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
246#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247
Benjamin Peterson29060642009-01-31 22:14:21 +0000248#define BLOOM_LINEBREAK(ch) \
249 ((ch) < 128U ? ascii_linebreak[(ch)] : \
250 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
/* Test whether chr occurs in str.  The bloom mask is checked first;
   PyUnicode_FindChar() is only called when the mask reports a possible
   hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
363/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000364 Ux0000 terminated; some code (e.g. new_identifier)
365 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372#ifdef Py_DEBUG
373int unicode_old_new_calls = 0;
374#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* The functions below wrap accessor macros so that they can be called
   from a debugger. */

/* Wrapper around the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = _PyUnicode_UTF8(unicode);
    return utf8;
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode)
{
    void *after_ascii_header = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact_header = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii_header);
    printf("compact op %p\n", after_compact_header);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.  A surrogate that is not part of a valid pair is copied
   through unchanged.

   The characters in [begin, end) are written to the 4-byte data buffer of
   unicode.  This function assumes that unicode can hold one more code point
   than wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* high surrogate immediately followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 unsigned int from_kind;
615 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200625 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200626 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627 PyErr_Format(PyExc_ValueError,
628 "Cannot write %zi characters at %zi "
629 "in a string of %zi characters",
630 how_many, to_start, PyUnicode_GET_LENGTH(to));
631 return -1;
632 }
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 from_kind = PyUnicode_KIND(from);
635 to_kind = PyUnicode_KIND(to);
636
637 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 /* fast path */
639 Py_MEMCPY((char*)PyUnicode_DATA(to)
640 + PyUnicode_KIND_SIZE(to_kind, to_start),
641 (char*)PyUnicode_DATA(from)
642 + PyUnicode_KIND_SIZE(from_kind, from_start),
643 PyUnicode_KIND_SIZE(to_kind, how_many));
644 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200646
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647 if (from_kind > to_kind) {
648 /* slow path to check for character overflow */
649 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650 void *from_data = PyUnicode_DATA(from);
651 void *to_data = PyUnicode_DATA(to);
652 Py_UCS4 ch, maxchar;
653 Py_ssize_t i;
654 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 maxchar = 0;
Victor Stinner73f01c62011-09-28 22:28:04 +0200657 overflow = 0;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200658 for (i=0; i < how_many; i++) {
659 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
660 if (ch > maxchar) {
661 maxchar = ch;
662 if (maxchar > to_maxchar) {
663 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
668 }
669 if (!overflow)
670 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
673 {
674 _PyUnicode_CONVERT_BYTES(
675 Py_UCS1, Py_UCS2,
676 PyUnicode_1BYTE_DATA(from) + from_start,
677 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
678 PyUnicode_2BYTE_DATA(to) + to_start
679 );
680 return how_many;
681 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200682 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200683 && to_kind == PyUnicode_4BYTE_KIND)
684 {
685 _PyUnicode_CONVERT_BYTES(
686 Py_UCS1, Py_UCS4,
687 PyUnicode_1BYTE_DATA(from) + from_start,
688 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
689 PyUnicode_4BYTE_DATA(to) + to_start
690 );
691 return how_many;
692 }
693 else if (from_kind == PyUnicode_2BYTE_KIND
694 && to_kind == PyUnicode_4BYTE_KIND)
695 {
696 _PyUnicode_CONVERT_BYTES(
697 Py_UCS2, Py_UCS4,
698 PyUnicode_2BYTE_DATA(from) + from_start,
699 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
700 PyUnicode_4BYTE_DATA(to) + to_start
701 );
702 return how_many;
703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200705 "Cannot copy UCS%u characters "
706 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200707 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200708 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 return -1;
710}
711
Victor Stinner17222162011-09-28 22:15:37 +0200712/* Find the maximum code point and count the number of surrogate pairs so a
713 correct string length can be computed before converting a string to UCS4.
714 This function counts single surrogates as a character and not as a pair.
715
716 Return 0 on success, or -1 on error. */
717static int
718find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
719 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720{
721 const wchar_t *iter;
722
723 if (num_surrogates == NULL || maxchar == NULL) {
724 PyErr_SetString(PyExc_SystemError,
725 "unexpected NULL arguments to "
726 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
727 return -1;
728 }
729
730 *num_surrogates = 0;
731 *maxchar = 0;
732
733 for (iter = begin; iter < end; ) {
734 if (*iter > *maxchar)
735 *maxchar = *iter;
736#if SIZEOF_WCHAR_T == 2
737 if (*iter >= 0xD800 && *iter <= 0xDBFF
738 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
739 {
740 Py_UCS4 surrogate_val;
741 surrogate_val = (((iter[0] & 0x3FF)<<10)
742 | (iter[1] & 0x3FF)) + 0x10000;
743 ++(*num_surrogates);
744 if (surrogate_val > *maxchar)
745 *maxchar = surrogate_val;
746 iter += 2;
747 }
748 else
749 iter++;
750#else
751 iter++;
752#endif
753 }
754 return 0;
755}
756
757#ifdef Py_DEBUG
758int unicode_ready_calls = 0;
759#endif
760
761int
762_PyUnicode_Ready(PyUnicodeObject *unicode)
763{
764 wchar_t *end;
765 Py_UCS4 maxchar = 0;
766 Py_ssize_t num_surrogates;
767#if SIZEOF_WCHAR_T == 2
768 Py_ssize_t length_wo_surrogates;
769#endif
770
771 assert(PyUnicode_Check(unicode));
772
773 if (unicode->data.any != NULL) {
774 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
775 return 0;
776 }
777
778 /* _PyUnicode_Ready() is only intented for old-style API usage where
779 * strings were created using _PyObject_New() and where no canonical
780 * representation (the str field) has been set yet aka strings
781 * which are not yet ready.
782 */
783 assert(_PyUnicode_WSTR(unicode) != NULL);
784 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
785 assert(!PyUnicode_IS_COMPACT(unicode));
786 assert(!PyUnicode_IS_READY(unicode));
787 /* Actually, it should neither be interned nor be anything else: */
788 assert(_PyUnicode_STATE(unicode).interned == 0);
789 assert(unicode->_base.utf8 == NULL);
790
791#ifdef Py_DEBUG
792 ++unicode_ready_calls;
793#endif
794
795 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200796 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200797 &maxchar,
798 &num_surrogates) == -1) {
799 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
800 return -1;
801 }
802
803 if (maxchar < 256) {
804 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
805 if (!unicode->data.any) {
806 PyErr_NoMemory();
807 return -1;
808 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200809 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 _PyUnicode_WSTR(unicode), end,
811 PyUnicode_1BYTE_DATA(unicode));
812 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
813 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
814 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
815 if (maxchar < 128) {
816 unicode->_base.utf8 = unicode->data.any;
817 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
818 }
819 else {
820 unicode->_base.utf8 = NULL;
821 unicode->_base.utf8_length = 0;
822 }
823 PyObject_FREE(_PyUnicode_WSTR(unicode));
824 _PyUnicode_WSTR(unicode) = NULL;
825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
826 }
827 /* In this case we might have to convert down from 4-byte native
828 wchar_t to 2-byte unicode. */
829 else if (maxchar < 65536) {
830 assert(num_surrogates == 0 &&
831 "FindMaxCharAndNumSurrogatePairs() messed up");
832
833 if (sizeof(wchar_t) == 2) {
834 /* We can share representations and are done. */
835 unicode->data.any = _PyUnicode_WSTR(unicode);
836 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
837 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
838 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
839 unicode->_base.utf8 = NULL;
840 unicode->_base.utf8_length = 0;
841 }
842 else {
843 assert(sizeof(wchar_t) == 4);
844
845 unicode->data.any = PyObject_MALLOC(
846 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
847 if (!unicode->data.any) {
848 PyErr_NoMemory();
849 return -1;
850 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200851 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852 _PyUnicode_WSTR(unicode), end,
853 PyUnicode_2BYTE_DATA(unicode));
854 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
855 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
856 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
857 unicode->_base.utf8 = NULL;
858 unicode->_base.utf8_length = 0;
859 PyObject_FREE(_PyUnicode_WSTR(unicode));
860 _PyUnicode_WSTR(unicode) = NULL;
861 _PyUnicode_WSTR_LENGTH(unicode) = 0;
862 }
863 }
864 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
865 else {
866#if SIZEOF_WCHAR_T == 2
867 /* in case the native representation is 2-bytes, we need to allocate a
868 new normalized 4-byte version. */
869 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
870 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
871 if (!unicode->data.any) {
872 PyErr_NoMemory();
873 return -1;
874 }
875 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
876 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
877 unicode->_base.utf8 = NULL;
878 unicode->_base.utf8_length = 0;
879 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
880 unicode) < 0) {
881 assert(0 && "ConvertWideCharToUCS4 failed");
882 return -1;
883 }
884 PyObject_FREE(_PyUnicode_WSTR(unicode));
885 _PyUnicode_WSTR(unicode) = NULL;
886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
887#else
888 assert(num_surrogates == 0);
889
890 unicode->data.any = _PyUnicode_WSTR(unicode);
891 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
892 unicode->_base.utf8 = NULL;
893 unicode->_base.utf8_length = 0;
894 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
895#endif
896 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
897 }
898 _PyUnicode_STATE(unicode).ready = 1;
899 return 0;
900}
901
Alexander Belopolsky40018472011-02-26 01:02:56 +0000902static void
903unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904{
Walter Dörwald16807132007-05-25 13:52:07 +0000905 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000906 case SSTATE_NOT_INTERNED:
907 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000908
Benjamin Peterson29060642009-01-31 22:14:21 +0000909 case SSTATE_INTERNED_MORTAL:
910 /* revive dead object temporarily for DelItem */
911 Py_REFCNT(unicode) = 3;
912 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
913 Py_FatalError(
914 "deletion of interned string failed");
915 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000916
Benjamin Peterson29060642009-01-31 22:14:21 +0000917 case SSTATE_INTERNED_IMMORTAL:
918 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000919
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 default:
921 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000922 }
923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200924 if (_PyUnicode_WSTR(unicode) &&
925 (!PyUnicode_IS_READY(unicode) ||
926 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
927 PyObject_DEL(_PyUnicode_WSTR(unicode));
928 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
929 PyObject_DEL(unicode->_base.utf8);
930
931 if (PyUnicode_IS_COMPACT(unicode)) {
932 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 }
934 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935 if (unicode->data.any)
936 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000937 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938 }
939}
940
Alexander Belopolsky40018472011-02-26 01:02:56 +0000941static int
942_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000943{
944 register PyUnicodeObject *v;
945
946 /* Argument checks */
947 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000948 PyErr_BadInternalCall();
949 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000950 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000951 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200952 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
953 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000954 PyErr_BadInternalCall();
955 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000956 }
957
958 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200959 possible since these are being shared.
960 The same goes for new-representation unicode objects or objects which
961 have already been readied.
962 For these, we simply return a fresh copy with the same Unicode content.
963 */
964 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
965 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
966 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000967 PyUnicodeObject *w = _PyUnicode_New(length);
968 if (w == NULL)
969 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
971 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000972 Py_DECREF(*unicode);
973 *unicode = w;
974 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000975 }
976
977 /* Note that we don't have to modify *unicode for unshared Unicode
978 objects, since we can modify them in-place. */
979 return unicode_resize(v, length);
980}
981
Alexander Belopolsky40018472011-02-26 01:02:56 +0000982int
983PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000984{
985 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
986}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988static PyObject*
989get_latin1_char(unsigned char ch)
990{
991 PyUnicodeObject *unicode = unicode_latin1[ch];
992 if (!unicode) {
993 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
994 if (!unicode)
995 return NULL;
996 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
997 unicode_latin1[ch] = unicode;
998 }
999 Py_INCREF(unicode);
1000 return (PyObject *)unicode;
1001}
1002
Alexander Belopolsky40018472011-02-26 01:02:56 +00001003PyObject *
1004PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005{
1006 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 Py_UCS4 maxchar = 0;
1008 Py_ssize_t num_surrogates;
1009
1010 if (u == NULL)
1011 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001013 /* If the Unicode data is known at construction time, we can apply
1014 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 /* Optimization for empty strings */
1017 if (size == 0 && unicode_empty != NULL) {
1018 Py_INCREF(unicode_empty);
1019 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001020 }
Tim Petersced69f82003-09-16 20:30:58 +00001021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 /* Single character Unicode objects in the Latin-1 range are
1023 shared when using this constructor */
1024 if (size == 1 && *u < 256)
1025 return get_latin1_char((unsigned char)*u);
1026
1027 /* If not empty and not single character, copy the Unicode data
1028 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001029 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 &num_surrogates) == -1)
1031 return NULL;
1032
1033 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1034 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001035 if (!unicode)
1036 return NULL;
1037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 switch (PyUnicode_KIND(unicode)) {
1039 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001040 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1042 break;
1043 case PyUnicode_2BYTE_KIND:
1044#if Py_UNICODE_SIZE == 2
1045 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1046#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001047 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1049#endif
1050 break;
1051 case PyUnicode_4BYTE_KIND:
1052#if SIZEOF_WCHAR_T == 2
1053 /* This is the only case which has to process surrogates, thus
1054 a simple copy loop is not enough and we need a function. */
1055 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1056 Py_DECREF(unicode);
1057 return NULL;
1058 }
1059#else
1060 assert(num_surrogates == 0);
1061 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1062#endif
1063 break;
1064 default:
1065 assert(0 && "Impossible state");
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067
1068 return (PyObject *)unicode;
1069}
1070
Alexander Belopolsky40018472011-02-26 01:02:56 +00001071PyObject *
1072PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001073{
1074 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001075
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 if (size < 0) {
1077 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001078 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 return NULL;
1080 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001081
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001082 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001083 some optimizations which share commonly used objects.
1084 Also, this means the input must be UTF-8, so fall back to the
1085 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001086 if (u != NULL) {
1087
Benjamin Peterson29060642009-01-31 22:14:21 +00001088 /* Optimization for empty strings */
1089 if (size == 0 && unicode_empty != NULL) {
1090 Py_INCREF(unicode_empty);
1091 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001093
1094 /* Single characters are shared when using this constructor.
1095 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 if (size == 1 && Py_CHARMASK(*u) < 128)
1097 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001098
1099 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001100 }
1101
Walter Dörwald55507312007-05-18 13:12:10 +00001102 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001103 if (!unicode)
1104 return NULL;
1105
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001106 return (PyObject *)unicode;
1107}
1108
Alexander Belopolsky40018472011-02-26 01:02:56 +00001109PyObject *
1110PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001111{
1112 size_t size = strlen(u);
1113 if (size > PY_SSIZE_T_MAX) {
1114 PyErr_SetString(PyExc_OverflowError, "input too long");
1115 return NULL;
1116 }
1117
1118 return PyUnicode_FromStringAndSize(u, size);
1119}
1120
Victor Stinnere57b1c02011-09-28 22:20:48 +02001121static PyObject*
1122_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 PyObject *res;
1125 unsigned char max = 127;
1126 Py_ssize_t i;
1127 for (i = 0; i < size; i++) {
1128 if (u[i] & 0x80) {
1129 max = 255;
1130 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001131 }
1132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133 res = PyUnicode_New(size, max);
1134 if (!res)
1135 return NULL;
1136 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1137 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001138}
1139
Victor Stinnere57b1c02011-09-28 22:20:48 +02001140static PyObject*
1141_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142{
1143 PyObject *res;
1144 Py_UCS2 max = 0;
1145 Py_ssize_t i;
1146 for (i = 0; i < size; i++)
1147 if (u[i] > max)
1148 max = u[i];
1149 res = PyUnicode_New(size, max);
1150 if (!res)
1151 return NULL;
1152 if (max >= 256)
1153 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1154 else
1155 for (i = 0; i < size; i++)
1156 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1157 return res;
1158}
1159
Victor Stinnere57b1c02011-09-28 22:20:48 +02001160static PyObject*
1161_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162{
1163 PyObject *res;
1164 Py_UCS4 max = 0;
1165 Py_ssize_t i;
1166 for (i = 0; i < size; i++)
1167 if (u[i] > max)
1168 max = u[i];
1169 res = PyUnicode_New(size, max);
1170 if (!res)
1171 return NULL;
1172 if (max >= 0x10000)
1173 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1174 else {
1175 int kind = PyUnicode_KIND(res);
1176 void *data = PyUnicode_DATA(res);
1177 for (i = 0; i < size; i++)
1178 PyUnicode_WRITE(kind, data, i, u[i]);
1179 }
1180 return res;
1181}
1182
1183PyObject*
1184PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1185{
1186 switch(kind) {
1187 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001188 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001190 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001192 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001193 }
1194 assert(0);
1195 return NULL;
1196}
1197
1198
1199/* Widen Unicode objects to larger buffers.
1200 Return NULL if the string is too wide already. */
1201
1202void*
1203_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1204{
1205 Py_ssize_t i;
1206 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1207 void *d = PyUnicode_DATA(s);
1208 unsigned int skind = PyUnicode_KIND(s);
1209 if (PyUnicode_KIND(s) >= kind) {
1210 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1211 return NULL;
1212 }
1213 switch(kind) {
1214 case PyUnicode_2BYTE_KIND: {
1215 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1216 if (!result) {
1217 PyErr_NoMemory();
1218 return 0;
1219 }
1220 for (i = 0; i < len; i++)
1221 result[i] = ((Py_UCS1*)d)[i];
1222 return result;
1223 }
1224 case PyUnicode_4BYTE_KIND: {
1225 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1226 if (!result) {
1227 PyErr_NoMemory();
1228 return 0;
1229 }
1230 for (i = 0; i < len; i++)
1231 result[i] = PyUnicode_READ(skind, d, i);
1232 return result;
1233 }
1234 }
1235 Py_FatalError("invalid kind");
1236 return NULL;
1237}
1238
1239static Py_UCS4*
1240as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1241 int copy_null)
1242{
1243 int kind;
1244 void *data;
1245 Py_ssize_t len, targetlen;
1246 if (PyUnicode_READY(string) == -1)
1247 return NULL;
1248 kind = PyUnicode_KIND(string);
1249 data = PyUnicode_DATA(string);
1250 len = PyUnicode_GET_LENGTH(string);
1251 targetlen = len;
1252 if (copy_null)
1253 targetlen++;
1254 if (!target) {
1255 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1256 PyErr_NoMemory();
1257 return NULL;
1258 }
1259 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1260 if (!target) {
1261 PyErr_NoMemory();
1262 return NULL;
1263 }
1264 }
1265 else {
1266 if (targetsize < targetlen) {
1267 PyErr_Format(PyExc_SystemError,
1268 "string is longer than the buffer");
1269 if (copy_null && 0 < targetsize)
1270 target[0] = 0;
1271 return NULL;
1272 }
1273 }
1274 if (kind != PyUnicode_4BYTE_KIND) {
1275 Py_ssize_t i;
1276 for (i = 0; i < len; i++)
1277 target[i] = PyUnicode_READ(kind, data, i);
1278 }
1279 else
1280 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1281 if (copy_null)
1282 target[len] = 0;
1283 return target;
1284}
1285
1286Py_UCS4*
1287PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1288 int copy_null)
1289{
1290 if (target == NULL || targetsize < 1) {
1291 PyErr_BadInternalCall();
1292 return NULL;
1293 }
1294 return as_ucs4(string, target, targetsize, copy_null);
1295}
1296
1297Py_UCS4*
1298PyUnicode_AsUCS4Copy(PyObject *string)
1299{
1300 return as_ucs4(string, NULL, 0, 1);
1301}
1302
#ifdef HAVE_WCHAR_H

/* Create a string from `size` wide characters at `w`.  A size of -1 means
   that `w` is NUL-terminated and its length is taken with wcslen().

   With w == NULL only size == 0 is accepted (an empty string is returned);
   any other size raises the PyErr_BadInternalCall() error.

   Return a new reference, or NULL on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = size;

        if (nchars == -1) {
            nchars = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, nchars);
    }

    /* No buffer given. */
    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001323
Walter Dörwald346737f2007-05-31 10:44:43 +00001324static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001325makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1326 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001327{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001328 *fmt++ = '%';
1329 if (width) {
1330 if (zeropad)
1331 *fmt++ = '0';
1332 fmt += sprintf(fmt, "%d", width);
1333 }
1334 if (precision)
1335 fmt += sprintf(fmt, ".%d", precision);
1336 if (longflag)
1337 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001338 else if (longlongflag) {
1339 /* longlongflag should only ever be nonzero on machines with
1340 HAVE_LONG_LONG defined */
1341#ifdef HAVE_LONG_LONG
1342 char *f = PY_FORMAT_LONG_LONG;
1343 while (*f)
1344 *fmt++ = *f++;
1345#else
1346 /* we shouldn't ever get here */
1347 assert(0);
1348 *fmt++ = 'l';
1349#endif
1350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001351 else if (size_tflag) {
1352 char *f = PY_FORMAT_SIZE_T;
1353 while (*f)
1354 *fmt++ = *f++;
1355 }
1356 *fmt++ = c;
1357 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001358}
1359
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%' at `f`.

   Recognised are an optional decimal width, an optional ".precision", and
   one of the length modifiers "l", "ll" (only with HAVE_LONG_LONG) or "z",
   each of which is only accepted when directly followed by 'd', 'u' or
   'i'.  Every result is stored through its output pointer unless that
   pointer is NULL; values that are absent from the format are reported
   as 0.

   Return a pointer to the first character that was not consumed; for a
   well-formed specification that is the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* Step over the '%', then read width.precision,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Hand the results back; every output is optional. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1424
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001425/* maximum number of characters required for output of %ld. 21 characters
1426 allows for 64-bit integers (in decimal) and an optional sign. */
1427#define MAX_LONG_CHARS 21
1428/* maximum number of characters required for output of %lld.
1429 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1430 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1431#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1432
Walter Dörwaldd2034312007-05-18 16:29:38 +00001433PyObject *
1434PyUnicode_FromFormatV(const char *format, va_list vargs)
1435{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001436 va_list count;
1437 Py_ssize_t callcount = 0;
1438 PyObject **callresults = NULL;
1439 PyObject **callresult = NULL;
1440 Py_ssize_t n = 0;
1441 int width = 0;
1442 int precision = 0;
1443 int zeropad;
1444 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001446 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001447 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1449 Py_UCS4 argmaxchar;
1450 Py_ssize_t numbersize = 0;
1451 char *numberresults = NULL;
1452 char *numberresult = NULL;
1453 Py_ssize_t i;
1454 int kind;
1455 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001456
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001457 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001458 /* step 1: count the number of %S/%R/%A/%s format specifications
1459 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1460 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 * result in an array)
1462 * also estimate an upper bound for all the number formats in the string,
1463 * numbers will be formatted in step 3 and be kept in a '\0'-separated
1464 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001465 for (f = format; *f; f++) {
1466 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001467 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1469 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1470 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1471 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001474#ifdef HAVE_LONG_LONG
1475 if (longlongflag) {
1476 if (width < MAX_LONG_LONG_CHARS)
1477 width = MAX_LONG_LONG_CHARS;
1478 }
1479 else
1480#endif
1481 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1482 including sign. Decimal takes the most space. This
1483 isn't enough for octal. If a width is specified we
1484 need more (which we allocate later). */
1485 if (width < MAX_LONG_CHARS)
1486 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487
1488 /* account for the size + '\0' to separate numbers
1489 inside of the numberresults buffer */
1490 numbersize += (width + 1);
1491 }
1492 }
1493 else if ((unsigned char)*f > 127) {
1494 PyErr_Format(PyExc_ValueError,
1495 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1496 "string, got a non-ASCII byte: 0x%02x",
1497 (unsigned char)*f);
1498 return NULL;
1499 }
1500 }
1501 /* step 2: allocate memory for the results of
1502 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1503 if (callcount) {
1504 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1505 if (!callresults) {
1506 PyErr_NoMemory();
1507 return NULL;
1508 }
1509 callresult = callresults;
1510 }
1511 /* step 2.5: allocate memory for the results of formatting numbers */
1512 if (numbersize) {
1513 numberresults = PyObject_Malloc(numbersize);
1514 if (!numberresults) {
1515 PyErr_NoMemory();
1516 goto fail;
1517 }
1518 numberresult = numberresults;
1519 }
1520
1521 /* step 3: format numbers and figure out how large a buffer we need */
1522 for (f = format; *f; f++) {
1523 if (*f == '%') {
1524 const char* p;
1525 int longflag;
1526 int longlongflag;
1527 int size_tflag;
1528 int numprinted;
1529
1530 p = f;
1531 zeropad = (f[1] == '0');
1532 f = parse_format_flags(f, &width, &precision,
1533 &longflag, &longlongflag, &size_tflag);
1534 switch (*f) {
1535 case 'c':
1536 {
1537 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001538 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 n++;
1540 break;
1541 }
1542 case '%':
1543 n++;
1544 break;
1545 case 'i':
1546 case 'd':
1547 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1548 width, precision, *f);
1549 if (longflag)
1550 numprinted = sprintf(numberresult, fmt,
1551 va_arg(count, long));
1552#ifdef HAVE_LONG_LONG
1553 else if (longlongflag)
1554 numprinted = sprintf(numberresult, fmt,
1555 va_arg(count, PY_LONG_LONG));
1556#endif
1557 else if (size_tflag)
1558 numprinted = sprintf(numberresult, fmt,
1559 va_arg(count, Py_ssize_t));
1560 else
1561 numprinted = sprintf(numberresult, fmt,
1562 va_arg(count, int));
1563 n += numprinted;
1564 /* advance by +1 to skip over the '\0' */
1565 numberresult += (numprinted + 1);
1566 assert(*(numberresult - 1) == '\0');
1567 assert(*(numberresult - 2) != '\0');
1568 assert(numprinted >= 0);
1569 assert(numberresult <= numberresults + numbersize);
1570 break;
1571 case 'u':
1572 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1573 width, precision, 'u');
1574 if (longflag)
1575 numprinted = sprintf(numberresult, fmt,
1576 va_arg(count, unsigned long));
1577#ifdef HAVE_LONG_LONG
1578 else if (longlongflag)
1579 numprinted = sprintf(numberresult, fmt,
1580 va_arg(count, unsigned PY_LONG_LONG));
1581#endif
1582 else if (size_tflag)
1583 numprinted = sprintf(numberresult, fmt,
1584 va_arg(count, size_t));
1585 else
1586 numprinted = sprintf(numberresult, fmt,
1587 va_arg(count, unsigned int));
1588 n += numprinted;
1589 numberresult += (numprinted + 1);
1590 assert(*(numberresult - 1) == '\0');
1591 assert(*(numberresult - 2) != '\0');
1592 assert(numprinted >= 0);
1593 assert(numberresult <= numberresults + numbersize);
1594 break;
1595 case 'x':
1596 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1597 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1598 n += numprinted;
1599 numberresult += (numprinted + 1);
1600 assert(*(numberresult - 1) == '\0');
1601 assert(*(numberresult - 2) != '\0');
1602 assert(numprinted >= 0);
1603 assert(numberresult <= numberresults + numbersize);
1604 break;
1605 case 'p':
1606 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1607 /* %p is ill-defined: ensure leading 0x. */
1608 if (numberresult[1] == 'X')
1609 numberresult[1] = 'x';
1610 else if (numberresult[1] != 'x') {
1611 memmove(numberresult + 2, numberresult,
1612 strlen(numberresult) + 1);
1613 numberresult[0] = '0';
1614 numberresult[1] = 'x';
1615 numprinted += 2;
1616 }
1617 n += numprinted;
1618 numberresult += (numprinted + 1);
1619 assert(*(numberresult - 1) == '\0');
1620 assert(*(numberresult - 2) != '\0');
1621 assert(numprinted >= 0);
1622 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001623 break;
1624 case 's':
1625 {
1626 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001627 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001628 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1629 if (!str)
1630 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 /* since PyUnicode_DecodeUTF8 returns already flexible
1632 unicode objects, there is no need to call ready on them */
1633 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001634 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001636 /* Remember the str and switch to the next slot */
1637 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001638 break;
1639 }
1640 case 'U':
1641 {
1642 PyObject *obj = va_arg(count, PyObject *);
1643 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (PyUnicode_READY(obj) == -1)
1645 goto fail;
1646 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001647 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001649 break;
1650 }
1651 case 'V':
1652 {
1653 PyObject *obj = va_arg(count, PyObject *);
1654 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001655 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001656 assert(obj || str);
1657 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001658 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 if (PyUnicode_READY(obj) == -1)
1660 goto fail;
1661 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001662 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001664 *callresult++ = NULL;
1665 }
1666 else {
1667 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1668 if (!str_obj)
1669 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001671 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001673 *callresult++ = str_obj;
1674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001675 break;
1676 }
1677 case 'S':
1678 {
1679 PyObject *obj = va_arg(count, PyObject *);
1680 PyObject *str;
1681 assert(obj);
1682 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001686 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001688 /* Remember the str and switch to the next slot */
1689 *callresult++ = str;
1690 break;
1691 }
1692 case 'R':
1693 {
1694 PyObject *obj = va_arg(count, PyObject *);
1695 PyObject *repr;
1696 assert(obj);
1697 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001701 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001703 /* Remember the repr and switch to the next slot */
1704 *callresult++ = repr;
1705 break;
1706 }
1707 case 'A':
1708 {
1709 PyObject *obj = va_arg(count, PyObject *);
1710 PyObject *ascii;
1711 assert(obj);
1712 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001714 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001716 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 /* Remember the repr and switch to the next slot */
1719 *callresult++ = ascii;
1720 break;
1721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001722 default:
1723 /* if we stumble upon an unknown
1724 formatting code, copy the rest of
1725 the format string to the output
1726 string. (we cannot just skip the
1727 code, since there's no way to know
1728 what's in the argument list) */
1729 n += strlen(p);
1730 goto expand;
1731 }
1732 } else
1733 n++;
1734 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001735 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001738 we don't have to resize the string.
1739 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001741 if (!string)
1742 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 kind = PyUnicode_KIND(string);
1744 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001745 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001749 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001750 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001751
1752 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1754 /* checking for == because the last argument could be a empty
1755 string, which causes i to point to end, the assert at the end of
1756 the loop */
1757 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001758
Benjamin Peterson14339b62009-01-31 16:36:08 +00001759 switch (*f) {
1760 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001761 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 const int ordinal = va_arg(vargs, int);
1763 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001765 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001766 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 case 'p':
1771 /* unused, since we already have the result */
1772 if (*f == 'p')
1773 (void) va_arg(vargs, void *);
1774 else
1775 (void) va_arg(vargs, int);
1776 /* extract the result from numberresults and append. */
1777 for (; *numberresult; ++i, ++numberresult)
1778 PyUnicode_WRITE(kind, data, i, *numberresult);
1779 /* skip over the separating '\0' */
1780 assert(*numberresult == '\0');
1781 numberresult++;
1782 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 break;
1784 case 's':
1785 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001786 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001788 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 size = PyUnicode_GET_LENGTH(*callresult);
1790 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001791 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1792 *callresult, 0,
1793 size) < 0)
1794 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001796 /* We're done with the unicode()/repr() => forget it */
1797 Py_DECREF(*callresult);
1798 /* switch to next unicode()/repr() result */
1799 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001800 break;
1801 }
1802 case 'U':
1803 {
1804 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 Py_ssize_t size;
1806 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1807 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001808 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1809 obj, 0,
1810 size) < 0)
1811 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 break;
1814 }
1815 case 'V':
1816 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001819 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001820 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 size = PyUnicode_GET_LENGTH(obj);
1822 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001823 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1824 obj, 0,
1825 size) < 0)
1826 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 size = PyUnicode_GET_LENGTH(*callresult);
1830 assert(PyUnicode_KIND(*callresult) <=
1831 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001832 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1833 *callresult,
1834 0, size) < 0)
1835 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001837 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001838 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001839 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001840 break;
1841 }
1842 case 'S':
1843 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001844 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 /* unused, since we already have the result */
1847 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001849 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1850 *callresult, 0,
1851 PyUnicode_GET_LENGTH(*callresult)) < 0)
1852 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001854 /* We're done with the unicode()/repr() => forget it */
1855 Py_DECREF(*callresult);
1856 /* switch to next unicode()/repr() result */
1857 ++callresult;
1858 break;
1859 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001860 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001862 break;
1863 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 for (; *p; ++p, ++i)
1865 PyUnicode_WRITE(kind, data, i, *p);
1866 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001867 goto end;
1868 }
Victor Stinner1205f272010-09-11 00:54:47 +00001869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 else {
1871 assert(i < PyUnicode_GET_LENGTH(string));
1872 PyUnicode_WRITE(kind, data, i++, *f);
1873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876
Benjamin Peterson29060642009-01-31 22:14:21 +00001877 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001878 if (callresults)
1879 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 if (numberresults)
1881 PyObject_Free(numberresults);
1882 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001884 if (callresults) {
1885 PyObject **callresult2 = callresults;
1886 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001887 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 ++callresult2;
1889 }
1890 PyObject_Free(callresults);
1891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (numberresults)
1893 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001894 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001895}
1896
Walter Dörwaldd2034312007-05-18 16:29:38 +00001897PyObject *
1898PyUnicode_FromFormat(const char *format, ...)
1899{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001900 PyObject* ret;
1901 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001902
1903#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001905#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001907#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001908 ret = PyUnicode_FromFormatV(format, vargs);
1909 va_end(vargs);
1910 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001911}
1912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913#ifdef HAVE_WCHAR_H
1914
Victor Stinner5593d8a2010-10-02 11:11:27 +00001915/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1916 convert a Unicode object to a wide character string.
1917
Victor Stinnerd88d9832011-09-06 02:00:05 +02001918 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001919 character) required to convert the unicode object. Ignore size argument.
1920
Victor Stinnerd88d9832011-09-06 02:00:05 +02001921 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001922 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001923 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001924static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001925unicode_aswidechar(PyUnicodeObject *unicode,
1926 wchar_t *w,
1927 Py_ssize_t size)
1928{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001929 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 const wchar_t *wstr;
1931
1932 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1933 if (wstr == NULL)
1934 return -1;
1935
Victor Stinner5593d8a2010-10-02 11:11:27 +00001936 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001937 if (size > res)
1938 size = res + 1;
1939 else
1940 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001942 return res;
1943 }
1944 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001946}
1947
1948Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001949PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001950 wchar_t *w,
1951 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952{
1953 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 PyErr_BadInternalCall();
1955 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001957 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958}
1959
Victor Stinner137c34c2010-09-29 10:25:54 +00001960wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001961PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001962 Py_ssize_t *size)
1963{
1964 wchar_t* buffer;
1965 Py_ssize_t buflen;
1966
1967 if (unicode == NULL) {
1968 PyErr_BadInternalCall();
1969 return NULL;
1970 }
1971
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001972 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 if (buflen == -1)
1974 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001975 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001976 PyErr_NoMemory();
1977 return NULL;
1978 }
1979
Victor Stinner137c34c2010-09-29 10:25:54 +00001980 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1981 if (buffer == NULL) {
1982 PyErr_NoMemory();
1983 return NULL;
1984 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001985 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (buflen == -1)
1987 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001988 if (size != NULL)
1989 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001990 return buffer;
1991}
1992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994
Alexander Belopolsky40018472011-02-26 01:02:56 +00001995PyObject *
1996PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001999 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002000 PyErr_SetString(PyExc_ValueError,
2001 "chr() arg not in range(0x110000)");
2002 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002003 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 if (ordinal < 256)
2006 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 v = PyUnicode_New(1, ordinal);
2009 if (v == NULL)
2010 return NULL;
2011 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2012 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002013}
2014
Alexander Belopolsky40018472011-02-26 01:02:56 +00002015PyObject *
2016PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002018 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002020 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 Py_INCREF(obj);
2022 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002023 }
2024 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002025 /* For a Unicode subtype that's not a Unicode object,
2026 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 if (PyUnicode_READY(obj) == -1)
2028 return NULL;
2029 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002030 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002031 PyErr_Format(PyExc_TypeError,
2032 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002033 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002034 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002035}
2036
Alexander Belopolsky40018472011-02-26 01:02:56 +00002037PyObject *
2038PyUnicode_FromEncodedObject(register PyObject *obj,
2039 const char *encoding,
2040 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002041{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002042 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002043 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 PyErr_BadInternalCall();
2047 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002049
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002050 /* Decoding bytes objects is the most common case and should be fast */
2051 if (PyBytes_Check(obj)) {
2052 if (PyBytes_GET_SIZE(obj) == 0) {
2053 Py_INCREF(unicode_empty);
2054 v = (PyObject *) unicode_empty;
2055 }
2056 else {
2057 v = PyUnicode_Decode(
2058 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2059 encoding, errors);
2060 }
2061 return v;
2062 }
2063
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002064 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002065 PyErr_SetString(PyExc_TypeError,
2066 "decoding str is not supported");
2067 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002068 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002070 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2071 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2072 PyErr_Format(PyExc_TypeError,
2073 "coercing to str: need bytes, bytearray "
2074 "or buffer-like object, %.80s found",
2075 Py_TYPE(obj)->tp_name);
2076 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002077 }
Tim Petersced69f82003-09-16 20:30:58 +00002078
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002079 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002080 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002081 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 }
Tim Petersced69f82003-09-16 20:30:58 +00002083 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002084 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002085
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002086 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002087 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088}
2089
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and every '_' becomes '-', so that spellings such
   as UTF_8 can be matched.  lower_len is the capacity of lower in bytes,
   including the terminating null.

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in the failure case lower is left without a
   terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const limit = lower + (lower_len - 1);

    for (in = encoding; *in != '\0'; in++) {
        if (out == limit)
            return 0;

        if (Py_ISUPPER(*in))
            *out = Py_TOLOWER(*in);
        else if (*in == '_')
            *out = '-';
        else
            *out = *in;
        out++;
    }

    *out = '\0';
    return 1;
}
2122
Alexander Belopolsky40018472011-02-26 01:02:56 +00002123PyObject *
2124PyUnicode_Decode(const char *s,
2125 Py_ssize_t size,
2126 const char *encoding,
2127 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002128{
2129 PyObject *buffer = NULL, *unicode;
2130 Py_buffer info;
2131 char lower[11]; /* Enough for any encoding shortcut */
2132
2133 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002134 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002135
2136 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002137 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002138 if ((strcmp(lower, "utf-8") == 0) ||
2139 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002140 return PyUnicode_DecodeUTF8(s, size, errors);
2141 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002142 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002143 (strcmp(lower, "iso-8859-1") == 0))
2144 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002145#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002146 else if (strcmp(lower, "mbcs") == 0)
2147 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002148#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002149 else if (strcmp(lower, "ascii") == 0)
2150 return PyUnicode_DecodeASCII(s, size, errors);
2151 else if (strcmp(lower, "utf-16") == 0)
2152 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2153 else if (strcmp(lower, "utf-32") == 0)
2154 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156
2157 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002158 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002159 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002160 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002161 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 if (buffer == NULL)
2163 goto onError;
2164 unicode = PyCodec_Decode(buffer, encoding, errors);
2165 if (unicode == NULL)
2166 goto onError;
2167 if (!PyUnicode_Check(unicode)) {
2168 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002169 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002170 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 Py_DECREF(unicode);
2172 goto onError;
2173 }
2174 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 if (PyUnicode_READY(unicode)) {
2176 Py_DECREF(unicode);
2177 return NULL;
2178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002180
Benjamin Peterson29060642009-01-31 22:14:21 +00002181 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 Py_XDECREF(buffer);
2183 return NULL;
2184}
2185
Alexander Belopolsky40018472011-02-26 01:02:56 +00002186PyObject *
2187PyUnicode_AsDecodedObject(PyObject *unicode,
2188 const char *encoding,
2189 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002190{
2191 PyObject *v;
2192
2193 if (!PyUnicode_Check(unicode)) {
2194 PyErr_BadArgument();
2195 goto onError;
2196 }
2197
2198 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002199 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002200
2201 /* Decode via the codec registry */
2202 v = PyCodec_Decode(unicode, encoding, errors);
2203 if (v == NULL)
2204 goto onError;
2205 return v;
2206
Benjamin Peterson29060642009-01-31 22:14:21 +00002207 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002208 return NULL;
2209}
2210
Alexander Belopolsky40018472011-02-26 01:02:56 +00002211PyObject *
2212PyUnicode_AsDecodedUnicode(PyObject *unicode,
2213 const char *encoding,
2214 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002215{
2216 PyObject *v;
2217
2218 if (!PyUnicode_Check(unicode)) {
2219 PyErr_BadArgument();
2220 goto onError;
2221 }
2222
2223 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002225
2226 /* Decode via the codec registry */
2227 v = PyCodec_Decode(unicode, encoding, errors);
2228 if (v == NULL)
2229 goto onError;
2230 if (!PyUnicode_Check(v)) {
2231 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002232 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002233 Py_TYPE(v)->tp_name);
2234 Py_DECREF(v);
2235 goto onError;
2236 }
2237 return v;
2238
Benjamin Peterson29060642009-01-31 22:14:21 +00002239 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002240 return NULL;
2241}
2242
Alexander Belopolsky40018472011-02-26 01:02:56 +00002243PyObject *
2244PyUnicode_Encode(const Py_UNICODE *s,
2245 Py_ssize_t size,
2246 const char *encoding,
2247 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248{
2249 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 unicode = PyUnicode_FromUnicode(s, size);
2252 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2255 Py_DECREF(unicode);
2256 return v;
2257}
2258
Alexander Belopolsky40018472011-02-26 01:02:56 +00002259PyObject *
2260PyUnicode_AsEncodedObject(PyObject *unicode,
2261 const char *encoding,
2262 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002263{
2264 PyObject *v;
2265
2266 if (!PyUnicode_Check(unicode)) {
2267 PyErr_BadArgument();
2268 goto onError;
2269 }
2270
2271 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002273
2274 /* Encode via the codec registry */
2275 v = PyCodec_Encode(unicode, encoding, errors);
2276 if (v == NULL)
2277 goto onError;
2278 return v;
2279
Benjamin Peterson29060642009-01-31 22:14:21 +00002280 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002281 return NULL;
2282}
2283
Victor Stinnerad158722010-10-27 00:25:46 +00002284PyObject *
2285PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002286{
Victor Stinner99b95382011-07-04 14:23:54 +02002287#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002288 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2289 PyUnicode_GET_SIZE(unicode),
2290 NULL);
2291#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002293#else
Victor Stinner793b5312011-04-27 00:24:21 +02002294 PyInterpreterState *interp = PyThreadState_GET()->interp;
2295 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2296 cannot use it to encode and decode filenames before it is loaded. Load
2297 the Python codec requires to encode at least its own filename. Use the C
2298 version of the locale codec until the codec registry is initialized and
2299 the Python codec is loaded.
2300
2301 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2302 cannot only rely on it: check also interp->fscodec_initialized for
2303 subinterpreters. */
2304 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002305 return PyUnicode_AsEncodedString(unicode,
2306 Py_FileSystemDefaultEncoding,
2307 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002308 }
2309 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002310 /* locale encoding with surrogateescape */
2311 wchar_t *wchar;
2312 char *bytes;
2313 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002314 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002315
2316 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2317 if (wchar == NULL)
2318 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002319 bytes = _Py_wchar2char(wchar, &error_pos);
2320 if (bytes == NULL) {
2321 if (error_pos != (size_t)-1) {
2322 char *errmsg = strerror(errno);
2323 PyObject *exc = NULL;
2324 if (errmsg == NULL)
2325 errmsg = "Py_wchar2char() failed";
2326 raise_encode_exception(&exc,
2327 "filesystemencoding",
2328 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2329 error_pos, error_pos+1,
2330 errmsg);
2331 Py_XDECREF(exc);
2332 }
2333 else
2334 PyErr_NoMemory();
2335 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002336 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002337 }
2338 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002339
2340 bytes_obj = PyBytes_FromString(bytes);
2341 PyMem_Free(bytes);
2342 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002343 }
Victor Stinnerad158722010-10-27 00:25:46 +00002344#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002345}
2346
Alexander Belopolsky40018472011-02-26 01:02:56 +00002347PyObject *
2348PyUnicode_AsEncodedString(PyObject *unicode,
2349 const char *encoding,
2350 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351{
2352 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002353 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002354
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 if (!PyUnicode_Check(unicode)) {
2356 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358 }
Fred Drakee4315f52000-05-09 19:53:39 +00002359
Victor Stinner2f283c22011-03-02 01:21:46 +00002360 if (encoding == NULL) {
2361 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002363 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002365 }
Fred Drakee4315f52000-05-09 19:53:39 +00002366
2367 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002368 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002369 if ((strcmp(lower, "utf-8") == 0) ||
2370 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002371 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002372 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002374 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002376 }
Victor Stinner37296e82010-06-10 13:36:23 +00002377 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002378 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002379 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002381#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002382 else if (strcmp(lower, "mbcs") == 0)
2383 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2384 PyUnicode_GET_SIZE(unicode),
2385 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002386#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002387 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390
2391 /* Encode via the codec registry */
2392 v = PyCodec_Encode(unicode, encoding, errors);
2393 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002394 return NULL;
2395
2396 /* The normal path */
2397 if (PyBytes_Check(v))
2398 return v;
2399
2400 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002401 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002402 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002403 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002404
2405 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2406 "encoder %s returned bytearray instead of bytes",
2407 encoding);
2408 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002409 Py_DECREF(v);
2410 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002411 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002412
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002413 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2414 Py_DECREF(v);
2415 return b;
2416 }
2417
2418 PyErr_Format(PyExc_TypeError,
2419 "encoder did not return a bytes object (type=%.400s)",
2420 Py_TYPE(v)->tp_name);
2421 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002422 return NULL;
2423}
2424
Alexander Belopolsky40018472011-02-26 01:02:56 +00002425PyObject *
2426PyUnicode_AsEncodedUnicode(PyObject *unicode,
2427 const char *encoding,
2428 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002429{
2430 PyObject *v;
2431
2432 if (!PyUnicode_Check(unicode)) {
2433 PyErr_BadArgument();
2434 goto onError;
2435 }
2436
2437 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002438 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002439
2440 /* Encode via the codec registry */
2441 v = PyCodec_Encode(unicode, encoding, errors);
2442 if (v == NULL)
2443 goto onError;
2444 if (!PyUnicode_Check(v)) {
2445 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002446 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002447 Py_TYPE(v)->tp_name);
2448 Py_DECREF(v);
2449 goto onError;
2450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002452
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 return NULL;
2455}
2456
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002457PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002458PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002459 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002460 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2461}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002462
Christian Heimes5894ba72007-11-04 11:43:14 +00002463PyObject*
2464PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2465{
Victor Stinner99b95382011-07-04 14:23:54 +02002466#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002467 return PyUnicode_DecodeMBCS(s, size, NULL);
2468#elif defined(__APPLE__)
2469 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2470#else
Victor Stinner793b5312011-04-27 00:24:21 +02002471 PyInterpreterState *interp = PyThreadState_GET()->interp;
2472 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2473 cannot use it to encode and decode filenames before it is loaded. Load
2474 the Python codec requires to encode at least its own filename. Use the C
2475 version of the locale codec until the codec registry is initialized and
2476 the Python codec is loaded.
2477
2478 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2479 cannot only rely on it: check also interp->fscodec_initialized for
2480 subinterpreters. */
2481 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002482 return PyUnicode_Decode(s, size,
2483 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002484 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002485 }
2486 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002487 /* locale encoding with surrogateescape */
2488 wchar_t *wchar;
2489 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002490 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002491
2492 if (s[size] != '\0' || size != strlen(s)) {
2493 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2494 return NULL;
2495 }
2496
Victor Stinner168e1172010-10-16 23:16:16 +00002497 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002499 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002500
Victor Stinner168e1172010-10-16 23:16:16 +00002501 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002502 PyMem_Free(wchar);
2503 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002504 }
Victor Stinnerad158722010-10-27 00:25:46 +00002505#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002506}
2507
Martin v. Löwis011e8422009-05-05 04:43:17 +00002508
2509int
2510PyUnicode_FSConverter(PyObject* arg, void* addr)
2511{
2512 PyObject *output = NULL;
2513 Py_ssize_t size;
2514 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002515 if (arg == NULL) {
2516 Py_DECREF(*(PyObject**)addr);
2517 return 1;
2518 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002519 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002520 output = arg;
2521 Py_INCREF(output);
2522 }
2523 else {
2524 arg = PyUnicode_FromObject(arg);
2525 if (!arg)
2526 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002527 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002528 Py_DECREF(arg);
2529 if (!output)
2530 return 0;
2531 if (!PyBytes_Check(output)) {
2532 Py_DECREF(output);
2533 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2534 return 0;
2535 }
2536 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002537 size = PyBytes_GET_SIZE(output);
2538 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002539 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002540 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002541 Py_DECREF(output);
2542 return 0;
2543 }
2544 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002545 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002546}
2547
2548
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002549int
2550PyUnicode_FSDecoder(PyObject* arg, void* addr)
2551{
2552 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002553 if (arg == NULL) {
2554 Py_DECREF(*(PyObject**)addr);
2555 return 1;
2556 }
2557 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 if (PyUnicode_READY(arg))
2559 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002560 output = arg;
2561 Py_INCREF(output);
2562 }
2563 else {
2564 arg = PyBytes_FromObject(arg);
2565 if (!arg)
2566 return 0;
2567 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2568 PyBytes_GET_SIZE(arg));
2569 Py_DECREF(arg);
2570 if (!output)
2571 return 0;
2572 if (!PyUnicode_Check(output)) {
2573 Py_DECREF(output);
2574 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2575 return 0;
2576 }
2577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2579 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002580 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2581 Py_DECREF(output);
2582 return 0;
2583 }
2584 *(PyObject**)addr = output;
2585 return Py_CLEANUP_SUPPORTED;
2586}
2587
2588
Martin v. Löwis5b222132007-06-10 09:51:05 +00002589char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002591{
Christian Heimesf3863112007-11-22 07:46:41 +00002592 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2594
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002595 if (!PyUnicode_Check(unicode)) {
2596 PyErr_BadArgument();
2597 return NULL;
2598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002600 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601
2602 if (_PyUnicode_UTF8(unicode) == NULL) {
2603 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2604 if (bytes == NULL)
2605 return NULL;
2606 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2607 if (u->_base.utf8 == NULL) {
2608 Py_DECREF(bytes);
2609 return NULL;
2610 }
2611 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2612 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2613 Py_DECREF(bytes);
2614 }
2615
2616 if (psize)
2617 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2618 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002619}
2620
2621char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2625}
2626
2627#ifdef Py_DEBUG
2628int unicode_as_unicode_calls = 0;
2629#endif
2630
2631
2632Py_UNICODE *
2633PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2634{
2635 PyUnicodeObject *u;
2636 const unsigned char *one_byte;
2637#if SIZEOF_WCHAR_T == 4
2638 const Py_UCS2 *two_bytes;
2639#else
2640 const Py_UCS4 *four_bytes;
2641 const Py_UCS4 *ucs4_end;
2642 Py_ssize_t num_surrogates;
2643#endif
2644 wchar_t *w;
2645 wchar_t *wchar_end;
2646
2647 if (!PyUnicode_Check(unicode)) {
2648 PyErr_BadArgument();
2649 return NULL;
2650 }
2651 u = (PyUnicodeObject*)unicode;
2652 if (_PyUnicode_WSTR(u) == NULL) {
2653 /* Non-ASCII compact unicode object */
2654 assert(_PyUnicode_KIND(u) != 0);
2655 assert(PyUnicode_IS_READY(u));
2656
2657#ifdef Py_DEBUG
2658 ++unicode_as_unicode_calls;
2659#endif
2660
2661 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2662#if SIZEOF_WCHAR_T == 2
2663 four_bytes = PyUnicode_4BYTE_DATA(u);
2664 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2665 num_surrogates = 0;
2666
2667 for (; four_bytes < ucs4_end; ++four_bytes) {
2668 if (*four_bytes > 0xFFFF)
2669 ++num_surrogates;
2670 }
2671
2672 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2673 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2674 if (!_PyUnicode_WSTR(u)) {
2675 PyErr_NoMemory();
2676 return NULL;
2677 }
2678 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2679
2680 w = _PyUnicode_WSTR(u);
2681 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2682 four_bytes = PyUnicode_4BYTE_DATA(u);
2683 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2684 if (*four_bytes > 0xFFFF) {
2685 /* encode surrogate pair in this case */
2686 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2687 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2688 }
2689 else
2690 *w = *four_bytes;
2691
2692 if (w > wchar_end) {
2693 assert(0 && "Miscalculated string end");
2694 }
2695 }
2696 *w = 0;
2697#else
2698 /* sizeof(wchar_t) == 4 */
2699 Py_FatalError("Impossible unicode object state, wstr and str "
2700 "should share memory already.");
2701 return NULL;
2702#endif
2703 }
2704 else {
2705 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2706 (_PyUnicode_LENGTH(u) + 1));
2707 if (!_PyUnicode_WSTR(u)) {
2708 PyErr_NoMemory();
2709 return NULL;
2710 }
2711 if (!PyUnicode_IS_COMPACT_ASCII(u))
2712 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2713 w = _PyUnicode_WSTR(u);
2714 wchar_end = w + _PyUnicode_LENGTH(u);
2715
2716 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2717 one_byte = PyUnicode_1BYTE_DATA(u);
2718 for (; w < wchar_end; ++one_byte, ++w)
2719 *w = *one_byte;
2720 /* null-terminate the wstr */
2721 *w = 0;
2722 }
2723 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2724#if SIZEOF_WCHAR_T == 4
2725 two_bytes = PyUnicode_2BYTE_DATA(u);
2726 for (; w < wchar_end; ++two_bytes, ++w)
2727 *w = *two_bytes;
2728 /* null-terminate the wstr */
2729 *w = 0;
2730#else
2731 /* sizeof(wchar_t) == 2 */
2732 PyObject_FREE(_PyUnicode_WSTR(u));
2733 _PyUnicode_WSTR(u) = NULL;
2734 Py_FatalError("Impossible unicode object state, wstr "
2735 "and str should share memory already.");
2736 return NULL;
2737#endif
2738 }
2739 else {
2740 assert(0 && "This should never happen.");
2741 }
2742 }
2743 }
2744 if (size != NULL)
2745 *size = PyUnicode_WSTR_LENGTH(u);
2746 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002747}
2748
Alexander Belopolsky40018472011-02-26 01:02:56 +00002749Py_UNICODE *
2750PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753}
2754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755
Alexander Belopolsky40018472011-02-26 01:02:56 +00002756Py_ssize_t
2757PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (!PyUnicode_Check(unicode)) {
2760 PyErr_BadArgument();
2761 goto onError;
2762 }
2763 return PyUnicode_GET_SIZE(unicode);
2764
Benjamin Peterson29060642009-01-31 22:14:21 +00002765 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 return -1;
2767}
2768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769Py_ssize_t
2770PyUnicode_GetLength(PyObject *unicode)
2771{
2772 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2773 PyErr_BadArgument();
2774 return -1;
2775 }
2776
2777 return PyUnicode_GET_LENGTH(unicode);
2778}
2779
2780Py_UCS4
2781PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2782{
2783 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2784 return PyErr_BadArgument();
2785 return (Py_UCS4)-1;
2786 }
2787 return PyUnicode_READ_CHAR(unicode, index);
2788}
2789
2790int
2791PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2792{
2793 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2794 return PyErr_BadArgument();
2795 return -1;
2796 }
2797
2798 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2799 index, ch);
2800 return 0;
2801}
2802
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2808
Victor Stinner554f3f02010-06-16 23:33:54 +00002809/* create or adjust a UnicodeDecodeError */
2810static void
2811make_decode_exception(PyObject **exceptionObject,
2812 const char *encoding,
2813 const char *input, Py_ssize_t length,
2814 Py_ssize_t startpos, Py_ssize_t endpos,
2815 const char *reason)
2816{
2817 if (*exceptionObject == NULL) {
2818 *exceptionObject = PyUnicodeDecodeError_Create(
2819 encoding, input, length, startpos, endpos, reason);
2820 }
2821 else {
2822 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2823 goto onError;
2824 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2825 goto onError;
2826 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2827 goto onError;
2828 }
2829 return;
2830
2831onError:
2832 Py_DECREF(*exceptionObject);
2833 *exceptionObject = NULL;
2834}
2835
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002836/* error handling callback helper:
2837 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002838 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 and adjust various state variables.
2840 return 0 on success, -1 on error
2841*/
2842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843static int
2844unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2845 const char *encoding, const char *reason,
2846 const char **input, const char **inend, Py_ssize_t *startinpos,
2847 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2848 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002850 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851
2852 PyObject *restuple = NULL;
2853 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002855 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002856 Py_ssize_t requiredsize;
2857 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002859 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002860 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861 int res = -1;
2862
2863 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 *errorHandler = PyCodec_LookupError(errors);
2865 if (*errorHandler == NULL)
2866 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 }
2868
Victor Stinner554f3f02010-06-16 23:33:54 +00002869 make_decode_exception(exceptionObject,
2870 encoding,
2871 *input, *inend - *input,
2872 *startinpos, *endinpos,
2873 reason);
2874 if (*exceptionObject == NULL)
2875 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876
2877 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2878 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002881 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002883 }
2884 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002886
2887 /* Copy back the bytes variables, which might have been modified by the
2888 callback */
2889 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2890 if (!inputobj)
2891 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002892 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002894 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002895 *input = PyBytes_AS_STRING(inputobj);
2896 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002897 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002898 /* we can DECREF safely, as the exception has another reference,
2899 so the object won't go away. */
2900 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002901
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002902 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002904 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2906 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908
2909 /* need more space? (at least enough for what we
2910 have+the replacement+the rest of the string (starting
2911 at the new input position), so we won't have to check space
2912 when there are no errors in the rest of the string) */
2913 repptr = PyUnicode_AS_UNICODE(repunicode);
2914 repsize = PyUnicode_GET_SIZE(repunicode);
2915 requiredsize = *outpos + repsize + insize-newpos;
2916 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 if (requiredsize<2*outsize)
2918 requiredsize = 2*outsize;
2919 if (_PyUnicode_Resize(output, requiredsize) < 0)
2920 goto onError;
2921 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 }
2923 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002924 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 Py_UNICODE_COPY(*outptr, repptr, repsize);
2926 *outptr += repsize;
2927 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002928
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 /* we made it! */
2930 res = 0;
2931
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 Py_XDECREF(restuple);
2934 return res;
2935}
2936
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002937/* --- UTF-7 Codec -------------------------------------------------------- */
2938
Antoine Pitrou244651a2009-05-04 18:56:13 +00002939/* See RFC2152 for details. We encode conservatively and decode liberally. */
2940
2941/* Three simple macros defining base-64. */
2942
/* IS_BASE64: true iff c is a letter, a digit, '+' or '/'.
   The argument is evaluated more than once. */

#define IS_BASE64(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= '0' && (c) <= '9') ||        \
     (c) == '+' ||                        \
     (c) == '/')
2950
/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds: any character that is not a
   letter, a digit or '+' yields 63.  The argument is evaluated more than
   once. */

#define FROM_BASE64(c)                                    \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :             \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :        \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :        \
     (c) == '+' ? 62 :                                    \
     63)
2958
/* TO_BASE64: the base-64 character for the bottom 6 bits of n
   (higher bits are masked off). */

#define TO_BASE64(n)                         \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
      "abcdefghijklmnopqrstuvwxyz"           \
      "0123456789+/")[(n) & 0x3f])
2963
/* DECODE_DIRECT: true iff the byte c, met outside a base-64 section of a
 * UTF-7 string, decodes to itself.  Decoding is permissive: every value up
 * to 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2971
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  utf7_category[c] is one of:
 *
 *   0 : "Set D"      - alphanumeric and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special (must be base64 encoded)
 *                    - everything else, i.e. +\~ and the non-printing
 *                      codes 0-8 11-12 14-31 127
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3005
/* ENCODE_DIRECT: true iff the character c should be encoded as itself.
 * Set D characters always are; Set O characters only when directO is
 * true, and whitespace only when directWS is true.  NUL and anything at
 * or above 128 never are.  RFC2152 makes it clear that the treatment of
 * Set O and whitespace varies between applications, hence the two
 * flags. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      (directWS && (utf7_category[(c)] == 2)) ||            \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003017
Alexander Belopolsky40018472011-02-26 01:02:56 +00003018PyObject *
3019PyUnicode_DecodeUTF7(const char *s,
3020 Py_ssize_t size,
3021 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003022{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003023 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3024}
3025
Antoine Pitrou244651a2009-05-04 18:56:13 +00003026/* The decoder. The only state we preserve is our read position,
3027 * i.e. how many characters we have consumed. So if we end in the
3028 * middle of a shift sequence we have to back off the read position
3029 * and the output to the beginning of the sequence, otherwise we lose
3030 * all the shift state (seen bits, number of bits seen, high
3031 * surrogate). */
3032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
3034PyUnicode_DecodeUTF7Stateful(const char *s,
3035 Py_ssize_t size,
3036 const char *errors,
3037 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003038{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003039 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003040 Py_ssize_t startinpos;
3041 Py_ssize_t endinpos;
3042 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003043 const char *e;
3044 PyUnicodeObject *unicode;
3045 Py_UNICODE *p;
3046 const char *errmsg = "";
3047 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003048 Py_UNICODE *shiftOutStart;
3049 unsigned int base64bits = 0;
3050 unsigned long base64buffer = 0;
3051 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 PyObject *errorHandler = NULL;
3053 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003054
3055 unicode = _PyUnicode_New(size);
3056 if (!unicode)
3057 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003058 if (size == 0) {
3059 if (consumed)
3060 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003061 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003062 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003064 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003065 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003066 e = s + size;
3067
3068 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003071 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003072
Antoine Pitrou244651a2009-05-04 18:56:13 +00003073 if (inShift) { /* in a base-64 section */
3074 if (IS_BASE64(ch)) { /* consume a base-64 character */
3075 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3076 base64bits += 6;
3077 s++;
3078 if (base64bits >= 16) {
3079 /* we have enough bits for a UTF-16 value */
3080 Py_UNICODE outCh = (Py_UNICODE)
3081 (base64buffer >> (base64bits-16));
3082 base64bits -= 16;
3083 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3084 if (surrogate) {
3085 /* expecting a second surrogate */
3086 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3087#ifdef Py_UNICODE_WIDE
3088 *p++ = (((surrogate & 0x3FF)<<10)
3089 | (outCh & 0x3FF)) + 0x10000;
3090#else
3091 *p++ = surrogate;
3092 *p++ = outCh;
3093#endif
3094 surrogate = 0;
3095 }
3096 else {
3097 surrogate = 0;
3098 errmsg = "second surrogate missing";
3099 goto utf7Error;
3100 }
3101 }
3102 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3103 /* first surrogate */
3104 surrogate = outCh;
3105 }
3106 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3107 errmsg = "unexpected second surrogate";
3108 goto utf7Error;
3109 }
3110 else {
3111 *p++ = outCh;
3112 }
3113 }
3114 }
3115 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003116 inShift = 0;
3117 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003118 if (surrogate) {
3119 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003120 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003121 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003122 if (base64bits > 0) { /* left-over bits */
3123 if (base64bits >= 6) {
3124 /* We've seen at least one base-64 character */
3125 errmsg = "partial character in shift sequence";
3126 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003128 else {
3129 /* Some bits remain; they should be zero */
3130 if (base64buffer != 0) {
3131 errmsg = "non-zero padding bits in shift sequence";
3132 goto utf7Error;
3133 }
3134 }
3135 }
3136 if (ch != '-') {
3137 /* '-' is absorbed; other terminating
3138 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 *p++ = ch;
3140 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003141 }
3142 }
3143 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003144 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003145 s++; /* consume '+' */
3146 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003147 s++;
3148 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003149 }
3150 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003151 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003152 shiftOutStart = p;
3153 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003154 }
3155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003157 *p++ = ch;
3158 s++;
3159 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003160 else {
3161 startinpos = s-starts;
3162 s++;
3163 errmsg = "unexpected special character";
3164 goto utf7Error;
3165 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003166 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003167utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 outpos = p-PyUnicode_AS_UNICODE(unicode);
3169 endinpos = s-starts;
3170 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 errors, &errorHandler,
3172 "utf7", errmsg,
3173 &starts, &e, &startinpos, &endinpos, &exc, &s,
3174 &unicode, &outpos, &p))
3175 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003176 }
3177
Antoine Pitrou244651a2009-05-04 18:56:13 +00003178 /* end of string */
3179
3180 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3181 /* if we're in an inconsistent state, that's an error */
3182 if (surrogate ||
3183 (base64bits >= 6) ||
3184 (base64bits > 0 && base64buffer != 0)) {
3185 outpos = p-PyUnicode_AS_UNICODE(unicode);
3186 endinpos = size;
3187 if (unicode_decode_call_errorhandler(
3188 errors, &errorHandler,
3189 "utf7", "unterminated shift sequence",
3190 &starts, &e, &startinpos, &endinpos, &exc, &s,
3191 &unicode, &outpos, &p))
3192 goto onError;
3193 if (s < e)
3194 goto restart;
3195 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003196 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003197
3198 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003199 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003200 if (inShift) {
3201 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003202 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003203 }
3204 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003205 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003206 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003207 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003208
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003209 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003210 goto onError;
3211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 Py_XDECREF(errorHandler);
3213 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214 if (PyUnicode_READY(unicode) == -1) {
3215 Py_DECREF(unicode);
3216 return NULL;
3217 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003218 return (PyObject *)unicode;
3219
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 Py_XDECREF(errorHandler);
3222 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003223 Py_DECREF(unicode);
3224 return NULL;
3225}
3226
3227
Alexander Belopolsky40018472011-02-26 01:02:56 +00003228PyObject *
3229PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3230 Py_ssize_t size,
3231 int base64SetO,
3232 int base64WhiteSpace,
3233 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003235 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003236 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003237 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003239 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003240 unsigned int base64bits = 0;
3241 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003242 char * out;
3243 char * start;
3244
3245 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003248 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003249 return PyErr_NoMemory();
3250
Antoine Pitrou244651a2009-05-04 18:56:13 +00003251 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003252 if (v == NULL)
3253 return NULL;
3254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003256 for (;i < size; ++i) {
3257 Py_UNICODE ch = s[i];
3258
Antoine Pitrou244651a2009-05-04 18:56:13 +00003259 if (inShift) {
3260 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3261 /* shifting out */
3262 if (base64bits) { /* output remaining bits */
3263 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3264 base64buffer = 0;
3265 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003266 }
3267 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003268 /* Characters not in the BASE64 set implicitly unshift the sequence
3269 so no '-' is required, except if the character is itself a '-' */
3270 if (IS_BASE64(ch) || ch == '-') {
3271 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003273 *out++ = (char) ch;
3274 }
3275 else {
3276 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003277 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003279 else { /* not in a shift sequence */
3280 if (ch == '+') {
3281 *out++ = '+';
3282 *out++ = '-';
3283 }
3284 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3285 *out++ = (char) ch;
3286 }
3287 else {
3288 *out++ = '+';
3289 inShift = 1;
3290 goto encode_char;
3291 }
3292 }
3293 continue;
3294encode_char:
3295#ifdef Py_UNICODE_WIDE
3296 if (ch >= 0x10000) {
3297 /* code first surrogate */
3298 base64bits += 16;
3299 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3300 while (base64bits >= 6) {
3301 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3302 base64bits -= 6;
3303 }
3304 /* prepare second surrogate */
3305 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3306 }
3307#endif
3308 base64bits += 16;
3309 base64buffer = (base64buffer << 16) | ch;
3310 while (base64bits >= 6) {
3311 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3312 base64bits -= 6;
3313 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003314 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003315 if (base64bits)
3316 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3317 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003318 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003319 if (_PyBytes_Resize(&v, out - start) < 0)
3320 return NULL;
3321 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003322}
3323
Antoine Pitrou244651a2009-05-04 18:56:13 +00003324#undef IS_BASE64
3325#undef FROM_BASE64
3326#undef TO_BASE64
3327#undef DECODE_DIRECT
3328#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003329
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330/* --- UTF-8 Codec -------------------------------------------------------- */
3331
/* Read-only lookup table, hence const. */
static const
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3353
Alexander Belopolsky40018472011-02-26 01:02:56 +00003354PyObject *
3355PyUnicode_DecodeUTF8(const char *s,
3356 Py_ssize_t size,
3357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358{
Walter Dörwald69652032004-09-07 20:24:22 +00003359 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3360}
3361
Antoine Pitrouab868312009-01-10 15:40:25 +00003362/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3363#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3364
3365/* Mask to quickly check whether a C 'long' contains a
3366 non-ASCII, UTF8-encoded char. */
3367#if (SIZEOF_LONG == 8)
3368# define ASCII_CHAR_MASK 0x8080808080808080L
3369#elif (SIZEOF_LONG == 4)
3370# define ASCII_CHAR_MASK 0x80808080L
3371#else
3372# error C 'long' size should be either 4 or 8!
3373#endif
3374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003375/* Scans a UTF-8 string and returns the maximum character to be expected,
3376 the size of the decoded unicode string and if any major errors were
3377 encountered.
3378
3379 This function does check basic UTF-8 sanity, it does however NOT CHECK
3380 if the string contains surrogates, and if all continuation bytes are
3381 within the correct ranges, these checks are performed in
3382 PyUnicode_DecodeUTF8Stateful.
3383
3384 If it sets has_errors to 1, it means the value of unicode_size and max_char
3385 will be bogus and you should not rely on useful information in them.
3386 */
3387static Py_UCS4
3388utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3389 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3390 int *has_errors)
3391{
3392 Py_ssize_t n;
3393 Py_ssize_t char_count = 0;
3394 Py_UCS4 max_char = 127, new_max;
3395 Py_UCS4 upper_bound;
3396 const unsigned char *p = (const unsigned char *)s;
3397 const unsigned char *end = p + string_size;
3398 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3399 int err = 0;
3400
3401 for (; p < end && !err; ++p, ++char_count) {
3402 /* Only check value if it's not a ASCII char... */
3403 if (*p < 0x80) {
3404 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3405 an explanation. */
3406 if (!((size_t) p & LONG_PTR_MASK)) {
3407 /* Help register allocation */
3408 register const unsigned char *_p = p;
3409 while (_p < aligned_end) {
3410 unsigned long value = *(unsigned long *) _p;
3411 if (value & ASCII_CHAR_MASK)
3412 break;
3413 _p += SIZEOF_LONG;
3414 char_count += SIZEOF_LONG;
3415 }
3416 p = _p;
3417 if (p == end)
3418 break;
3419 }
3420 }
3421 if (*p >= 0x80) {
3422 n = utf8_code_length[*p];
3423 new_max = max_char;
3424 switch (n) {
3425 /* invalid start byte */
3426 case 0:
3427 err = 1;
3428 break;
3429 case 2:
3430 /* Code points between 0x00FF and 0x07FF inclusive.
3431 Approximate the upper bound of the code point,
3432 if this flips over 255 we can be sure it will be more
3433 than 255 and the string will need 2 bytes per code coint,
3434 if it stays under or equal to 255, we can be sure 1 byte
3435 is enough.
3436 ((*p & 0b00011111) << 6) | 0b00111111 */
3437 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3438 if (max_char < upper_bound)
3439 new_max = upper_bound;
3440 /* Ensure we track at least that we left ASCII space. */
3441 if (new_max < 128)
3442 new_max = 128;
3443 break;
3444 case 3:
3445 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3446 always > 255 and <= 65535 and will always need 2 bytes. */
3447 if (max_char < 65535)
3448 new_max = 65535;
3449 break;
3450 case 4:
3451 /* Code point will be above 0xFFFF for sure in this case. */
3452 new_max = 65537;
3453 break;
3454 /* Internal error, this should be caught by the first if */
3455 case 1:
3456 default:
3457 assert(0 && "Impossible case in utf8_max_char_and_size");
3458 err = 1;
3459 }
3460 /* Instead of number of overall bytes for this code point,
3461 n containts the number of following bytes: */
3462 --n;
3463 /* Check if the follow up chars are all valid continuation bytes */
3464 if (n >= 1) {
3465 const unsigned char *cont;
3466 if ((p + n) >= end) {
3467 if (consumed == 0)
3468 /* incomplete data, non-incremental decoding */
3469 err = 1;
3470 break;
3471 }
3472 for (cont = p + 1; cont < (p + n); ++cont) {
3473 if ((*cont & 0xc0) != 0x80) {
3474 err = 1;
3475 break;
3476 }
3477 }
3478 p += n;
3479 }
3480 else
3481 err = 1;
3482 max_char = new_max;
3483 }
3484 }
3485
3486 if (unicode_size)
3487 *unicode_size = char_count;
3488 if (has_errors)
3489 *has_errors = err;
3490 return max_char;
3491}
3492
/* Store value at position index of buf, where kind selects the element
   type of buf.  Works like PyUnicode_WRITE but additionally accepts
   PyUnicode_WCHAR_KIND, i.e. the wstr field of the legacy unicode
   representation.  Any kind other than WCHAR, 1BYTE or 2BYTE is treated
   as a Py_UCS4 buffer. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                  \
    do {                                                                 \
        const int k_ = (kind);                                           \
        switch (k_) {                                                    \
        case PyUnicode_WCHAR_KIND:                                       \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);        \
            break;                                                       \
        case PyUnicode_1BYTE_KIND:                                       \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);  \
            break;                                                       \
        case PyUnicode_2BYTE_KIND:                                       \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);              \
            break;                                                       \
        default:                                                         \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);              \
            break;                                                       \
        }                                                                \
    } while (0)
3507
Alexander Belopolsky40018472011-02-26 01:02:56 +00003508PyObject *
3509PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003510 Py_ssize_t size,
3511 const char *errors,
3512 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003516 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003517 Py_ssize_t startinpos;
3518 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003519 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003521 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 PyObject *errorHandler = NULL;
3523 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524 Py_UCS4 maxchar = 0;
3525 Py_ssize_t unicode_size;
3526 Py_ssize_t i;
3527 int kind;
3528 void *data;
3529 int has_errors;
3530 Py_UNICODE *error_outptr;
3531#if SIZEOF_WCHAR_T == 2
3532 Py_ssize_t wchar_offset = 0;
3533#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534
Walter Dörwald69652032004-09-07 20:24:22 +00003535 if (size == 0) {
3536 if (consumed)
3537 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003540 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3541 consumed, &has_errors);
3542 if (has_errors) {
3543 unicode = _PyUnicode_New(size);
3544 if (!unicode)
3545 return NULL;
3546 kind = PyUnicode_WCHAR_KIND;
3547 data = PyUnicode_AS_UNICODE(unicode);
3548 assert(data != NULL);
3549 }
3550 else {
3551 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3552 if (!unicode)
3553 return NULL;
3554 /* When the string is ASCII only, just use memcpy and return.
3555 unicode_size may be != size if there is an incomplete UTF-8
3556 sequence at the end of the ASCII block. */
3557 if (maxchar < 128 && size == unicode_size) {
3558 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3559 return (PyObject *)unicode;
3560 }
3561 kind = PyUnicode_KIND(unicode);
3562 data = PyUnicode_DATA(unicode);
3563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003567 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568
3569 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003570 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571
3572 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003573 /* Fast path for runs of ASCII characters. Given that common UTF-8
3574 input will consist of an overwhelming majority of ASCII
3575 characters, we try to optimize for this case by checking
3576 as many characters as a C 'long' can contain.
3577 First, check if we can do an aligned read, as most CPUs have
3578 a penalty for unaligned reads.
3579 */
3580 if (!((size_t) s & LONG_PTR_MASK)) {
3581 /* Help register allocation */
3582 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003583 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003584 while (_s < aligned_end) {
3585 /* Read a whole long at a time (either 4 or 8 bytes),
3586 and do a fast unrolled copy if it only contains ASCII
3587 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 unsigned long value = *(unsigned long *) _s;
3589 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003590 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3592 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3593 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3594 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003595#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003596 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3597 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3598 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3599 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003600#endif
3601 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003603 }
3604 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003605 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003606 if (s == e)
3607 break;
3608 ch = (unsigned char)*s;
3609 }
3610 }
3611
3612 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003613 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 s++;
3615 continue;
3616 }
3617
3618 n = utf8_code_length[ch];
3619
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003620 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 if (consumed)
3622 break;
3623 else {
3624 errmsg = "unexpected end of data";
3625 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003626 endinpos = startinpos+1;
3627 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3628 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 goto utf8Error;
3630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
3633 switch (n) {
3634
3635 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003636 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 startinpos = s-starts;
3638 endinpos = startinpos+1;
3639 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640
3641 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003642 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 startinpos = s-starts;
3644 endinpos = startinpos+1;
3645 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646
3647 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003648 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003649 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003651 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 goto utf8Error;
3653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003655 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 break;
3658
3659 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003660 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3661 will result in surrogates in range d800-dfff. Surrogates are
3662 not valid UTF-8 so they are rejected.
3663 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3664 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003665 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003666 (s[2] & 0xc0) != 0x80 ||
3667 ((unsigned char)s[0] == 0xE0 &&
3668 (unsigned char)s[1] < 0xA0) ||
3669 ((unsigned char)s[0] == 0xED &&
3670 (unsigned char)s[1] > 0x9F)) {
3671 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003673 endinpos = startinpos + 1;
3674
3675 /* if s[1] first two bits are 1 and 0, then the invalid
3676 continuation byte is s[2], so increment endinpos by 1,
3677 if not, s[1] is invalid and endinpos doesn't need to
3678 be incremented. */
3679 if ((s[1] & 0xC0) == 0x80)
3680 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 goto utf8Error;
3682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003684 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003685 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003686 break;
3687
3688 case 4:
3689 if ((s[1] & 0xc0) != 0x80 ||
3690 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003691 (s[3] & 0xc0) != 0x80 ||
3692 ((unsigned char)s[0] == 0xF0 &&
3693 (unsigned char)s[1] < 0x90) ||
3694 ((unsigned char)s[0] == 0xF4 &&
3695 (unsigned char)s[1] > 0x8F)) {
3696 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003698 endinpos = startinpos + 1;
3699 if ((s[1] & 0xC0) == 0x80) {
3700 endinpos++;
3701 if ((s[2] & 0xC0) == 0x80)
3702 endinpos++;
3703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 goto utf8Error;
3705 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003706 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003707 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3708 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710 /* If the string is flexible or we have native UCS-4, write
3711 directly.. */
3712 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3713 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715 else {
3716 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 /* translate from 10000..10FFFF to 0..FFFF */
3719 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 /* high surrogate = top 10 bits added to D800 */
3722 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3723 (Py_UNICODE)(0xD800 + (ch >> 10)));
3724
3725 /* low surrogate = bottom 10 bits added to DC00 */
3726 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3727 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3728 }
3729#if SIZEOF_WCHAR_T == 2
3730 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003731#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 }
3734 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003736
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 /* If this is not yet a resizable string, make it one.. */
3739 if (kind != PyUnicode_WCHAR_KIND) {
3740 const Py_UNICODE *u;
3741 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3742 if (!new_unicode)
3743 goto onError;
3744 u = PyUnicode_AsUnicode((PyObject *)unicode);
3745 if (!u)
3746 goto onError;
3747#if SIZEOF_WCHAR_T == 2
3748 i += wchar_offset;
3749#endif
3750 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3751 Py_DECREF(unicode);
3752 unicode = new_unicode;
3753 kind = 0;
3754 data = PyUnicode_AS_UNICODE(new_unicode);
3755 assert(data != NULL);
3756 }
3757 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003758 if (unicode_decode_call_errorhandler(
3759 errors, &errorHandler,
3760 "utf8", errmsg,
3761 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 /* Update data because unicode_decode_call_errorhandler might have
3765 re-created or resized the unicode object. */
3766 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 /* Ensure the unicode_size calculation above was correct: */
3770 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3771
Walter Dörwald69652032004-09-07 20:24:22 +00003772 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 /* Adjust length and ready string when it contained errors and
3776 is of the old resizable kind. */
3777 if (kind == PyUnicode_WCHAR_KIND) {
3778 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3779 PyUnicode_READY(unicode) == -1)
3780 goto onError;
3781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_XDECREF(errorHandler);
3784 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 if (PyUnicode_READY(unicode) == -1) {
3786 Py_DECREF(unicode);
3787 return NULL;
3788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return (PyObject *)unicode;
3790
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 Py_XDECREF(errorHandler);
3793 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 Py_DECREF(unicode);
3795 return NULL;
3796}
3797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003799
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003800#ifdef __APPLE__
3801
3802/* Simplified UTF-8 decoder using surrogateescape error handler,
3803 used to decode the command line arguments on Mac OS X. */
3804
3805wchar_t*
3806_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3807{
3808 int n;
3809 const char *e;
3810 wchar_t *unicode, *p;
3811
3812 /* Note: size will always be longer than the resulting Unicode
3813 character count */
3814 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3815 PyErr_NoMemory();
3816 return NULL;
3817 }
3818 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3819 if (!unicode)
3820 return NULL;
3821
3822 /* Unpack UTF-8 encoded data */
3823 p = unicode;
3824 e = s + size;
3825 while (s < e) {
3826 Py_UCS4 ch = (unsigned char)*s;
3827
3828 if (ch < 0x80) {
3829 *p++ = (wchar_t)ch;
3830 s++;
3831 continue;
3832 }
3833
3834 n = utf8_code_length[ch];
3835 if (s + n > e) {
3836 goto surrogateescape;
3837 }
3838
3839 switch (n) {
3840 case 0:
3841 case 1:
3842 goto surrogateescape;
3843
3844 case 2:
3845 if ((s[1] & 0xc0) != 0x80)
3846 goto surrogateescape;
3847 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3848 assert ((ch > 0x007F) && (ch <= 0x07FF));
3849 *p++ = (wchar_t)ch;
3850 break;
3851
3852 case 3:
3853 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3854 will result in surrogates in range d800-dfff. Surrogates are
3855 not valid UTF-8 so they are rejected.
3856 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3857 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3858 if ((s[1] & 0xc0) != 0x80 ||
3859 (s[2] & 0xc0) != 0x80 ||
3860 ((unsigned char)s[0] == 0xE0 &&
3861 (unsigned char)s[1] < 0xA0) ||
3862 ((unsigned char)s[0] == 0xED &&
3863 (unsigned char)s[1] > 0x9F)) {
3864
3865 goto surrogateescape;
3866 }
3867 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3868 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003870 break;
3871
3872 case 4:
3873 if ((s[1] & 0xc0) != 0x80 ||
3874 (s[2] & 0xc0) != 0x80 ||
3875 (s[3] & 0xc0) != 0x80 ||
3876 ((unsigned char)s[0] == 0xF0 &&
3877 (unsigned char)s[1] < 0x90) ||
3878 ((unsigned char)s[0] == 0xF4 &&
3879 (unsigned char)s[1] > 0x8F)) {
3880 goto surrogateescape;
3881 }
3882 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3883 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3884 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3885
3886#if SIZEOF_WCHAR_T == 4
3887 *p++ = (wchar_t)ch;
3888#else
3889 /* compute and append the two surrogates: */
3890
3891 /* translate from 10000..10FFFF to 0..FFFF */
3892 ch -= 0x10000;
3893
3894 /* high surrogate = top 10 bits added to D800 */
3895 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3896
3897 /* low surrogate = bottom 10 bits added to DC00 */
3898 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3899#endif
3900 break;
3901 }
3902 s += n;
3903 continue;
3904
3905 surrogateescape:
3906 *p++ = 0xDC00 + ch;
3907 s++;
3908 }
3909 *p = L'\0';
3910 return unicode;
3911}
3912
3913#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915/* Primary internal function which creates utf8 encoded bytes objects.
3916
3917 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003918 and allocate exactly as much space needed at the end. Else allocate the
3919 maximum possible needed (4 result bytes per Unicode character), and return
3920 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003921*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003922PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924{
Tim Peters602f7402002-04-27 18:03:26 +00003925#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003926
Guido van Rossum98297ee2007-11-06 21:34:58 +00003927 Py_ssize_t i; /* index into s of next input byte */
3928 PyObject *result; /* result string object */
3929 char *p; /* next free byte in output buffer */
3930 Py_ssize_t nallocated; /* number of result bytes allocated */
3931 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003932 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003933 PyObject *errorHandler = NULL;
3934 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 int kind;
3936 void *data;
3937 Py_ssize_t size;
3938 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3939#if SIZEOF_WCHAR_T == 2
3940 Py_ssize_t wchar_offset = 0;
3941#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 if (!PyUnicode_Check(unicode)) {
3944 PyErr_BadArgument();
3945 return NULL;
3946 }
3947
3948 if (PyUnicode_READY(unicode) == -1)
3949 return NULL;
3950
3951 if (_PyUnicode_UTF8(unicode))
3952 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3953 _PyUnicode_UTF8_LENGTH(unicode));
3954
3955 kind = PyUnicode_KIND(unicode);
3956 data = PyUnicode_DATA(unicode);
3957 size = PyUnicode_GET_LENGTH(unicode);
3958
Tim Peters602f7402002-04-27 18:03:26 +00003959 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960
Tim Peters602f7402002-04-27 18:03:26 +00003961 if (size <= MAX_SHORT_UNICHARS) {
3962 /* Write into the stack buffer; nallocated can't overflow.
3963 * At the end, we'll allocate exactly as much heap space as it
3964 * turns out we need.
3965 */
3966 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003967 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003968 p = stackbuf;
3969 }
3970 else {
3971 /* Overallocate on the heap, and give the excess back at the end. */
3972 nallocated = size * 4;
3973 if (nallocated / 4 != size) /* overflow! */
3974 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003975 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003976 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003977 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003978 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003979 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003980
Tim Peters602f7402002-04-27 18:03:26 +00003981 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003983
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003984 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003985 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003987
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003989 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003990 *p++ = (char)(0xc0 | (ch >> 6));
3991 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003992 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 Py_ssize_t newpos;
3994 PyObject *rep;
3995 Py_ssize_t repsize, k, startpos;
3996 startpos = i-1;
3997#if SIZEOF_WCHAR_T == 2
3998 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003999#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 rep = unicode_encode_call_errorhandler(
4001 errors, &errorHandler, "utf-8", "surrogates not allowed",
4002 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4003 &exc, startpos, startpos+1, &newpos);
4004 if (!rep)
4005 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 if (PyBytes_Check(rep))
4008 repsize = PyBytes_GET_SIZE(rep);
4009 else
4010 repsize = PyUnicode_GET_SIZE(rep);
4011
4012 if (repsize > 4) {
4013 Py_ssize_t offset;
4014
4015 if (result == NULL)
4016 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004017 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4021 /* integer overflow */
4022 PyErr_NoMemory();
4023 goto error;
4024 }
4025 nallocated += repsize - 4;
4026 if (result != NULL) {
4027 if (_PyBytes_Resize(&result, nallocated) < 0)
4028 goto error;
4029 } else {
4030 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004031 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 goto error;
4033 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4034 }
4035 p = PyBytes_AS_STRING(result) + offset;
4036 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 if (PyBytes_Check(rep)) {
4039 char *prep = PyBytes_AS_STRING(rep);
4040 for(k = repsize; k > 0; k--)
4041 *p++ = *prep++;
4042 } else /* rep is unicode */ {
4043 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4044 Py_UNICODE c;
4045
4046 for(k=0; k<repsize; k++) {
4047 c = prep[k];
4048 if (0x80 <= c) {
4049 raise_encode_exception(&exc, "utf-8",
4050 PyUnicode_AS_UNICODE(unicode),
4051 size, i-1, i,
4052 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004053 goto error;
4054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004056 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004059 } else if (ch < 0x10000) {
4060 *p++ = (char)(0xe0 | (ch >> 12));
4061 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4062 *p++ = (char)(0x80 | (ch & 0x3f));
4063 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004064 /* Encode UCS4 Unicode ordinals */
4065 *p++ = (char)(0xf0 | (ch >> 18));
4066 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4067 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4068 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069#if SIZEOF_WCHAR_T == 2
4070 wchar_offset++;
4071#endif
Tim Peters602f7402002-04-27 18:03:26 +00004072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004074
Guido van Rossum98297ee2007-11-06 21:34:58 +00004075 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004076 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004077 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004078 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004079 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004080 }
4081 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004082 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004083 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004084 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004085 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004088 Py_XDECREF(errorHandler);
4089 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004090 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004091 error:
4092 Py_XDECREF(errorHandler);
4093 Py_XDECREF(exc);
4094 Py_XDECREF(result);
4095 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004096
Tim Peters602f7402002-04-27 18:03:26 +00004097#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098}
4099
Alexander Belopolsky40018472011-02-26 01:02:56 +00004100PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4102 Py_ssize_t size,
4103 const char *errors)
4104{
4105 PyObject *v, *unicode;
4106
4107 unicode = PyUnicode_FromUnicode(s, size);
4108 if (unicode == NULL)
4109 return NULL;
4110 v = _PyUnicode_AsUTF8String(unicode, errors);
4111 Py_DECREF(unicode);
4112 return v;
4113}
4114
4115PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004116PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119}
4120
Walter Dörwald41980ca2007-08-16 21:55:45 +00004121/* --- UTF-32 Codec ------------------------------------------------------- */
4122
4123PyObject *
4124PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 Py_ssize_t size,
4126 const char *errors,
4127 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004128{
4129 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4130}
4131
4132PyObject *
4133PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 Py_ssize_t size,
4135 const char *errors,
4136 int *byteorder,
4137 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004138{
4139 const char *starts = s;
4140 Py_ssize_t startinpos;
4141 Py_ssize_t endinpos;
4142 Py_ssize_t outpos;
4143 PyUnicodeObject *unicode;
4144 Py_UNICODE *p;
4145#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004146 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004147 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004148#else
4149 const int pairs = 0;
4150#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004151 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004152 int bo = 0; /* assume native ordering by default */
4153 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004154 /* Offsets from q for retrieving bytes in the right order. */
4155#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4156 int iorder[] = {0, 1, 2, 3};
4157#else
4158 int iorder[] = {3, 2, 1, 0};
4159#endif
4160 PyObject *errorHandler = NULL;
4161 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004162
Walter Dörwald41980ca2007-08-16 21:55:45 +00004163 q = (unsigned char *)s;
4164 e = q + size;
4165
4166 if (byteorder)
4167 bo = *byteorder;
4168
4169 /* Check for BOM marks (U+FEFF) in the input and adjust current
4170 byte order setting accordingly. In native mode, the leading BOM
4171 mark is skipped, in all other modes, it is copied to the output
4172 stream as-is (giving a ZWNBSP character). */
4173 if (bo == 0) {
4174 if (size >= 4) {
4175 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004177#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 if (bom == 0x0000FEFF) {
4179 q += 4;
4180 bo = -1;
4181 }
4182 else if (bom == 0xFFFE0000) {
4183 q += 4;
4184 bo = 1;
4185 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004186#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 if (bom == 0x0000FEFF) {
4188 q += 4;
4189 bo = 1;
4190 }
4191 else if (bom == 0xFFFE0000) {
4192 q += 4;
4193 bo = -1;
4194 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004195#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004197 }
4198
4199 if (bo == -1) {
4200 /* force LE */
4201 iorder[0] = 0;
4202 iorder[1] = 1;
4203 iorder[2] = 2;
4204 iorder[3] = 3;
4205 }
4206 else if (bo == 1) {
4207 /* force BE */
4208 iorder[0] = 3;
4209 iorder[1] = 2;
4210 iorder[2] = 1;
4211 iorder[3] = 0;
4212 }
4213
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004214 /* On narrow builds we split characters outside the BMP into two
4215 codepoints => count how much extra space we need. */
4216#ifndef Py_UNICODE_WIDE
4217 for (qq = q; qq < e; qq += 4)
4218 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4219 pairs++;
4220#endif
4221
4222 /* This might be one to much, because of a BOM */
4223 unicode = _PyUnicode_New((size+3)/4+pairs);
4224 if (!unicode)
4225 return NULL;
4226 if (size == 0)
4227 return (PyObject *)unicode;
4228
4229 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004231
Walter Dörwald41980ca2007-08-16 21:55:45 +00004232 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 Py_UCS4 ch;
4234 /* remaining bytes at the end? (size should be divisible by 4) */
4235 if (e-q<4) {
4236 if (consumed)
4237 break;
4238 errmsg = "truncated data";
4239 startinpos = ((const char *)q)-starts;
4240 endinpos = ((const char *)e)-starts;
4241 goto utf32Error;
4242 /* The remaining input chars are ignored if the callback
4243 chooses to skip the input */
4244 }
4245 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4246 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004247
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 if (ch >= 0x110000)
4249 {
4250 errmsg = "codepoint not in range(0x110000)";
4251 startinpos = ((const char *)q)-starts;
4252 endinpos = startinpos+4;
4253 goto utf32Error;
4254 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004255#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 if (ch >= 0x10000)
4257 {
4258 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4259 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4260 }
4261 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004262#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 *p++ = ch;
4264 q += 4;
4265 continue;
4266 utf32Error:
4267 outpos = p-PyUnicode_AS_UNICODE(unicode);
4268 if (unicode_decode_call_errorhandler(
4269 errors, &errorHandler,
4270 "utf32", errmsg,
4271 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4272 &unicode, &outpos, &p))
4273 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004274 }
4275
4276 if (byteorder)
4277 *byteorder = bo;
4278
4279 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004281
4282 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004284 goto onError;
4285
4286 Py_XDECREF(errorHandler);
4287 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 if (PyUnicode_READY(unicode) == -1) {
4289 Py_DECREF(unicode);
4290 return NULL;
4291 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004292 return (PyObject *)unicode;
4293
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004295 Py_DECREF(unicode);
4296 Py_XDECREF(errorHandler);
4297 Py_XDECREF(exc);
4298 return NULL;
4299}
4300
4301PyObject *
4302PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 Py_ssize_t size,
4304 const char *errors,
4305 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004307 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004308 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004309 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004311 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004312#else
4313 const int pairs = 0;
4314#endif
4315 /* Offsets from p for storing byte pairs in the right order. */
4316#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4317 int iorder[] = {0, 1, 2, 3};
4318#else
4319 int iorder[] = {3, 2, 1, 0};
4320#endif
4321
Benjamin Peterson29060642009-01-31 22:14:21 +00004322#define STORECHAR(CH) \
4323 do { \
4324 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4325 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4326 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4327 p[iorder[0]] = (CH) & 0xff; \
4328 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004329 } while(0)
4330
4331 /* In narrow builds we can output surrogate pairs as one codepoint,
4332 so we need less space. */
4333#ifndef Py_UNICODE_WIDE
4334 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4336 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4337 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004338#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004339 nsize = (size - pairs + (byteorder == 0));
4340 bytesize = nsize * 4;
4341 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004343 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004344 if (v == NULL)
4345 return NULL;
4346
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004347 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004350 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004351 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004352
4353 if (byteorder == -1) {
4354 /* force LE */
4355 iorder[0] = 0;
4356 iorder[1] = 1;
4357 iorder[2] = 2;
4358 iorder[3] = 3;
4359 }
4360 else if (byteorder == 1) {
4361 /* force BE */
4362 iorder[0] = 3;
4363 iorder[1] = 2;
4364 iorder[2] = 1;
4365 iorder[3] = 0;
4366 }
4367
4368 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4372 Py_UCS4 ch2 = *s;
4373 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4374 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4375 s++;
4376 size--;
4377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004378 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004379#endif
4380 STORECHAR(ch);
4381 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004382
4383 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004384 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004385#undef STORECHAR
4386}
4387
Alexander Belopolsky40018472011-02-26 01:02:56 +00004388PyObject *
4389PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004390{
4391 if (!PyUnicode_Check(unicode)) {
4392 PyErr_BadArgument();
4393 return NULL;
4394 }
4395 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 PyUnicode_GET_SIZE(unicode),
4397 NULL,
4398 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004399}
4400
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401/* --- UTF-16 Codec ------------------------------------------------------- */
4402
Tim Peters772747b2001-08-09 22:21:55 +00004403PyObject *
4404PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 Py_ssize_t size,
4406 const char *errors,
4407 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408{
Walter Dörwald69652032004-09-07 20:24:22 +00004409 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4410}
4411
Antoine Pitrouab868312009-01-10 15:40:25 +00004412/* Two masks for fast checking of whether a C 'long' may contain
4413 UTF16-encoded surrogate characters. This is an efficient heuristic,
4414 assuming that non-surrogate characters with a code point >= 0x8000 are
4415 rare in most input.
4416 FAST_CHAR_MASK is used when the input is in native byte ordering,
4417 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004418*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004419#if (SIZEOF_LONG == 8)
4420# define FAST_CHAR_MASK 0x8000800080008000L
4421# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4422#elif (SIZEOF_LONG == 4)
4423# define FAST_CHAR_MASK 0x80008000L
4424# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4425#else
4426# error C 'long' size should be either 4 or 8!
4427#endif
4428
Walter Dörwald69652032004-09-07 20:24:22 +00004429PyObject *
4430PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 Py_ssize_t size,
4432 const char *errors,
4433 int *byteorder,
4434 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004437 Py_ssize_t startinpos;
4438 Py_ssize_t endinpos;
4439 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 PyUnicodeObject *unicode;
4441 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004442 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004443 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004444 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004445 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004446 /* Offsets from q for retrieving byte pairs in the right order. */
4447#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4448 int ihi = 1, ilo = 0;
4449#else
4450 int ihi = 0, ilo = 1;
4451#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 PyObject *errorHandler = NULL;
4453 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454
4455 /* Note: size will always be longer than the resulting Unicode
4456 character count */
4457 unicode = _PyUnicode_New(size);
4458 if (!unicode)
4459 return NULL;
4460 if (size == 0)
4461 return (PyObject *)unicode;
4462
4463 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004465 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004466 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467
4468 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004469 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004471 /* Check for BOM marks (U+FEFF) in the input and adjust current
4472 byte order setting accordingly. In native mode, the leading BOM
4473 mark is skipped, in all other modes, it is copied to the output
4474 stream as-is (giving a ZWNBSP character). */
4475 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004476 if (size >= 2) {
4477 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004478#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 if (bom == 0xFEFF) {
4480 q += 2;
4481 bo = -1;
4482 }
4483 else if (bom == 0xFFFE) {
4484 q += 2;
4485 bo = 1;
4486 }
Tim Petersced69f82003-09-16 20:30:58 +00004487#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 if (bom == 0xFEFF) {
4489 q += 2;
4490 bo = 1;
4491 }
4492 else if (bom == 0xFFFE) {
4493 q += 2;
4494 bo = -1;
4495 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004496#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499
Tim Peters772747b2001-08-09 22:21:55 +00004500 if (bo == -1) {
4501 /* force LE */
4502 ihi = 1;
4503 ilo = 0;
4504 }
4505 else if (bo == 1) {
4506 /* force BE */
4507 ihi = 0;
4508 ilo = 1;
4509 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004510#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4511 native_ordering = ilo < ihi;
4512#else
4513 native_ordering = ilo > ihi;
4514#endif
Tim Peters772747b2001-08-09 22:21:55 +00004515
Antoine Pitrouab868312009-01-10 15:40:25 +00004516 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004517 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004519 /* First check for possible aligned read of a C 'long'. Unaligned
4520 reads are more expensive, better to defer to another iteration. */
4521 if (!((size_t) q & LONG_PTR_MASK)) {
4522 /* Fast path for runs of non-surrogate chars. */
4523 register const unsigned char *_q = q;
4524 Py_UNICODE *_p = p;
4525 if (native_ordering) {
4526 /* Native ordering is simple: as long as the input cannot
4527 possibly contain a surrogate char, do an unrolled copy
4528 of several 16-bit code points to the target object.
4529 The non-surrogate check is done on several input bytes
4530 at a time (as many as a C 'long' can contain). */
4531 while (_q < aligned_end) {
4532 unsigned long data = * (unsigned long *) _q;
4533 if (data & FAST_CHAR_MASK)
4534 break;
4535 _p[0] = ((unsigned short *) _q)[0];
4536 _p[1] = ((unsigned short *) _q)[1];
4537#if (SIZEOF_LONG == 8)
4538 _p[2] = ((unsigned short *) _q)[2];
4539 _p[3] = ((unsigned short *) _q)[3];
4540#endif
4541 _q += SIZEOF_LONG;
4542 _p += SIZEOF_LONG / 2;
4543 }
4544 }
4545 else {
4546 /* Byteswapped ordering is similar, but we must decompose
4547 the copy bytewise, and take care of zero'ing out the
4548 upper bytes if the target object is in 32-bit units
4549 (that is, in UCS-4 builds). */
4550 while (_q < aligned_end) {
4551 unsigned long data = * (unsigned long *) _q;
4552 if (data & SWAPPED_FAST_CHAR_MASK)
4553 break;
4554 /* Zero upper bytes in UCS-4 builds */
4555#if (Py_UNICODE_SIZE > 2)
4556 _p[0] = 0;
4557 _p[1] = 0;
4558#if (SIZEOF_LONG == 8)
4559 _p[2] = 0;
4560 _p[3] = 0;
4561#endif
4562#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004563 /* Issue #4916; UCS-4 builds on big endian machines must
4564 fill the two last bytes of each 4-byte unit. */
4565#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4566# define OFF 2
4567#else
4568# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004569#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004570 ((unsigned char *) _p)[OFF + 1] = _q[0];
4571 ((unsigned char *) _p)[OFF + 0] = _q[1];
4572 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4573 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4574#if (SIZEOF_LONG == 8)
4575 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4576 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4577 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4578 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4579#endif
4580#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004581 _q += SIZEOF_LONG;
4582 _p += SIZEOF_LONG / 2;
4583 }
4584 }
4585 p = _p;
4586 q = _q;
4587 if (q >= e)
4588 break;
4589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591
Benjamin Peterson14339b62009-01-31 16:36:08 +00004592 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004593
4594 if (ch < 0xD800 || ch > 0xDFFF) {
4595 *p++ = ch;
4596 continue;
4597 }
4598
4599 /* UTF-16 code pair: */
4600 if (q > e) {
4601 errmsg = "unexpected end of data";
4602 startinpos = (((const char *)q) - 2) - starts;
4603 endinpos = ((const char *)e) + 1 - starts;
4604 goto utf16Error;
4605 }
4606 if (0xD800 <= ch && ch <= 0xDBFF) {
4607 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4608 q += 2;
4609 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004610#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 *p++ = ch;
4612 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004613#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004615#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 continue;
4617 }
4618 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004619 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 startinpos = (((const char *)q)-4)-starts;
4621 endinpos = startinpos+2;
4622 goto utf16Error;
4623 }
4624
Benjamin Peterson14339b62009-01-31 16:36:08 +00004625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 errmsg = "illegal encoding";
4627 startinpos = (((const char *)q)-2)-starts;
4628 endinpos = startinpos+2;
4629 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004630
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 utf16Error:
4632 outpos = p - PyUnicode_AS_UNICODE(unicode);
4633 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004634 errors,
4635 &errorHandler,
4636 "utf16", errmsg,
4637 &starts,
4638 (const char **)&e,
4639 &startinpos,
4640 &endinpos,
4641 &exc,
4642 (const char **)&q,
4643 &unicode,
4644 &outpos,
4645 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004648 /* remaining byte at the end? (size should be even) */
4649 if (e == q) {
4650 if (!consumed) {
4651 errmsg = "truncated data";
4652 startinpos = ((const char *)q) - starts;
4653 endinpos = ((const char *)e) + 1 - starts;
4654 outpos = p - PyUnicode_AS_UNICODE(unicode);
4655 if (unicode_decode_call_errorhandler(
4656 errors,
4657 &errorHandler,
4658 "utf16", errmsg,
4659 &starts,
4660 (const char **)&e,
4661 &startinpos,
4662 &endinpos,
4663 &exc,
4664 (const char **)&q,
4665 &unicode,
4666 &outpos,
4667 &p))
4668 goto onError;
4669 /* The remaining input chars are ignored if the callback
4670 chooses to skip the input */
4671 }
4672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673
4674 if (byteorder)
4675 *byteorder = bo;
4676
Walter Dörwald69652032004-09-07 20:24:22 +00004677 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004679
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 goto onError;
4683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 Py_XDECREF(errorHandler);
4685 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004686 if (PyUnicode_READY(unicode) == -1) {
4687 Py_DECREF(unicode);
4688 return NULL;
4689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 return (PyObject *)unicode;
4691
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 Py_XDECREF(errorHandler);
4695 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 return NULL;
4697}
4698
Antoine Pitrouab868312009-01-10 15:40:25 +00004699#undef FAST_CHAR_MASK
4700#undef SWAPPED_FAST_CHAR_MASK
4701
Tim Peters772747b2001-08-09 22:21:55 +00004702PyObject *
4703PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 Py_ssize_t size,
4705 const char *errors,
4706 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004708 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004709 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004710 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004711#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004712 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004713#else
4714 const int pairs = 0;
4715#endif
Tim Peters772747b2001-08-09 22:21:55 +00004716 /* Offsets from p for storing byte pairs in the right order. */
4717#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4718 int ihi = 1, ilo = 0;
4719#else
4720 int ihi = 0, ilo = 1;
4721#endif
4722
Benjamin Peterson29060642009-01-31 22:14:21 +00004723#define STORECHAR(CH) \
4724 do { \
4725 p[ihi] = ((CH) >> 8) & 0xff; \
4726 p[ilo] = (CH) & 0xff; \
4727 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004728 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004730#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004731 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 if (s[i] >= 0x10000)
4733 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004734#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004735 /* 2 * (size + pairs + (byteorder == 0)) */
4736 if (size > PY_SSIZE_T_MAX ||
4737 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004739 nsize = size + pairs + (byteorder == 0);
4740 bytesize = nsize * 2;
4741 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004743 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 if (v == NULL)
4745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004747 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004750 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004751 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004752
4753 if (byteorder == -1) {
4754 /* force LE */
4755 ihi = 1;
4756 ilo = 0;
4757 }
4758 else if (byteorder == 1) {
4759 /* force BE */
4760 ihi = 0;
4761 ilo = 1;
4762 }
4763
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004764 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 Py_UNICODE ch = *s++;
4766 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004767#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 if (ch >= 0x10000) {
4769 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4770 ch = 0xD800 | ((ch-0x10000) >> 10);
4771 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004772#endif
Tim Peters772747b2001-08-09 22:21:55 +00004773 STORECHAR(ch);
4774 if (ch2)
4775 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004776 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004777
4778 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004779 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004780#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781}
4782
Alexander Belopolsky40018472011-02-26 01:02:56 +00004783PyObject *
4784PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
4786 if (!PyUnicode_Check(unicode)) {
4787 PyErr_BadArgument();
4788 return NULL;
4789 }
4790 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 PyUnicode_GET_SIZE(unicode),
4792 NULL,
4793 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794}
4795
4796/* --- Unicode Escape Codec ----------------------------------------------- */
4797
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to a pure ASCII string.

   s     -- input bytes (need not be NUL terminated)
   size  -- number of bytes in s

   Returns the number of characters the decoded string will contain, or
   -1 when the result is not guaranteed to be ASCII.  -1 is returned for
   a negative size, for any byte above 127, for a backslash that is the
   last byte of the input, and for the \ooo, \x, \u, \U and \N escapes.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with
       it, so that no pointer in front of the buffer is ever formed. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4852
4853/* Similar to PyUnicode_WRITE but either write into wstr field
4854 or treat string as ASCII. */
4855#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
4856 do { \
4857 if ((kind) != PyUnicode_WCHAR_KIND) \
4858 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
4859 else \
4860 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
4861 } while (0)
4862
4863#define WRITE_WSTR(buf, index, value) \
4864 assert(kind == PyUnicode_WCHAR_KIND), \
4865 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
4866
4867
Fredrik Lundh06d12682001-01-24 07:59:11 +00004868static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004869
Alexander Belopolsky40018472011-02-26 01:02:56 +00004870PyObject *
4871PyUnicode_DecodeUnicodeEscape(const char *s,
4872 Py_ssize_t size,
4873 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004876 Py_ssize_t startinpos;
4877 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004882 char* message;
4883 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 PyObject *errorHandler = NULL;
4885 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886 Py_ssize_t ascii_length;
4887 Py_ssize_t i;
4888 int kind;
4889 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 ascii_length = length_of_escaped_ascii_string(s, size);
4892
4893 /* After length_of_escaped_ascii_string() there are two alternatives,
4894 either the string is pure ASCII with named escapes like \n, etc.
4895 and we determined it's exact size (common case)
4896 or it contains \x, \u, ... escape sequences. then we create a
4897 legacy wchar string and resize it at the end of this function. */
4898 if (ascii_length >= 0) {
4899 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4900 if (!v)
4901 goto onError;
4902 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4903 kind = PyUnicode_1BYTE_KIND;
4904 data = PyUnicode_DATA(v);
4905 }
4906 else {
4907 /* Escaped strings will always be longer than the resulting
4908 Unicode string, so we start with size here and then reduce the
4909 length after conversion to the true value.
4910 (but if the error callback returns a long replacement string
4911 we'll have to allocate more space) */
4912 v = _PyUnicode_New(size);
4913 if (!v)
4914 goto onError;
4915 kind = PyUnicode_WCHAR_KIND;
4916 data = PyUnicode_AS_UNICODE(v);
4917 }
4918
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 if (size == 0)
4920 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004921 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 while (s < end) {
4925 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004926 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004929 if (kind == PyUnicode_WCHAR_KIND) {
4930 assert(i < _PyUnicode_WSTR_LENGTH(v));
4931 }
4932 else {
4933 /* The only case in which i == ascii_length is a backslash
4934 followed by a newline. */
4935 assert(i <= ascii_length);
4936 }
4937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 /* Non-escape characters are interpreted as Unicode ordinals */
4939 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 continue;
4942 }
4943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 /* \ - Escapes */
4946 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004947 c = *s++;
4948 if (s > end)
4949 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004950
4951 if (kind == PyUnicode_WCHAR_KIND) {
4952 assert(i < _PyUnicode_WSTR_LENGTH(v));
4953 }
4954 else {
4955 /* The only case in which i == ascii_length is a backslash
4956 followed by a newline. */
4957 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4958 }
4959
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004960 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004964 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4965 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4966 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4967 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4968 /* FF */
4969 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4970 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4971 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4972 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4973 /* VT */
4974 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4975 /* BEL, not classic C */
4976 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 case '0': case '1': case '2': case '3':
4980 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004981 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004982 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004983 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004984 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004985 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004987 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 break;
4989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 /* hex escapes */
4991 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004993 digits = 2;
4994 message = "truncated \\xXX escape";
4995 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004999 digits = 4;
5000 message = "truncated \\uXXXX escape";
5001 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005004 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005005 digits = 8;
5006 message = "truncated \\UXXXXXXXX escape";
5007 hexescape:
5008 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005009 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 if (s+digits>end) {
5011 endinpos = size;
5012 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 errors, &errorHandler,
5014 "unicodeescape", "end of string in escape sequence",
5015 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005016 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005018 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005019 goto nextByte;
5020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005021 for (j = 0; j < digits; ++j) {
5022 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005023 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005024 endinpos = (s+j+1)-starts;
5025 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005026 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 errors, &errorHandler,
5028 "unicodeescape", message,
5029 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005031 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005034 }
5035 chr = (chr<<4) & ~0xF;
5036 if (c >= '0' && c <= '9')
5037 chr += c - '0';
5038 else if (c >= 'a' && c <= 'f')
5039 chr += 10 + c - 'a';
5040 else
5041 chr += 10 + c - 'A';
5042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005044 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 /* _decoding_error will have already written into the
5046 target buffer. */
5047 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005048 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005049 /* when we get here, chr is a 32-bit unicode character */
5050 if (chr <= 0xffff)
5051 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005053 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005054 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005055 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005056#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005057 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005058#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005059 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5061 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005062#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005063 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005064 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005065 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005066 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 errors, &errorHandler,
5068 "unicodeescape", "illegal Unicode character",
5069 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005070 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005071 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005073 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005074 break;
5075
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005077 case 'N':
5078 message = "malformed \\N character escape";
5079 if (ucnhash_CAPI == NULL) {
5080 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005081 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5082 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005083 if (ucnhash_CAPI == NULL)
5084 goto ucnhashError;
5085 }
5086 if (*s == '{') {
5087 const char *start = s+1;
5088 /* look for the closing brace */
5089 while (*s != '}' && s < end)
5090 s++;
5091 if (s > start && s < end && *s == '}') {
5092 /* found a name. look it up in the unicode database */
5093 message = "unknown Unicode character name";
5094 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005095 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5096 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005097 goto store;
5098 }
5099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005101 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 errors, &errorHandler,
5104 "unicodeescape", message,
5105 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005106 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005107 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005109 break;
5110
5111 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005112 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 message = "\\ at end of string";
5115 s--;
5116 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 errors, &errorHandler,
5120 "unicodeescape", message,
5121 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005122 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005123 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005124 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005125 }
5126 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005127 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5128 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005129 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005130 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005133 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005135 /* Ensure the length prediction worked in case of ASCII strings */
5136 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5137
5138 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5139 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005141 Py_XDECREF(errorHandler);
5142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005144
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005146 PyErr_SetString(
5147 PyExc_UnicodeError,
5148 "\\N escapes not supported (can't load unicodedata module)"
5149 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005150 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 Py_XDECREF(errorHandler);
5152 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005153 return NULL;
5154
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157 Py_XDECREF(errorHandler);
5158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 return NULL;
5160}
5161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162#undef WRITE_ASCII_OR_WSTR
5163#undef WRITE_WSTR
5164
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165/* Return a Unicode-Escape string version of the Unicode object.
5166
5167 If quotes is true, the string is enclosed in u"" or u'' quotes as
5168 appropriate.
5169
5170*/
5171
Walter Dörwald79e913e2007-05-12 11:08:06 +00005172static const char *hexdigits = "0123456789abcdef";
5173
Alexander Belopolsky40018472011-02-26 01:02:56 +00005174PyObject *
5175PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5176 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005178 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005181#ifdef Py_UNICODE_WIDE
5182 const Py_ssize_t expandsize = 10;
5183#else
5184 const Py_ssize_t expandsize = 6;
5185#endif
5186
Thomas Wouters89f507f2006-12-13 04:49:30 +00005187 /* XXX(nnorwitz): rather than over-allocating, it would be
5188 better to choose a different scheme. Perhaps scan the
5189 first N-chars of the string and allocate based on that size.
5190 */
5191 /* Initial allocation is based on the longest-possible unichr
5192 escape.
5193
5194 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5195 unichr, so in this case it's the longest unichr escape. In
5196 narrow (UTF-16) builds this is five chars per source unichr
5197 since there are two unichrs in the surrogate pair, so in narrow
5198 (UTF-16) builds it's not the longest unichr escape.
5199
5200 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5201 so in the narrow (UTF-16) build case it's the longest unichr
5202 escape.
5203 */
5204
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005205 if (size == 0)
5206 return PyBytes_FromStringAndSize(NULL, 0);
5207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005208 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005210
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005211 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 2
5213 + expandsize*size
5214 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 if (repr == NULL)
5216 return NULL;
5217
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005218 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 while (size-- > 0) {
5221 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005222
Walter Dörwald79e913e2007-05-12 11:08:06 +00005223 /* Escape backslashes */
5224 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 *p++ = '\\';
5226 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005227 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005228 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005229
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005230#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005231 /* Map 21-bit characters to '\U00xxxxxx' */
5232 else if (ch >= 0x10000) {
5233 *p++ = '\\';
5234 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005235 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5236 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5237 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5238 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5239 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5240 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5241 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5242 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005244 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005245#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5247 else if (ch >= 0xD800 && ch < 0xDC00) {
5248 Py_UNICODE ch2;
5249 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005250
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 ch2 = *s++;
5252 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005253 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5255 *p++ = '\\';
5256 *p++ = 'U';
5257 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5258 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5259 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5260 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5261 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5262 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5263 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5264 *p++ = hexdigits[ucs & 0x0000000F];
5265 continue;
5266 }
5267 /* Fall through: isolated surrogates are copied as-is */
5268 s--;
5269 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005270 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005271#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005274 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 *p++ = '\\';
5276 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005277 *p++ = hexdigits[(ch >> 12) & 0x000F];
5278 *p++ = hexdigits[(ch >> 8) & 0x000F];
5279 *p++ = hexdigits[(ch >> 4) & 0x000F];
5280 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005282
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005283 /* Map special whitespace to '\t', \n', '\r' */
5284 else if (ch == '\t') {
5285 *p++ = '\\';
5286 *p++ = 't';
5287 }
5288 else if (ch == '\n') {
5289 *p++ = '\\';
5290 *p++ = 'n';
5291 }
5292 else if (ch == '\r') {
5293 *p++ = '\\';
5294 *p++ = 'r';
5295 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005296
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005297 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005298 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005300 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005301 *p++ = hexdigits[(ch >> 4) & 0x000F];
5302 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005303 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005304
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 /* Copy everything else as-is */
5306 else
5307 *p++ = (char) ch;
5308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005310 assert(p - PyBytes_AS_STRING(repr) > 0);
5311 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5312 return NULL;
5313 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314}
5315
Alexander Belopolsky40018472011-02-26 01:02:56 +00005316PyObject *
5317PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005319 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 if (!PyUnicode_Check(unicode)) {
5321 PyErr_BadArgument();
5322 return NULL;
5323 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005324 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5325 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005326 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327}
5328
5329/* --- Raw Unicode Escape Codec ------------------------------------------- */
5330
Alexander Belopolsky40018472011-02-26 01:02:56 +00005331PyObject *
5332PyUnicode_DecodeRawUnicodeEscape(const char *s,
5333 Py_ssize_t size,
5334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005337 Py_ssize_t startinpos;
5338 Py_ssize_t endinpos;
5339 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 const char *end;
5343 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 PyObject *errorHandler = NULL;
5345 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 /* Escaped strings will always be longer than the resulting
5348 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005349 length after conversion to the true value. (But decoding error
5350 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 v = _PyUnicode_New(size);
5352 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 end = s + size;
5358 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 unsigned char c;
5360 Py_UCS4 x;
5361 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005362 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 /* Non-escape characters are interpreted as Unicode ordinals */
5365 if (*s != '\\') {
5366 *p++ = (unsigned char)*s++;
5367 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 startinpos = s-starts;
5370
5371 /* \u-escapes are only interpreted iff the number of leading
5372 backslashes if odd */
5373 bs = s;
5374 for (;s < end;) {
5375 if (*s != '\\')
5376 break;
5377 *p++ = (unsigned char)*s++;
5378 }
5379 if (((s - bs) & 1) == 0 ||
5380 s >= end ||
5381 (*s != 'u' && *s != 'U')) {
5382 continue;
5383 }
5384 p--;
5385 count = *s=='u' ? 4 : 8;
5386 s++;
5387
5388 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5389 outpos = p-PyUnicode_AS_UNICODE(v);
5390 for (x = 0, i = 0; i < count; ++i, ++s) {
5391 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005392 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 endinpos = s-starts;
5394 if (unicode_decode_call_errorhandler(
5395 errors, &errorHandler,
5396 "rawunicodeescape", "truncated \\uXXXX",
5397 &starts, &end, &startinpos, &endinpos, &exc, &s,
5398 &v, &outpos, &p))
5399 goto onError;
5400 goto nextByte;
5401 }
5402 x = (x<<4) & ~0xF;
5403 if (c >= '0' && c <= '9')
5404 x += c - '0';
5405 else if (c >= 'a' && c <= 'f')
5406 x += 10 + c - 'a';
5407 else
5408 x += 10 + c - 'A';
5409 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005410 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 /* UCS-2 character */
5412 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005413 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 /* UCS-4 character. Either store directly, or as
5415 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005416#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005418#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 x -= 0x10000L;
5420 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5421 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005422#endif
5423 } else {
5424 endinpos = s-starts;
5425 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005426 if (unicode_decode_call_errorhandler(
5427 errors, &errorHandler,
5428 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 &starts, &end, &startinpos, &endinpos, &exc, &s,
5430 &v, &outpos, &p))
5431 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 nextByte:
5434 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005436 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440 if (PyUnicode_READY(v) == -1) {
5441 Py_DECREF(v);
5442 return NULL;
5443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 Py_XDECREF(errorHandler);
5449 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 return NULL;
5451}
5452
Alexander Belopolsky40018472011-02-26 01:02:56 +00005453PyObject *
5454PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5455 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005457 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 char *p;
5459 char *q;
5460
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005461#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005462 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005463#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005464 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005465#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005466
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005467 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005469
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005470 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 if (repr == NULL)
5472 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005473 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005474 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 while (size-- > 0) {
5478 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005479#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 /* Map 32-bit characters to '\Uxxxxxxxx' */
5481 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005482 *p++ = '\\';
5483 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005484 *p++ = hexdigits[(ch >> 28) & 0xf];
5485 *p++ = hexdigits[(ch >> 24) & 0xf];
5486 *p++ = hexdigits[(ch >> 20) & 0xf];
5487 *p++ = hexdigits[(ch >> 16) & 0xf];
5488 *p++ = hexdigits[(ch >> 12) & 0xf];
5489 *p++ = hexdigits[(ch >> 8) & 0xf];
5490 *p++ = hexdigits[(ch >> 4) & 0xf];
5491 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005492 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005493 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005494#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5496 if (ch >= 0xD800 && ch < 0xDC00) {
5497 Py_UNICODE ch2;
5498 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005499
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 ch2 = *s++;
5501 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005502 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5504 *p++ = '\\';
5505 *p++ = 'U';
5506 *p++ = hexdigits[(ucs >> 28) & 0xf];
5507 *p++ = hexdigits[(ucs >> 24) & 0xf];
5508 *p++ = hexdigits[(ucs >> 20) & 0xf];
5509 *p++ = hexdigits[(ucs >> 16) & 0xf];
5510 *p++ = hexdigits[(ucs >> 12) & 0xf];
5511 *p++ = hexdigits[(ucs >> 8) & 0xf];
5512 *p++ = hexdigits[(ucs >> 4) & 0xf];
5513 *p++ = hexdigits[ucs & 0xf];
5514 continue;
5515 }
5516 /* Fall through: isolated surrogates are copied as-is */
5517 s--;
5518 size++;
5519 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005520#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* Map 16-bit characters to '\uxxxx' */
5522 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 *p++ = '\\';
5524 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005525 *p++ = hexdigits[(ch >> 12) & 0xf];
5526 *p++ = hexdigits[(ch >> 8) & 0xf];
5527 *p++ = hexdigits[(ch >> 4) & 0xf];
5528 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 /* Copy everything else as-is */
5531 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 *p++ = (char) ch;
5533 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005534 size = p - q;
5535
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005536 assert(size > 0);
5537 if (_PyBytes_Resize(&repr, size) < 0)
5538 return NULL;
5539 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540}
5541
Alexander Belopolsky40018472011-02-26 01:02:56 +00005542PyObject *
5543PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005545 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005547 PyErr_BadArgument();
5548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005550 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5551 PyUnicode_GET_SIZE(unicode));
5552
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005553 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554}
5555
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005556/* --- Unicode Internal Codec ------------------------------------------- */
5557
Alexander Belopolsky40018472011-02-26 01:02:56 +00005558PyObject *
5559_PyUnicode_DecodeUnicodeInternal(const char *s,
5560 Py_ssize_t size,
5561 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005562{
5563 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005564 Py_ssize_t startinpos;
5565 Py_ssize_t endinpos;
5566 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005567 PyUnicodeObject *v;
5568 Py_UNICODE *p;
5569 const char *end;
5570 const char *reason;
5571 PyObject *errorHandler = NULL;
5572 PyObject *exc = NULL;
5573
Neal Norwitzd43069c2006-01-08 01:12:10 +00005574#ifdef Py_UNICODE_WIDE
5575 Py_UNICODE unimax = PyUnicode_GetMax();
5576#endif
5577
Thomas Wouters89f507f2006-12-13 04:49:30 +00005578 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005579 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5580 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5583 as string was created with the old API. */
5584 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005586 p = PyUnicode_AS_UNICODE(v);
5587 end = s + size;
5588
5589 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005591 /* We have to sanity check the raw data, otherwise doom looms for
5592 some malformed UCS-4 data. */
5593 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005594#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005595 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005596#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005597 end-s < Py_UNICODE_SIZE
5598 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005600 startinpos = s - starts;
5601 if (end-s < Py_UNICODE_SIZE) {
5602 endinpos = end-starts;
5603 reason = "truncated input";
5604 }
5605 else {
5606 endinpos = s - starts + Py_UNICODE_SIZE;
5607 reason = "illegal code point (> 0x10FFFF)";
5608 }
5609 outpos = p - PyUnicode_AS_UNICODE(v);
5610 if (unicode_decode_call_errorhandler(
5611 errors, &errorHandler,
5612 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005613 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005614 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005615 goto onError;
5616 }
5617 }
5618 else {
5619 p++;
5620 s += Py_UNICODE_SIZE;
5621 }
5622 }
5623
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005624 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005625 goto onError;
5626 Py_XDECREF(errorHandler);
5627 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 if (PyUnicode_READY(v) == -1) {
5629 Py_DECREF(v);
5630 return NULL;
5631 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005632 return (PyObject *)v;
5633
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005635 Py_XDECREF(v);
5636 Py_XDECREF(errorHandler);
5637 Py_XDECREF(exc);
5638 return NULL;
5639}
5640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641/* --- Latin-1 Codec ------------------------------------------------------ */
5642
Alexander Belopolsky40018472011-02-26 01:02:56 +00005643PyObject *
5644PyUnicode_DecodeLatin1(const char *s,
5645 Py_ssize_t size,
5646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005649 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650}
5651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005653static void
5654make_encode_exception(PyObject **exceptionObject,
5655 const char *encoding,
5656 const Py_UNICODE *unicode, Py_ssize_t size,
5657 Py_ssize_t startpos, Py_ssize_t endpos,
5658 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 *exceptionObject = PyUnicodeEncodeError_Create(
5662 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 }
5664 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5666 goto onError;
5667 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5668 goto onError;
5669 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5670 goto onError;
5671 return;
5672 onError:
5673 Py_DECREF(*exceptionObject);
5674 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 }
5676}
5677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005679static void
5680raise_encode_exception(PyObject **exceptionObject,
5681 const char *encoding,
5682 const Py_UNICODE *unicode, Py_ssize_t size,
5683 Py_ssize_t startpos, Py_ssize_t endpos,
5684 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685{
5686 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690}
5691
5692/* error handling callback helper:
5693 build arguments, call the callback and check the arguments,
5694 put the result into newpos and return the replacement string, which
5695 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005696static PyObject *
5697unicode_encode_call_errorhandler(const char *errors,
5698 PyObject **errorHandler,
5699 const char *encoding, const char *reason,
5700 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5701 Py_ssize_t startpos, Py_ssize_t endpos,
5702 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005704 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705
5706 PyObject *restuple;
5707 PyObject *resunicode;
5708
5709 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 }
5714
5715 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005717 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719
5720 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005725 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 Py_DECREF(restuple);
5727 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005729 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 &resunicode, newpos)) {
5731 Py_DECREF(restuple);
5732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005734 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5735 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5736 Py_DECREF(restuple);
5737 return NULL;
5738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005741 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5743 Py_DECREF(restuple);
5744 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 Py_INCREF(resunicode);
5747 Py_DECREF(restuple);
5748 return resunicode;
5749}
5750
Alexander Belopolsky40018472011-02-26 01:02:56 +00005751static PyObject *
5752unicode_encode_ucs1(const Py_UNICODE *p,
5753 Py_ssize_t size,
5754 const char *errors,
5755 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756{
5757 /* output object */
5758 PyObject *res;
5759 /* pointers to the beginning and end+1 of input */
5760 const Py_UNICODE *startp = p;
5761 const Py_UNICODE *endp = p + size;
5762 /* pointer to the beginning of the unencodable characters */
5763 /* const Py_UNICODE *badp = NULL; */
5764 /* pointer into the output */
5765 char *str;
5766 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005767 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005768 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5769 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 PyObject *errorHandler = NULL;
5771 PyObject *exc = NULL;
5772 /* the following variable is used for caching string comparisons
5773 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5774 int known_errorHandler = -1;
5775
5776 /* allocate enough for a simple encoding without
5777 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005779 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005780 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005782 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005783 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 ressize = size;
5785
5786 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 /* can we encode this? */
5790 if (c<limit) {
5791 /* no overflow check, because we know that the space is enough */
5792 *str++ = (char)c;
5793 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 else {
5796 Py_ssize_t unicodepos = p-startp;
5797 Py_ssize_t requiredsize;
5798 PyObject *repunicode;
5799 Py_ssize_t repsize;
5800 Py_ssize_t newpos;
5801 Py_ssize_t respos;
5802 Py_UNICODE *uni2;
5803 /* startpos for collecting unencodable chars */
5804 const Py_UNICODE *collstart = p;
5805 const Py_UNICODE *collend = p;
5806 /* find all unecodable characters */
5807 while ((collend < endp) && ((*collend)>=limit))
5808 ++collend;
5809 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5810 if (known_errorHandler==-1) {
5811 if ((errors==NULL) || (!strcmp(errors, "strict")))
5812 known_errorHandler = 1;
5813 else if (!strcmp(errors, "replace"))
5814 known_errorHandler = 2;
5815 else if (!strcmp(errors, "ignore"))
5816 known_errorHandler = 3;
5817 else if (!strcmp(errors, "xmlcharrefreplace"))
5818 known_errorHandler = 4;
5819 else
5820 known_errorHandler = 0;
5821 }
5822 switch (known_errorHandler) {
5823 case 1: /* strict */
5824 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5825 goto onError;
5826 case 2: /* replace */
5827 while (collstart++<collend)
5828 *str++ = '?'; /* fall through */
5829 case 3: /* ignore */
5830 p = collend;
5831 break;
5832 case 4: /* xmlcharrefreplace */
5833 respos = str - PyBytes_AS_STRING(res);
5834 /* determine replacement size (temporarily (mis)uses p) */
5835 for (p = collstart, repsize = 0; p < collend; ++p) {
5836 if (*p<10)
5837 repsize += 2+1+1;
5838 else if (*p<100)
5839 repsize += 2+2+1;
5840 else if (*p<1000)
5841 repsize += 2+3+1;
5842 else if (*p<10000)
5843 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005844#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 else
5846 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005847#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 else if (*p<100000)
5849 repsize += 2+5+1;
5850 else if (*p<1000000)
5851 repsize += 2+6+1;
5852 else
5853 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005854#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 }
5856 requiredsize = respos+repsize+(endp-collend);
5857 if (requiredsize > ressize) {
5858 if (requiredsize<2*ressize)
5859 requiredsize = 2*ressize;
5860 if (_PyBytes_Resize(&res, requiredsize))
5861 goto onError;
5862 str = PyBytes_AS_STRING(res) + respos;
5863 ressize = requiredsize;
5864 }
5865 /* generate replacement (temporarily (mis)uses p) */
5866 for (p = collstart; p < collend; ++p) {
5867 str += sprintf(str, "&#%d;", (int)*p);
5868 }
5869 p = collend;
5870 break;
5871 default:
5872 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5873 encoding, reason, startp, size, &exc,
5874 collstart-startp, collend-startp, &newpos);
5875 if (repunicode == NULL)
5876 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005877 if (PyBytes_Check(repunicode)) {
5878 /* Directly copy bytes result to output. */
5879 repsize = PyBytes_Size(repunicode);
5880 if (repsize > 1) {
5881 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005882 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005883 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5884 Py_DECREF(repunicode);
5885 goto onError;
5886 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005887 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005888 ressize += repsize-1;
5889 }
5890 memcpy(str, PyBytes_AsString(repunicode), repsize);
5891 str += repsize;
5892 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005893 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005894 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 /* need more space? (at least enough for what we
5897 have+the replacement+the rest of the string, so
5898 we won't have to check space for encodable characters) */
5899 respos = str - PyBytes_AS_STRING(res);
5900 repsize = PyUnicode_GET_SIZE(repunicode);
5901 requiredsize = respos+repsize+(endp-collend);
5902 if (requiredsize > ressize) {
5903 if (requiredsize<2*ressize)
5904 requiredsize = 2*ressize;
5905 if (_PyBytes_Resize(&res, requiredsize)) {
5906 Py_DECREF(repunicode);
5907 goto onError;
5908 }
5909 str = PyBytes_AS_STRING(res) + respos;
5910 ressize = requiredsize;
5911 }
5912 /* check if there is anything unencodable in the replacement
5913 and copy it to the output */
5914 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5915 c = *uni2;
5916 if (c >= limit) {
5917 raise_encode_exception(&exc, encoding, startp, size,
5918 unicodepos, unicodepos+1, reason);
5919 Py_DECREF(repunicode);
5920 goto onError;
5921 }
5922 *str = (char)c;
5923 }
5924 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005925 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005927 }
5928 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 /* Resize if we allocated to much */
5930 size = str - PyBytes_AS_STRING(res);
5931 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005932 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005933 if (_PyBytes_Resize(&res, size) < 0)
5934 goto onError;
5935 }
5936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005937 Py_XDECREF(errorHandler);
5938 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005939 return res;
5940
5941 onError:
5942 Py_XDECREF(res);
5943 Py_XDECREF(errorHandler);
5944 Py_XDECREF(exc);
5945 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946}
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
5949PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5950 Py_ssize_t size,
5951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954}
5955
Alexander Belopolsky40018472011-02-26 01:02:56 +00005956PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005957_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958{
5959 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 PyErr_BadArgument();
5961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005963 if (PyUnicode_READY(unicode) == -1)
5964 return NULL;
5965 /* Fast path: if it is a one-byte string, construct
5966 bytes object directly. */
5967 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5968 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5969 PyUnicode_GET_LENGTH(unicode));
5970 /* Non-Latin-1 characters present. Defer to above function to
5971 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005974 errors);
5975}
5976
5977PyObject*
5978PyUnicode_AsLatin1String(PyObject *unicode)
5979{
5980 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981}
5982
5983/* --- 7-bit ASCII Codec -------------------------------------------------- */
5984
Alexander Belopolsky40018472011-02-26 01:02:56 +00005985PyObject *
5986PyUnicode_DecodeASCII(const char *s,
5987 Py_ssize_t size,
5988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 PyUnicodeObject *v;
5992 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005993 Py_ssize_t startinpos;
5994 Py_ssize_t endinpos;
5995 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 PyObject *errorHandler = NULL;
5999 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006000 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006003 if (size == 1 && *(unsigned char*)s < 128)
6004 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6005
6006 /* Fast path. Assume the input actually *is* ASCII, and allocate
6007 a single-block Unicode object with that assumption. If there is
6008 an error, drop the object and start over. */
6009 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6010 if (v == NULL)
6011 goto onError;
6012 d = PyUnicode_1BYTE_DATA(v);
6013 for (i = 0; i < size; i++) {
6014 unsigned char ch = ((unsigned char*)s)[i];
6015 if (ch < 128)
6016 d[i] = ch;
6017 else
6018 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006020 if (i == size)
6021 return (PyObject*)v;
6022 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 v = _PyUnicode_New(size);
6025 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 e = s + size;
6031 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 register unsigned char c = (unsigned char)*s;
6033 if (c < 128) {
6034 *p++ = c;
6035 ++s;
6036 }
6037 else {
6038 startinpos = s-starts;
6039 endinpos = startinpos + 1;
6040 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6041 if (unicode_decode_call_errorhandler(
6042 errors, &errorHandler,
6043 "ascii", "ordinal not in range(128)",
6044 &starts, &e, &startinpos, &endinpos, &exc, &s,
6045 &v, &outpos, &p))
6046 goto onError;
6047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006049 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6051 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 Py_XDECREF(errorHandler);
6053 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006054 if (PyUnicode_READY(v) == -1) {
6055 Py_DECREF(v);
6056 return NULL;
6057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006059
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062 Py_XDECREF(errorHandler);
6063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 return NULL;
6065}
6066
Alexander Belopolsky40018472011-02-26 01:02:56 +00006067PyObject *
6068PyUnicode_EncodeASCII(const Py_UNICODE *p,
6069 Py_ssize_t size,
6070 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073}
6074
Alexander Belopolsky40018472011-02-26 01:02:56 +00006075PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006076_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077{
6078 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 PyErr_BadArgument();
6080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082 if (PyUnicode_READY(unicode) == -1)
6083 return NULL;
6084 /* Fast path: if it is an ASCII-only string, construct bytes object
6085 directly. Else defer to above function to raise the exception. */
6086 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6087 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6088 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006091 errors);
6092}
6093
6094PyObject *
6095PyUnicode_AsASCIIString(PyObject *unicode)
6096{
6097 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Victor Stinner99b95382011-07-04 14:23:54 +02006100#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006101
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006102/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006103
/* When size_t is wider than int, the public MBCS entry points feed
   their int-sized helpers in pieces of at most INT_MAX and retry. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
6107
6108/* XXX This code is limited to "true" double-byte encodings, as
6109 a) it assumes an incomplete character consists of a single byte, and
6110 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006112
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must pass IsDBCSLeadByte().  It then counts as a lead byte
   when CharPrev() cannot step back from it, when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6124
6125/*
6126 * Decode MBCS string into unicode object. If 'final' is set, converts
6127 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6128 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006129static int
6130decode_mbcs(PyUnicodeObject **v,
6131 const char *s, /* MBCS string */
6132 int size, /* sizeof MBCS string */
6133 int final,
6134 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006135{
6136 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006137 Py_ssize_t n;
6138 DWORD usize;
6139 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006140
6141 assert(size >= 0);
6142
Victor Stinner554f3f02010-06-16 23:33:54 +00006143 /* check and handle 'errors' arg */
6144 if (errors==NULL || strcmp(errors, "strict")==0)
6145 flags = MB_ERR_INVALID_CHARS;
6146 else if (strcmp(errors, "ignore")==0)
6147 flags = 0;
6148 else {
6149 PyErr_Format(PyExc_ValueError,
6150 "mbcs encoding does not support errors='%s'",
6151 errors);
6152 return -1;
6153 }
6154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006155 /* Skip trailing lead-byte unless 'final' is set */
6156 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006158
6159 /* First get the size of the result */
6160 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006161 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6162 if (usize==0)
6163 goto mbcs_decode_error;
6164 } else
6165 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006166
6167 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* Create unicode object */
6169 *v = _PyUnicode_New(usize);
6170 if (*v == NULL)
6171 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006172 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006173 }
6174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 /* Extend unicode object */
6176 n = PyUnicode_GET_SIZE(*v);
6177 if (_PyUnicode_Resize(v, n + usize) < 0)
6178 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006179 }
6180
6181 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006182 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006184 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6185 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006187 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006188 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006189
6190mbcs_decode_error:
6191 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6192 we raise a UnicodeDecodeError - else it is a 'generic'
6193 windows error
6194 */
6195 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6196 /* Ideally, we should get reason from FormatMessage - this
6197 is the Windows 2000 English version of the message
6198 */
6199 PyObject *exc = NULL;
6200 const char *reason = "No mapping for the Unicode character exists "
6201 "in the target multi-byte code page.";
6202 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6203 if (exc != NULL) {
6204 PyCodec_StrictErrors(exc);
6205 Py_DECREF(exc);
6206 }
6207 } else {
6208 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6209 }
6210 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006211}
6212
Alexander Belopolsky40018472011-02-26 01:02:56 +00006213PyObject *
6214PyUnicode_DecodeMBCSStateful(const char *s,
6215 Py_ssize_t size,
6216 const char *errors,
6217 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006218{
6219 PyUnicodeObject *v = NULL;
6220 int done;
6221
6222 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006224
6225#ifdef NEED_RETRY
6226 retry:
6227 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006228 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006229 else
6230#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006231 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006232
6233 if (done < 0) {
6234 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236 }
6237
6238 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006240
6241#ifdef NEED_RETRY
6242 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 s += done;
6244 size -= done;
6245 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006246 }
6247#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006248 if (PyUnicode_READY(v) == -1) {
6249 Py_DECREF(v);
6250 return NULL;
6251 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006252 return (PyObject *)v;
6253}
6254
Alexander Belopolsky40018472011-02-26 01:02:56 +00006255PyObject *
6256PyUnicode_DecodeMBCS(const char *s,
6257 Py_ssize_t size,
6258 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006259{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006260 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6261}
6262
6263/*
6264 * Convert unicode into string object (MBCS).
6265 * Returns 0 if succeed, -1 otherwise.
6266 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267static int
6268encode_mbcs(PyObject **repr,
6269 const Py_UNICODE *p, /* unicode */
6270 int size, /* size of unicode */
6271 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006272{
Victor Stinner554f3f02010-06-16 23:33:54 +00006273 BOOL usedDefaultChar = FALSE;
6274 BOOL *pusedDefaultChar;
6275 int mbcssize;
6276 Py_ssize_t n;
6277 PyObject *exc = NULL;
6278 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006279
6280 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006281
Victor Stinner554f3f02010-06-16 23:33:54 +00006282 /* check and handle 'errors' arg */
6283 if (errors==NULL || strcmp(errors, "strict")==0) {
6284 flags = WC_NO_BEST_FIT_CHARS;
6285 pusedDefaultChar = &usedDefaultChar;
6286 } else if (strcmp(errors, "replace")==0) {
6287 flags = 0;
6288 pusedDefaultChar = NULL;
6289 } else {
6290 PyErr_Format(PyExc_ValueError,
6291 "mbcs encoding does not support errors='%s'",
6292 errors);
6293 return -1;
6294 }
6295
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006296 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006297 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006298 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6299 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 if (mbcssize == 0) {
6301 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6302 return -1;
6303 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006304 /* If we used a default char, then we failed! */
6305 if (pusedDefaultChar && *pusedDefaultChar)
6306 goto mbcs_encode_error;
6307 } else {
6308 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006309 }
6310
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006311 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 /* Create string object */
6313 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6314 if (*repr == NULL)
6315 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006316 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006317 }
6318 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 /* Extend string object */
6320 n = PyBytes_Size(*repr);
6321 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6322 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006323 }
6324
6325 /* Do the conversion */
6326 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006328 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6329 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6331 return -1;
6332 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006333 if (pusedDefaultChar && *pusedDefaultChar)
6334 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006335 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006336 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006337
6338mbcs_encode_error:
6339 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6340 Py_XDECREF(exc);
6341 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006342}
6343
Alexander Belopolsky40018472011-02-26 01:02:56 +00006344PyObject *
6345PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6346 Py_ssize_t size,
6347 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006348{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006349 PyObject *repr = NULL;
6350 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006351
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006352#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006354 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006355 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 else
6357#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006358 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006360 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 Py_XDECREF(repr);
6362 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006363 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006364
6365#ifdef NEED_RETRY
6366 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 p += INT_MAX;
6368 size -= INT_MAX;
6369 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006370 }
6371#endif
6372
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006373 return repr;
6374}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006375
Alexander Belopolsky40018472011-02-26 01:02:56 +00006376PyObject *
6377PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006378{
6379 if (!PyUnicode_Check(unicode)) {
6380 PyErr_BadArgument();
6381 return NULL;
6382 }
6383 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 PyUnicode_GET_SIZE(unicode),
6385 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006386}
6387
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006388#undef NEED_RETRY
6389
Victor Stinner99b95382011-07-04 14:23:54 +02006390#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392/* --- Character Mapping Codec -------------------------------------------- */
6393
Alexander Belopolsky40018472011-02-26 01:02:56 +00006394PyObject *
6395PyUnicode_DecodeCharmap(const char *s,
6396 Py_ssize_t size,
6397 PyObject *mapping,
6398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006401 Py_ssize_t startinpos;
6402 Py_ssize_t endinpos;
6403 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 PyUnicodeObject *v;
6406 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 PyObject *errorHandler = NULL;
6409 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006410 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006411 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 /* Default to Latin-1 */
6414 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
6417 v = _PyUnicode_New(size);
6418 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006424 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 mapstring = PyUnicode_AS_UNICODE(mapping);
6426 maplen = PyUnicode_GET_SIZE(mapping);
6427 while (s < e) {
6428 unsigned char ch = *s;
6429 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 if (ch < maplen)
6432 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 if (x == 0xfffe) {
6435 /* undefined mapping */
6436 outpos = p-PyUnicode_AS_UNICODE(v);
6437 startinpos = s-starts;
6438 endinpos = startinpos+1;
6439 if (unicode_decode_call_errorhandler(
6440 errors, &errorHandler,
6441 "charmap", "character maps to <undefined>",
6442 &starts, &e, &startinpos, &endinpos, &exc, &s,
6443 &v, &outpos, &p)) {
6444 goto onError;
6445 }
6446 continue;
6447 }
6448 *p++ = x;
6449 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006450 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006451 }
6452 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 while (s < e) {
6454 unsigned char ch = *s;
6455 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006456
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6458 w = PyLong_FromLong((long)ch);
6459 if (w == NULL)
6460 goto onError;
6461 x = PyObject_GetItem(mapping, w);
6462 Py_DECREF(w);
6463 if (x == NULL) {
6464 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6465 /* No mapping found means: mapping is undefined. */
6466 PyErr_Clear();
6467 x = Py_None;
6468 Py_INCREF(x);
6469 } else
6470 goto onError;
6471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006472
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 /* Apply mapping */
6474 if (PyLong_Check(x)) {
6475 long value = PyLong_AS_LONG(x);
6476 if (value < 0 || value > 65535) {
6477 PyErr_SetString(PyExc_TypeError,
6478 "character mapping must be in range(65536)");
6479 Py_DECREF(x);
6480 goto onError;
6481 }
6482 *p++ = (Py_UNICODE)value;
6483 }
6484 else if (x == Py_None) {
6485 /* undefined mapping */
6486 outpos = p-PyUnicode_AS_UNICODE(v);
6487 startinpos = s-starts;
6488 endinpos = startinpos+1;
6489 if (unicode_decode_call_errorhandler(
6490 errors, &errorHandler,
6491 "charmap", "character maps to <undefined>",
6492 &starts, &e, &startinpos, &endinpos, &exc, &s,
6493 &v, &outpos, &p)) {
6494 Py_DECREF(x);
6495 goto onError;
6496 }
6497 Py_DECREF(x);
6498 continue;
6499 }
6500 else if (PyUnicode_Check(x)) {
6501 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 if (targetsize == 1)
6504 /* 1-1 mapping */
6505 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006506
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 else if (targetsize > 1) {
6508 /* 1-n mapping */
6509 if (targetsize > extrachars) {
6510 /* resize first */
6511 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6512 Py_ssize_t needed = (targetsize - extrachars) + \
6513 (targetsize << 2);
6514 extrachars += needed;
6515 /* XXX overflow detection missing */
6516 if (_PyUnicode_Resize(&v,
6517 PyUnicode_GET_SIZE(v) + needed) < 0) {
6518 Py_DECREF(x);
6519 goto onError;
6520 }
6521 p = PyUnicode_AS_UNICODE(v) + oldpos;
6522 }
6523 Py_UNICODE_COPY(p,
6524 PyUnicode_AS_UNICODE(x),
6525 targetsize);
6526 p += targetsize;
6527 extrachars -= targetsize;
6528 }
6529 /* 1-0 mapping: skip the character */
6530 }
6531 else {
6532 /* wrong return value */
6533 PyErr_SetString(PyExc_TypeError,
6534 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006535 Py_DECREF(x);
6536 goto onError;
6537 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 Py_DECREF(x);
6539 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 }
6542 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6544 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 Py_XDECREF(errorHandler);
6546 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006547 if (PyUnicode_READY(v) == -1) {
6548 Py_DECREF(v);
6549 return NULL;
6550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 Py_XDECREF(errorHandler);
6555 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 Py_XDECREF(v);
6557 return NULL;
6558}
6559
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006560/* Charmap encoding: the lookup table */
6561
Alexander Belopolsky40018472011-02-26 01:02:56 +00006562struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 PyObject_HEAD
6564 unsigned char level1[32];
6565 int count2, count3;
6566 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006567};
6568
6569static PyObject*
6570encoding_map_size(PyObject *obj, PyObject* args)
6571{
6572 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006573 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006575}
6576
6577static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006578 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 PyDoc_STR("Return the size (in bytes) of this object") },
6580 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006581};
6582
6583static void
6584encoding_map_dealloc(PyObject* o)
6585{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006586 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006587}
6588
6589static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006590 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 "EncodingMap", /*tp_name*/
6592 sizeof(struct encoding_map), /*tp_basicsize*/
6593 0, /*tp_itemsize*/
6594 /* methods */
6595 encoding_map_dealloc, /*tp_dealloc*/
6596 0, /*tp_print*/
6597 0, /*tp_getattr*/
6598 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006599 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 0, /*tp_repr*/
6601 0, /*tp_as_number*/
6602 0, /*tp_as_sequence*/
6603 0, /*tp_as_mapping*/
6604 0, /*tp_hash*/
6605 0, /*tp_call*/
6606 0, /*tp_str*/
6607 0, /*tp_getattro*/
6608 0, /*tp_setattro*/
6609 0, /*tp_as_buffer*/
6610 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6611 0, /*tp_doc*/
6612 0, /*tp_traverse*/
6613 0, /*tp_clear*/
6614 0, /*tp_richcompare*/
6615 0, /*tp_weaklistoffset*/
6616 0, /*tp_iter*/
6617 0, /*tp_iternext*/
6618 encoding_map_methods, /*tp_methods*/
6619 0, /*tp_members*/
6620 0, /*tp_getset*/
6621 0, /*tp_base*/
6622 0, /*tp_dict*/
6623 0, /*tp_descr_get*/
6624 0, /*tp_descr_set*/
6625 0, /*tp_dictoffset*/
6626 0, /*tp_init*/
6627 0, /*tp_alloc*/
6628 0, /*tp_new*/
6629 0, /*tp_free*/
6630 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006631};
6632
6633PyObject*
6634PyUnicode_BuildEncodingMap(PyObject* string)
6635{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006636 PyObject *result;
6637 struct encoding_map *mresult;
6638 int i;
6639 int need_dict = 0;
6640 unsigned char level1[32];
6641 unsigned char level2[512];
6642 unsigned char *mlevel1, *mlevel2, *mlevel3;
6643 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644 int kind;
6645 void *data;
6646 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006648 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006649 PyErr_BadArgument();
6650 return NULL;
6651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006652 kind = PyUnicode_KIND(string);
6653 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006654 memset(level1, 0xFF, sizeof level1);
6655 memset(level2, 0xFF, sizeof level2);
6656
6657 /* If there isn't a one-to-one mapping of NULL to \0,
6658 or if there are non-BMP characters, we need to use
6659 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006661 need_dict = 1;
6662 for (i = 1; i < 256; i++) {
6663 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 ch = PyUnicode_READ(kind, data, i);
6665 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006666 need_dict = 1;
6667 break;
6668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006670 /* unmapped character */
6671 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672 l1 = ch >> 11;
6673 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006674 if (level1[l1] == 0xFF)
6675 level1[l1] = count2++;
6676 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006677 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006678 }
6679
6680 if (count2 >= 0xFF || count3 >= 0xFF)
6681 need_dict = 1;
6682
6683 if (need_dict) {
6684 PyObject *result = PyDict_New();
6685 PyObject *key, *value;
6686 if (!result)
6687 return NULL;
6688 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006689 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006690 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006691 if (!key || !value)
6692 goto failed1;
6693 if (PyDict_SetItem(result, key, value) == -1)
6694 goto failed1;
6695 Py_DECREF(key);
6696 Py_DECREF(value);
6697 }
6698 return result;
6699 failed1:
6700 Py_XDECREF(key);
6701 Py_XDECREF(value);
6702 Py_DECREF(result);
6703 return NULL;
6704 }
6705
6706 /* Create a three-level trie */
6707 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6708 16*count2 + 128*count3 - 1);
6709 if (!result)
6710 return PyErr_NoMemory();
6711 PyObject_Init(result, &EncodingMapType);
6712 mresult = (struct encoding_map*)result;
6713 mresult->count2 = count2;
6714 mresult->count3 = count3;
6715 mlevel1 = mresult->level1;
6716 mlevel2 = mresult->level23;
6717 mlevel3 = mresult->level23 + 16*count2;
6718 memcpy(mlevel1, level1, 32);
6719 memset(mlevel2, 0xFF, 16*count2);
6720 memset(mlevel3, 0, 128*count3);
6721 count3 = 0;
6722 for (i = 1; i < 256; i++) {
6723 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006724 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725 /* unmapped character */
6726 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006727 o1 = PyUnicode_READ(kind, data, i)>>11;
6728 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729 i2 = 16*mlevel1[o1] + o2;
6730 if (mlevel2[i2] == 0xFF)
6731 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006732 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006733 i3 = 128*mlevel2[i2] + o3;
6734 mlevel3[i3] = i;
6735 }
6736 return result;
6737}
6738
6739static int
6740encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6741{
6742 struct encoding_map *map = (struct encoding_map*)mapping;
6743 int l1 = c>>11;
6744 int l2 = (c>>7) & 0xF;
6745 int l3 = c & 0x7F;
6746 int i;
6747
6748#ifdef Py_UNICODE_WIDE
6749 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006751 }
6752#endif
6753 if (c == 0)
6754 return 0;
6755 /* level 1*/
6756 i = map->level1[l1];
6757 if (i == 0xFF) {
6758 return -1;
6759 }
6760 /* level 2*/
6761 i = map->level23[16*i+l2];
6762 if (i == 0xFF) {
6763 return -1;
6764 }
6765 /* level 3 */
6766 i = map->level23[16*map->count2 + 128*i + l3];
6767 if (i == 0) {
6768 return -1;
6769 }
6770 return i;
6771}
6772
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773/* Lookup the character ch in the mapping. If the character
6774 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006775 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006776static PyObject *
6777charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
Christian Heimes217cfd12007-12-02 14:31:20 +00006779 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 PyObject *x;
6781
6782 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 x = PyObject_GetItem(mapping, w);
6785 Py_DECREF(w);
6786 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6788 /* No mapping found means: mapping is undefined. */
6789 PyErr_Clear();
6790 x = Py_None;
6791 Py_INCREF(x);
6792 return x;
6793 } else
6794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006796 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006798 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 long value = PyLong_AS_LONG(x);
6800 if (value < 0 || value > 255) {
6801 PyErr_SetString(PyExc_TypeError,
6802 "character mapping must be in range(256)");
6803 Py_DECREF(x);
6804 return NULL;
6805 }
6806 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006808 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 /* wrong return value */
6812 PyErr_Format(PyExc_TypeError,
6813 "character mapping must return integer, bytes or None, not %.400s",
6814 x->ob_type->tp_name);
6815 Py_DECREF(x);
6816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 }
6818}
6819
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006820static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006821charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006823 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6824 /* exponentially overallocate to minimize reallocations */
6825 if (requiredsize < 2*outsize)
6826 requiredsize = 2*outsize;
6827 if (_PyBytes_Resize(outobj, requiredsize))
6828 return -1;
6829 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006830}
6831
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006836 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 space is available. Return a new reference to the object that
6838 was put in the output buffer, or Py_None, if the mapping was undefined
6839 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006840 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006841static charmapencode_result
6842charmapencode_output(Py_UNICODE c, PyObject *mapping,
6843 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006845 PyObject *rep;
6846 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006847 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848
Christian Heimes90aa7642007-12-19 02:45:37 +00006849 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006850 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006852 if (res == -1)
6853 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 if (outsize<requiredsize)
6855 if (charmapencode_resize(outobj, outpos, requiredsize))
6856 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006857 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 outstart[(*outpos)++] = (char)res;
6859 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006860 }
6861
6862 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006865 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 Py_DECREF(rep);
6867 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006868 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 if (PyLong_Check(rep)) {
6870 Py_ssize_t requiredsize = *outpos+1;
6871 if (outsize<requiredsize)
6872 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6873 Py_DECREF(rep);
6874 return enc_EXCEPTION;
6875 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006876 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006878 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 else {
6880 const char *repchars = PyBytes_AS_STRING(rep);
6881 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6882 Py_ssize_t requiredsize = *outpos+repsize;
6883 if (outsize<requiredsize)
6884 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6885 Py_DECREF(rep);
6886 return enc_EXCEPTION;
6887 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006888 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 memcpy(outstart + *outpos, repchars, repsize);
6890 *outpos += repsize;
6891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006893 Py_DECREF(rep);
6894 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895}
6896
6897/* handle an error in PyUnicode_EncodeCharmap
6898 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899static int
6900charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006901 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006903 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006904 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905{
6906 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006907 Py_ssize_t repsize;
6908 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 Py_UNICODE *uni2;
6910 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006911 Py_ssize_t collstartpos = *inpos;
6912 Py_ssize_t collendpos = *inpos+1;
6913 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 char *encoding = "charmap";
6915 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006918 /* find all unencodable characters */
6919 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006920 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006921 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 int res = encoding_map_lookup(p[collendpos], mapping);
6923 if (res != -1)
6924 break;
6925 ++collendpos;
6926 continue;
6927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 rep = charmapencode_lookup(p[collendpos], mapping);
6930 if (rep==NULL)
6931 return -1;
6932 else if (rep!=Py_None) {
6933 Py_DECREF(rep);
6934 break;
6935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006936 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 }
6939 /* cache callback name lookup
6940 * (if not done yet, i.e. it's the first error) */
6941 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 if ((errors==NULL) || (!strcmp(errors, "strict")))
6943 *known_errorHandler = 1;
6944 else if (!strcmp(errors, "replace"))
6945 *known_errorHandler = 2;
6946 else if (!strcmp(errors, "ignore"))
6947 *known_errorHandler = 3;
6948 else if (!strcmp(errors, "xmlcharrefreplace"))
6949 *known_errorHandler = 4;
6950 else
6951 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 }
6953 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006954 case 1: /* strict */
6955 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6956 return -1;
6957 case 2: /* replace */
6958 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 x = charmapencode_output('?', mapping, res, respos);
6960 if (x==enc_EXCEPTION) {
6961 return -1;
6962 }
6963 else if (x==enc_FAILED) {
6964 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6965 return -1;
6966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006967 }
6968 /* fall through */
6969 case 3: /* ignore */
6970 *inpos = collendpos;
6971 break;
6972 case 4: /* xmlcharrefreplace */
6973 /* generate replacement (temporarily (mis)uses p) */
6974 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 char buffer[2+29+1+1];
6976 char *cp;
6977 sprintf(buffer, "&#%d;", (int)p[collpos]);
6978 for (cp = buffer; *cp; ++cp) {
6979 x = charmapencode_output(*cp, mapping, res, respos);
6980 if (x==enc_EXCEPTION)
6981 return -1;
6982 else if (x==enc_FAILED) {
6983 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6984 return -1;
6985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986 }
6987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988 *inpos = collendpos;
6989 break;
6990 default:
6991 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 encoding, reason, p, size, exceptionObject,
6993 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006996 if (PyBytes_Check(repunicode)) {
6997 /* Directly copy bytes result to output. */
6998 Py_ssize_t outsize = PyBytes_Size(*res);
6999 Py_ssize_t requiredsize;
7000 repsize = PyBytes_Size(repunicode);
7001 requiredsize = *respos + repsize;
7002 if (requiredsize > outsize)
7003 /* Make room for all additional bytes. */
7004 if (charmapencode_resize(res, respos, requiredsize)) {
7005 Py_DECREF(repunicode);
7006 return -1;
7007 }
7008 memcpy(PyBytes_AsString(*res) + *respos,
7009 PyBytes_AsString(repunicode), repsize);
7010 *respos += repsize;
7011 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007012 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007013 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007015 /* generate replacement */
7016 repsize = PyUnicode_GET_SIZE(repunicode);
7017 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 x = charmapencode_output(*uni2, mapping, res, respos);
7019 if (x==enc_EXCEPTION) {
7020 return -1;
7021 }
7022 else if (x==enc_FAILED) {
7023 Py_DECREF(repunicode);
7024 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7025 return -1;
7026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027 }
7028 *inpos = newpos;
7029 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030 }
7031 return 0;
7032}
7033
Alexander Belopolsky40018472011-02-26 01:02:56 +00007034PyObject *
7035PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7036 Py_ssize_t size,
7037 PyObject *mapping,
7038 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007040 /* output object */
7041 PyObject *res = NULL;
7042 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007043 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007045 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007046 PyObject *errorHandler = NULL;
7047 PyObject *exc = NULL;
7048 /* the following variable is used for caching string comparisons
7049 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7050 * 3=ignore, 4=xmlcharrefreplace */
7051 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052
7053 /* Default to Latin-1 */
7054 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057 /* allocate enough for a simple encoding without
7058 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007059 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 if (res == NULL)
7061 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007062 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 /* try to encode it */
7067 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7068 if (x==enc_EXCEPTION) /* error */
7069 goto onError;
7070 if (x==enc_FAILED) { /* unencodable character */
7071 if (charmap_encoding_error(p, size, &inpos, mapping,
7072 &exc,
7073 &known_errorHandler, &errorHandler, errors,
7074 &res, &respos)) {
7075 goto onError;
7076 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007077 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 else
7079 /* done with this character => adjust input position */
7080 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007083 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007084 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007085 if (_PyBytes_Resize(&res, respos) < 0)
7086 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 Py_XDECREF(exc);
7089 Py_XDECREF(errorHandler);
7090 return res;
7091
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007093 Py_XDECREF(res);
7094 Py_XDECREF(exc);
7095 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 return NULL;
7097}
7098
Alexander Belopolsky40018472011-02-26 01:02:56 +00007099PyObject *
7100PyUnicode_AsCharmapString(PyObject *unicode,
7101 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
7103 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 PyErr_BadArgument();
7105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 }
7107 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 PyUnicode_GET_SIZE(unicode),
7109 mapping,
7110 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007113/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114static void
7115make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007116 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007117 Py_ssize_t startpos, Py_ssize_t endpos,
7118 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007121 *exceptionObject = _PyUnicodeTranslateError_Create(
7122 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 }
7124 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7126 goto onError;
7127 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7128 goto onError;
7129 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7130 goto onError;
7131 return;
7132 onError:
7133 Py_DECREF(*exceptionObject);
7134 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 }
7136}
7137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007139static void
7140raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007141 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007142 Py_ssize_t startpos, Py_ssize_t endpos,
7143 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007144{
7145 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007146 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007149}
7150
7151/* error handling callback helper:
7152 build arguments, call the callback and check the arguments,
7153 put the result into newpos and return the replacement string, which
7154 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155static PyObject *
7156unicode_translate_call_errorhandler(const char *errors,
7157 PyObject **errorHandler,
7158 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007159 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160 Py_ssize_t startpos, Py_ssize_t endpos,
7161 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007163 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007165 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007166 PyObject *restuple;
7167 PyObject *resunicode;
7168
7169 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 }
7174
7175 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007176 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179
7180 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007185 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 Py_DECREF(restuple);
7187 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188 }
7189 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 &resunicode, &i_newpos)) {
7191 Py_DECREF(restuple);
7192 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007195 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007196 else
7197 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007198 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7200 Py_DECREF(restuple);
7201 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007202 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007203 Py_INCREF(resunicode);
7204 Py_DECREF(restuple);
7205 return resunicode;
7206}
7207
7208/* Lookup the character ch in the mapping and put the result in result,
7209 which must be decrefed by the caller.
7210 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007211static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213{
Christian Heimes217cfd12007-12-02 14:31:20 +00007214 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 PyObject *x;
7216
7217 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007219 x = PyObject_GetItem(mapping, w);
7220 Py_DECREF(w);
7221 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7223 /* No mapping found means: use 1:1 mapping. */
7224 PyErr_Clear();
7225 *result = NULL;
7226 return 0;
7227 } else
7228 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 }
7230 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 *result = x;
7232 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007234 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 long value = PyLong_AS_LONG(x);
7236 long max = PyUnicode_GetMax();
7237 if (value < 0 || value > max) {
7238 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007239 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 Py_DECREF(x);
7241 return -1;
7242 }
7243 *result = x;
7244 return 0;
7245 }
7246 else if (PyUnicode_Check(x)) {
7247 *result = x;
7248 return 0;
7249 }
7250 else {
7251 /* wrong return value */
7252 PyErr_SetString(PyExc_TypeError,
7253 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007254 Py_DECREF(x);
7255 return -1;
7256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257}
7258/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 if not reallocate and adjust various state variables.
7260 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007266 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 /* exponentially overallocate to minimize reallocations */
7268 if (requiredsize < 2 * oldsize)
7269 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7271 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 }
7275 return 0;
7276}
7277/* lookup the character, put the result in the output string and adjust
7278 various state variables. Return a new reference to the object that
7279 was put in the output buffer in *result, or Py_None, if the mapping was
7280 undefined (in which case no character was written).
7281 The called must decref result.
7282 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007283static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007284charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7285 PyObject *mapping, Py_UCS4 **output,
7286 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007287 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007289 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7290 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007294 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007295 }
7296 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007298 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007300 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007301 }
7302 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007303 Py_ssize_t repsize;
7304 if (PyUnicode_READY(*res) == -1)
7305 return -1;
7306 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if (repsize==1) {
7308 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007309 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 }
7311 else if (repsize!=0) {
7312 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007313 Py_ssize_t requiredsize = *opos +
7314 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007316 Py_ssize_t i;
7317 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007319 for(i = 0; i < repsize; i++)
7320 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007322 }
7323 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325 return 0;
7326}
7327
Alexander Belopolsky40018472011-02-26 01:02:56 +00007328PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329_PyUnicode_TranslateCharmap(PyObject *input,
7330 PyObject *mapping,
7331 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 /* input object */
7334 char *idata;
7335 Py_ssize_t size, i;
7336 int kind;
7337 /* output buffer */
7338 Py_UCS4 *output = NULL;
7339 Py_ssize_t osize;
7340 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007342 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007343 char *reason = "character maps to <undefined>";
7344 PyObject *errorHandler = NULL;
7345 PyObject *exc = NULL;
7346 /* the following variable is used for caching string comparisons
7347 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7348 * 3=ignore, 4=xmlcharrefreplace */
7349 int known_errorHandler = -1;
7350
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 PyErr_BadArgument();
7353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007356 if (PyUnicode_READY(input) == -1)
7357 return NULL;
7358 idata = (char*)PyUnicode_DATA(input);
7359 kind = PyUnicode_KIND(input);
7360 size = PyUnicode_GET_LENGTH(input);
7361 i = 0;
7362
7363 if (size == 0) {
7364 Py_INCREF(input);
7365 return input;
7366 }
7367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368 /* allocate enough for a simple 1:1 translation without
7369 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007370 osize = size;
7371 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7372 opos = 0;
7373 if (output == NULL) {
7374 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007378 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 /* try to encode it */
7380 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381 if (charmaptranslate_output(input, i, mapping,
7382 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 Py_XDECREF(x);
7384 goto onError;
7385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007386 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 else { /* untranslatable character */
7390 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7391 Py_ssize_t repsize;
7392 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007395 Py_ssize_t collstart = i;
7396 Py_ssize_t collend = i+1;
7397 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 while (collend < size) {
7401 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 goto onError;
7403 Py_XDECREF(x);
7404 if (x!=Py_None)
7405 break;
7406 ++collend;
7407 }
7408 /* cache callback name lookup
7409 * (if not done yet, i.e. it's the first error) */
7410 if (known_errorHandler==-1) {
7411 if ((errors==NULL) || (!strcmp(errors, "strict")))
7412 known_errorHandler = 1;
7413 else if (!strcmp(errors, "replace"))
7414 known_errorHandler = 2;
7415 else if (!strcmp(errors, "ignore"))
7416 known_errorHandler = 3;
7417 else if (!strcmp(errors, "xmlcharrefreplace"))
7418 known_errorHandler = 4;
7419 else
7420 known_errorHandler = 0;
7421 }
7422 switch (known_errorHandler) {
7423 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007424 raise_translate_exception(&exc, input, collstart,
7425 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 case 2: /* replace */
7428 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 for (coll = collstart; coll<collend; coll++)
7430 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 /* fall through */
7432 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007433 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 break;
7435 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 /* generate replacement (temporarily (mis)uses i) */
7437 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 char buffer[2+29+1+1];
7439 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7441 if (charmaptranslate_makespace(&output, &osize,
7442 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 goto onError;
7444 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 break;
7449 default:
7450 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 reason, input, &exc,
7452 collstart, collend, &newpos);
7453 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 goto onError;
7455 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 repsize = PyUnicode_GET_LENGTH(repunicode);
7457 if (charmaptranslate_makespace(&output, &osize,
7458 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 Py_DECREF(repunicode);
7460 goto onError;
7461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007462 for (uni2 = 0; repsize-->0; ++uni2)
7463 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7464 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007467 }
7468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7470 if (!res)
7471 goto onError;
7472 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007473 Py_XDECREF(exc);
7474 Py_XDECREF(errorHandler);
7475 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007479 Py_XDECREF(exc);
7480 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 return NULL;
7482}
7483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484/* Deprecated. Use PyUnicode_Translate instead. */
7485PyObject *
7486PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7487 Py_ssize_t size,
7488 PyObject *mapping,
7489 const char *errors)
7490{
7491 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7492 if (!unicode)
7493 return NULL;
7494 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7495}
7496
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497PyObject *
7498PyUnicode_Translate(PyObject *str,
7499 PyObject *mapping,
7500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501{
7502 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007503
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 str = PyUnicode_FromObject(str);
7505 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 Py_DECREF(str);
7509 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007510
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 Py_XDECREF(str);
7513 return NULL;
7514}
Tim Petersced69f82003-09-16 20:30:58 +00007515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007516static Py_UCS4
7517fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7518{
7519 /* No need to call PyUnicode_READY(self) because this function is only
7520 called as a callback from fixup() which does it already. */
7521 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7522 const int kind = PyUnicode_KIND(self);
7523 void *data = PyUnicode_DATA(self);
7524 Py_UCS4 maxchar = 0, ch, fixed;
7525 Py_ssize_t i;
7526
7527 for (i = 0; i < len; ++i) {
7528 ch = PyUnicode_READ(kind, data, i);
7529 fixed = 0;
7530 if (ch > 127) {
7531 if (Py_UNICODE_ISSPACE(ch))
7532 fixed = ' ';
7533 else {
7534 const int decimal = Py_UNICODE_TODECIMAL(ch);
7535 if (decimal >= 0)
7536 fixed = '0' + decimal;
7537 }
7538 if (fixed != 0) {
7539 if (fixed > maxchar)
7540 maxchar = fixed;
7541 PyUnicode_WRITE(kind, data, i, fixed);
7542 }
7543 else if (ch > maxchar)
7544 maxchar = ch;
7545 }
7546 else if (ch > maxchar)
7547 maxchar = ch;
7548 }
7549
7550 return maxchar;
7551}
7552
7553PyObject *
7554_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7555{
7556 if (!PyUnicode_Check(unicode)) {
7557 PyErr_BadInternalCall();
7558 return NULL;
7559 }
7560 if (PyUnicode_READY(unicode) == -1)
7561 return NULL;
7562 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7563 /* If the string is already ASCII, just return the same string */
7564 Py_INCREF(unicode);
7565 return unicode;
7566 }
7567 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7568}
7569
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007570PyObject *
7571PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7572 Py_ssize_t length)
7573{
7574 PyObject *result;
7575 Py_UNICODE *p; /* write pointer into result */
7576 Py_ssize_t i;
7577 /* Copy to a new string */
7578 result = (PyObject *)_PyUnicode_New(length);
7579 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7580 if (result == NULL)
7581 return result;
7582 p = PyUnicode_AS_UNICODE(result);
7583 /* Iterate over code points */
7584 for (i = 0; i < length; i++) {
7585 Py_UNICODE ch =s[i];
7586 if (ch > 127) {
7587 int decimal = Py_UNICODE_TODECIMAL(ch);
7588 if (decimal >= 0)
7589 p[i] = '0' + decimal;
7590 }
7591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007592 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7593 Py_DECREF(result);
7594 return NULL;
7595 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007596 return result;
7597}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007598/* --- Decimal Encoder ---------------------------------------------------- */
7599
Alexander Belopolsky40018472011-02-26 01:02:56 +00007600int
7601PyUnicode_EncodeDecimal(Py_UNICODE *s,
7602 Py_ssize_t length,
7603 char *output,
7604 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007605{
7606 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007607 PyObject *errorHandler = NULL;
7608 PyObject *exc = NULL;
7609 const char *encoding = "decimal";
7610 const char *reason = "invalid decimal Unicode string";
7611 /* the following variable is used for caching string comparisons
7612 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7613 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007614
7615 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 PyErr_BadArgument();
7617 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007618 }
7619
7620 p = s;
7621 end = s + length;
7622 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 register Py_UNICODE ch = *p;
7624 int decimal;
7625 PyObject *repunicode;
7626 Py_ssize_t repsize;
7627 Py_ssize_t newpos;
7628 Py_UNICODE *uni2;
7629 Py_UNICODE *collstart;
7630 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007631
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 ++p;
7635 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 decimal = Py_UNICODE_TODECIMAL(ch);
7638 if (decimal >= 0) {
7639 *output++ = '0' + decimal;
7640 ++p;
7641 continue;
7642 }
7643 if (0 < ch && ch < 256) {
7644 *output++ = (char)ch;
7645 ++p;
7646 continue;
7647 }
7648 /* All other characters are considered unencodable */
7649 collstart = p;
7650 collend = p+1;
7651 while (collend < end) {
7652 if ((0 < *collend && *collend < 256) ||
7653 !Py_UNICODE_ISSPACE(*collend) ||
7654 Py_UNICODE_TODECIMAL(*collend))
7655 break;
7656 }
7657 /* cache callback name lookup
7658 * (if not done yet, i.e. it's the first error) */
7659 if (known_errorHandler==-1) {
7660 if ((errors==NULL) || (!strcmp(errors, "strict")))
7661 known_errorHandler = 1;
7662 else if (!strcmp(errors, "replace"))
7663 known_errorHandler = 2;
7664 else if (!strcmp(errors, "ignore"))
7665 known_errorHandler = 3;
7666 else if (!strcmp(errors, "xmlcharrefreplace"))
7667 known_errorHandler = 4;
7668 else
7669 known_errorHandler = 0;
7670 }
7671 switch (known_errorHandler) {
7672 case 1: /* strict */
7673 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7674 goto onError;
7675 case 2: /* replace */
7676 for (p = collstart; p < collend; ++p)
7677 *output++ = '?';
7678 /* fall through */
7679 case 3: /* ignore */
7680 p = collend;
7681 break;
7682 case 4: /* xmlcharrefreplace */
7683 /* generate replacement (temporarily (mis)uses p) */
7684 for (p = collstart; p < collend; ++p)
7685 output += sprintf(output, "&#%d;", (int)*p);
7686 p = collend;
7687 break;
7688 default:
7689 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7690 encoding, reason, s, length, &exc,
7691 collstart-s, collend-s, &newpos);
7692 if (repunicode == NULL)
7693 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007694 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007695 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007696 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7697 Py_DECREF(repunicode);
7698 goto onError;
7699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 /* generate replacement */
7701 repsize = PyUnicode_GET_SIZE(repunicode);
7702 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7703 Py_UNICODE ch = *uni2;
7704 if (Py_UNICODE_ISSPACE(ch))
7705 *output++ = ' ';
7706 else {
7707 decimal = Py_UNICODE_TODECIMAL(ch);
7708 if (decimal >= 0)
7709 *output++ = '0' + decimal;
7710 else if (0 < ch && ch < 256)
7711 *output++ = (char)ch;
7712 else {
7713 Py_DECREF(repunicode);
7714 raise_encode_exception(&exc, encoding,
7715 s, length, collstart-s, collend-s, reason);
7716 goto onError;
7717 }
7718 }
7719 }
7720 p = s + newpos;
7721 Py_DECREF(repunicode);
7722 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007723 }
7724 /* 0-terminate the output string */
7725 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007726 Py_XDECREF(exc);
7727 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007728 return 0;
7729
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731 Py_XDECREF(exc);
7732 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007733 return -1;
7734}
7735
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736/* --- Helpers ------------------------------------------------------------ */
7737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738#include "stringlib/ucs1lib.h"
7739#include "stringlib/fastsearch.h"
7740#include "stringlib/partition.h"
7741#include "stringlib/split.h"
7742#include "stringlib/count.h"
7743#include "stringlib/find.h"
7744#include "stringlib/localeutil.h"
7745#include "stringlib/undef.h"
7746
7747#include "stringlib/ucs2lib.h"
7748#include "stringlib/fastsearch.h"
7749#include "stringlib/partition.h"
7750#include "stringlib/split.h"
7751#include "stringlib/count.h"
7752#include "stringlib/find.h"
7753#include "stringlib/localeutil.h"
7754#include "stringlib/undef.h"
7755
7756#include "stringlib/ucs4lib.h"
7757#include "stringlib/fastsearch.h"
7758#include "stringlib/partition.h"
7759#include "stringlib/split.h"
7760#include "stringlib/count.h"
7761#include "stringlib/find.h"
7762#include "stringlib/localeutil.h"
7763#include "stringlib/undef.h"
7764
7765static Py_ssize_t
7766any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7767 const Py_UCS1*, Py_ssize_t,
7768 Py_ssize_t, Py_ssize_t),
7769 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7770 const Py_UCS2*, Py_ssize_t,
7771 Py_ssize_t, Py_ssize_t),
7772 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7773 const Py_UCS4*, Py_ssize_t,
7774 Py_ssize_t, Py_ssize_t),
7775 PyObject* s1, PyObject* s2,
7776 Py_ssize_t start,
7777 Py_ssize_t end)
7778{
7779 int kind1, kind2, kind;
7780 void *buf1, *buf2;
7781 Py_ssize_t len1, len2, result;
7782
7783 kind1 = PyUnicode_KIND(s1);
7784 kind2 = PyUnicode_KIND(s2);
7785 kind = kind1 > kind2 ? kind1 : kind2;
7786 buf1 = PyUnicode_DATA(s1);
7787 buf2 = PyUnicode_DATA(s2);
7788 if (kind1 != kind)
7789 buf1 = _PyUnicode_AsKind(s1, kind);
7790 if (!buf1)
7791 return -2;
7792 if (kind2 != kind)
7793 buf2 = _PyUnicode_AsKind(s2, kind);
7794 if (!buf2) {
7795 if (kind1 != kind) PyMem_Free(buf1);
7796 return -2;
7797 }
7798 len1 = PyUnicode_GET_LENGTH(s1);
7799 len2 = PyUnicode_GET_LENGTH(s2);
7800
7801 switch(kind) {
7802 case PyUnicode_1BYTE_KIND:
7803 result = ucs1(buf1, len1, buf2, len2, start, end);
7804 break;
7805 case PyUnicode_2BYTE_KIND:
7806 result = ucs2(buf1, len1, buf2, len2, start, end);
7807 break;
7808 case PyUnicode_4BYTE_KIND:
7809 result = ucs4(buf1, len1, buf2, len2, start, end);
7810 break;
7811 default:
7812 assert(0); result = -2;
7813 }
7814
7815 if (kind1 != kind)
7816 PyMem_Free(buf1);
7817 if (kind2 != kind)
7818 PyMem_Free(buf2);
7819
7820 return result;
7821}
7822
7823Py_ssize_t
7824_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7825 Py_ssize_t n_buffer,
7826 void *digits, Py_ssize_t n_digits,
7827 Py_ssize_t min_width,
7828 const char *grouping,
7829 const char *thousands_sep)
7830{
7831 switch(kind) {
7832 case PyUnicode_1BYTE_KIND:
7833 return _PyUnicode_ucs1_InsertThousandsGrouping(
7834 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7835 min_width, grouping, thousands_sep);
7836 case PyUnicode_2BYTE_KIND:
7837 return _PyUnicode_ucs2_InsertThousandsGrouping(
7838 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7839 min_width, grouping, thousands_sep);
7840 case PyUnicode_4BYTE_KIND:
7841 return _PyUnicode_ucs4_InsertThousandsGrouping(
7842 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7843 min_width, grouping, thousands_sep);
7844 }
7845 assert(0);
7846 return -1;
7847}
7848
7849
Eric Smith8c663262007-08-25 02:26:07 +00007850#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007851#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007852
Thomas Wouters477c8d52006-05-27 19:21:47 +00007853#include "stringlib/count.h"
7854#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007855
/* Helper macro to fix up start/end slice values against a length `len`:
   an `end` beyond `len` is clipped to `len`, negative `start`/`end` values
   are taken relative to `len`, and anything still negative is clamped to 0.

   `start` and `end` must be modifiable lvalues.  The body is wrapped in
   do { } while (0) so that the macro behaves as a single statement (e.g.
   under an unbraced `if`), and every argument is parenthesized so that
   expression arguments expand safely.  Use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007870
Alexander Belopolsky40018472011-02-26 01:02:56 +00007871Py_ssize_t
7872PyUnicode_Count(PyObject *str,
7873 PyObject *substr,
7874 Py_ssize_t start,
7875 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007877 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007878 PyUnicodeObject* str_obj;
7879 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007880 int kind1, kind2, kind;
7881 void *buf1 = NULL, *buf2 = NULL;
7882 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007883
Thomas Wouters477c8d52006-05-27 19:21:47 +00007884 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007887 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 Py_DECREF(str_obj);
7890 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 }
Tim Petersced69f82003-09-16 20:30:58 +00007892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 kind1 = PyUnicode_KIND(str_obj);
7894 kind2 = PyUnicode_KIND(sub_obj);
7895 kind = kind1 > kind2 ? kind1 : kind2;
7896 buf1 = PyUnicode_DATA(str_obj);
7897 if (kind1 != kind)
7898 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7899 if (!buf1)
7900 goto onError;
7901 buf2 = PyUnicode_DATA(sub_obj);
7902 if (kind2 != kind)
7903 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7904 if (!buf2)
7905 goto onError;
7906 len1 = PyUnicode_GET_LENGTH(str_obj);
7907 len2 = PyUnicode_GET_LENGTH(sub_obj);
7908
7909 ADJUST_INDICES(start, end, len1);
7910 switch(kind) {
7911 case PyUnicode_1BYTE_KIND:
7912 result = ucs1lib_count(
7913 ((Py_UCS1*)buf1) + start, end - start,
7914 buf2, len2, PY_SSIZE_T_MAX
7915 );
7916 break;
7917 case PyUnicode_2BYTE_KIND:
7918 result = ucs2lib_count(
7919 ((Py_UCS2*)buf1) + start, end - start,
7920 buf2, len2, PY_SSIZE_T_MAX
7921 );
7922 break;
7923 case PyUnicode_4BYTE_KIND:
7924 result = ucs4lib_count(
7925 ((Py_UCS4*)buf1) + start, end - start,
7926 buf2, len2, PY_SSIZE_T_MAX
7927 );
7928 break;
7929 default:
7930 assert(0); result = 0;
7931 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007932
7933 Py_DECREF(sub_obj);
7934 Py_DECREF(str_obj);
7935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007936 if (kind1 != kind)
7937 PyMem_Free(buf1);
7938 if (kind2 != kind)
7939 PyMem_Free(buf2);
7940
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007942 onError:
7943 Py_DECREF(sub_obj);
7944 Py_DECREF(str_obj);
7945 if (kind1 != kind && buf1)
7946 PyMem_Free(buf1);
7947 if (kind2 != kind && buf2)
7948 PyMem_Free(buf2);
7949 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950}
7951
Alexander Belopolsky40018472011-02-26 01:02:56 +00007952Py_ssize_t
7953PyUnicode_Find(PyObject *str,
7954 PyObject *sub,
7955 Py_ssize_t start,
7956 Py_ssize_t end,
7957 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007960
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007964 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007965 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 Py_DECREF(str);
7967 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
Tim Petersced69f82003-09-16 20:30:58 +00007969
Thomas Wouters477c8d52006-05-27 19:21:47 +00007970 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 result = any_find_slice(
7972 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7973 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007974 );
7975 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 result = any_find_slice(
7977 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7978 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007979 );
7980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007982 Py_DECREF(sub);
7983
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 return result;
7985}
7986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007987Py_ssize_t
7988PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7989 Py_ssize_t start, Py_ssize_t end,
7990 int direction)
7991{
7992 char *result;
7993 int kind;
7994 if (PyUnicode_READY(str) == -1)
7995 return -2;
7996 if (end > PyUnicode_GET_LENGTH(str))
7997 end = PyUnicode_GET_LENGTH(str);
7998 kind = PyUnicode_KIND(str);
7999 result = findchar(PyUnicode_1BYTE_DATA(str)
8000 + PyUnicode_KIND_SIZE(kind, start),
8001 kind,
8002 end-start, ch, direction);
8003 if (!result)
8004 return -1;
8005 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8006}
8007
Alexander Belopolsky40018472011-02-26 01:02:56 +00008008static int
8009tailmatch(PyUnicodeObject *self,
8010 PyUnicodeObject *substring,
8011 Py_ssize_t start,
8012 Py_ssize_t end,
8013 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015 int kind_self;
8016 int kind_sub;
8017 void *data_self;
8018 void *data_sub;
8019 Py_ssize_t offset;
8020 Py_ssize_t i;
8021 Py_ssize_t end_sub;
8022
8023 if (PyUnicode_READY(self) == -1 ||
8024 PyUnicode_READY(substring) == -1)
8025 return 0;
8026
8027 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 return 1;
8029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008030 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8031 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 kind_self = PyUnicode_KIND(self);
8036 data_self = PyUnicode_DATA(self);
8037 kind_sub = PyUnicode_KIND(substring);
8038 data_sub = PyUnicode_DATA(substring);
8039 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8040
8041 if (direction > 0)
8042 offset = end;
8043 else
8044 offset = start;
8045
8046 if (PyUnicode_READ(kind_self, data_self, offset) ==
8047 PyUnicode_READ(kind_sub, data_sub, 0) &&
8048 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8049 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8050 /* If both are of the same kind, memcmp is sufficient */
8051 if (kind_self == kind_sub) {
8052 return ! memcmp((char *)data_self +
8053 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8054 data_sub,
8055 PyUnicode_GET_LENGTH(substring) *
8056 PyUnicode_CHARACTER_SIZE(substring));
8057 }
8058 /* otherwise we have to compare each character by first accesing it */
8059 else {
8060 /* We do not need to compare 0 and len(substring)-1 because
8061 the if statement above ensured already that they are equal
8062 when we end up here. */
8063 // TODO: honor direction and do a forward or backwards search
8064 for (i = 1; i < end_sub; ++i) {
8065 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8066 PyUnicode_READ(kind_sub, data_sub, i))
8067 return 0;
8068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
8072
8073 return 0;
8074}
8075
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076Py_ssize_t
8077PyUnicode_Tailmatch(PyObject *str,
8078 PyObject *substr,
8079 Py_ssize_t start,
8080 Py_ssize_t end,
8081 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008083 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 str = PyUnicode_FromObject(str);
8086 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 substr = PyUnicode_FromObject(substr);
8089 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 Py_DECREF(str);
8091 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 }
Tim Petersced69f82003-09-16 20:30:58 +00008093
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 (PyUnicodeObject *)substr,
8096 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 Py_DECREF(str);
8098 Py_DECREF(substr);
8099 return result;
8100}
8101
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102/* Apply fixfct filter to the Unicode object self and return a
8103 reference to the modified object */
8104
Alexander Belopolsky40018472011-02-26 01:02:56 +00008105static PyObject *
8106fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 PyObject *u;
8110 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 if (PyUnicode_READY(self) == -1)
8113 return NULL;
8114 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8115 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8116 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8121 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 /* fix functions return the new maximum character in a string,
8124 if the kind of the resulting unicode object does not change,
8125 everything is fine. Otherwise we need to change the string kind
8126 and re-run the fix function. */
8127 maxchar_new = fixfct((PyUnicodeObject*)u);
8128 if (maxchar_new == 0)
8129 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8130 else if (maxchar_new <= 127)
8131 maxchar_new = 127;
8132 else if (maxchar_new <= 255)
8133 maxchar_new = 255;
8134 else if (maxchar_new <= 65535)
8135 maxchar_new = 65535;
8136 else
8137 maxchar_new = 1114111; /* 0x10ffff */
8138
8139 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 /* fixfct should return TRUE if it modified the buffer. If
8141 FALSE, return a reference to the original buffer instead
8142 (to save space, not time) */
8143 Py_INCREF(self);
8144 Py_DECREF(u);
8145 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 else if (maxchar_new == maxchar_old) {
8148 return u;
8149 }
8150 else {
8151 /* In case the maximum character changed, we need to
8152 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008153 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008154 if (v == NULL) {
8155 Py_DECREF(u);
8156 return NULL;
8157 }
8158 if (maxchar_new > maxchar_old) {
8159 /* If the maxchar increased so that the kind changed, not all
8160 characters are representable anymore and we need to fix the
8161 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008162 if (PyUnicode_CopyCharacters(v, 0,
8163 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008164 PyUnicode_GET_LENGTH(self)) < 0)
8165 {
8166 Py_DECREF(u);
8167 return NULL;
8168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 maxchar_old = fixfct((PyUnicodeObject*)v);
8170 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8171 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008172 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008173 if (PyUnicode_CopyCharacters(v, 0,
8174 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008175 PyUnicode_GET_LENGTH(self)) < 0)
8176 {
8177 Py_DECREF(u);
8178 return NULL;
8179 }
8180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181
8182 Py_DECREF(u);
8183 return v;
8184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185}
8186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008188fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 /* No need to call PyUnicode_READY(self) because this function is only
8191 called as a callback from fixup() which does it already. */
8192 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8193 const int kind = PyUnicode_KIND(self);
8194 void *data = PyUnicode_DATA(self);
8195 int touched = 0;
8196 Py_UCS4 maxchar = 0;
8197 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 for (i = 0; i < len; ++i) {
8200 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8201 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8202 if (up != ch) {
8203 if (up > maxchar)
8204 maxchar = up;
8205 PyUnicode_WRITE(kind, data, i, up);
8206 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 else if (ch > maxchar)
8209 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210 }
8211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 if (touched)
8213 return maxchar;
8214 else
8215 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216}
8217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008219fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8222 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8223 const int kind = PyUnicode_KIND(self);
8224 void *data = PyUnicode_DATA(self);
8225 int touched = 0;
8226 Py_UCS4 maxchar = 0;
8227 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 for(i = 0; i < len; ++i) {
8230 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8231 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8232 if (lo != ch) {
8233 if (lo > maxchar)
8234 maxchar = lo;
8235 PyUnicode_WRITE(kind, data, i, lo);
8236 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 else if (ch > maxchar)
8239 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 }
8241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 if (touched)
8243 return maxchar;
8244 else
8245 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246}
8247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8252 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8253 const int kind = PyUnicode_KIND(self);
8254 void *data = PyUnicode_DATA(self);
8255 int touched = 0;
8256 Py_UCS4 maxchar = 0;
8257 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 for(i = 0; i < len; ++i) {
8260 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8261 Py_UCS4 nu = 0;
8262
8263 if (Py_UNICODE_ISUPPER(ch))
8264 nu = Py_UNICODE_TOLOWER(ch);
8265 else if (Py_UNICODE_ISLOWER(ch))
8266 nu = Py_UNICODE_TOUPPER(ch);
8267
8268 if (nu != 0) {
8269 if (nu > maxchar)
8270 maxchar = nu;
8271 PyUnicode_WRITE(kind, data, i, nu);
8272 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 else if (ch > maxchar)
8275 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 }
8277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 if (touched)
8279 return maxchar;
8280 else
8281 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282}
8283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008285fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8288 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8289 const int kind = PyUnicode_KIND(self);
8290 void *data = PyUnicode_DATA(self);
8291 int touched = 0;
8292 Py_UCS4 maxchar = 0;
8293 Py_ssize_t i = 0;
8294 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008295
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008296 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298
8299 ch = PyUnicode_READ(kind, data, i);
8300 if (!Py_UNICODE_ISUPPER(ch)) {
8301 maxchar = Py_UNICODE_TOUPPER(ch);
8302 PyUnicode_WRITE(kind, data, i, maxchar);
8303 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 ++i;
8306 for(; i < len; ++i) {
8307 ch = PyUnicode_READ(kind, data, i);
8308 if (!Py_UNICODE_ISLOWER(ch)) {
8309 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8310 if (lo > maxchar)
8311 maxchar = lo;
8312 PyUnicode_WRITE(kind, data, i, lo);
8313 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 else if (ch > maxchar)
8316 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318
8319 if (touched)
8320 return maxchar;
8321 else
8322 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323}
8324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8329 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8330 const int kind = PyUnicode_KIND(self);
8331 void *data = PyUnicode_DATA(self);
8332 Py_UCS4 maxchar = 0;
8333 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 int previous_is_cased;
8335
8336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 if (len == 1) {
8338 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8339 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8340 if (ti != ch) {
8341 PyUnicode_WRITE(kind, data, i, ti);
8342 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 }
8344 else
8345 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 for(; i < len; ++i) {
8349 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8350 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008351
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 nu = Py_UNICODE_TOTITLE(ch);
8356
8357 if (nu > maxchar)
8358 maxchar = nu;
8359 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008360
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 if (Py_UNICODE_ISLOWER(ch) ||
8362 Py_UNICODE_ISUPPER(ch) ||
8363 Py_UNICODE_ISTITLE(ch))
8364 previous_is_cased = 1;
8365 else
8366 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369}
8370
Tim Peters8ce9f162004-08-27 01:49:32 +00008371PyObject *
8372PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008375 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008377 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008378 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8379 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008380 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 Py_ssize_t sz, i, res_offset;
8382 Py_UCS4 maxchar = 0;
8383 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384
Tim Peters05eba1f2004-08-27 21:32:02 +00008385 fseq = PySequence_Fast(seq, "");
8386 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008388 }
8389
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008390 /* NOTE: the following code can't call back into Python code,
8391 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008392 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008393
Tim Peters05eba1f2004-08-27 21:32:02 +00008394 seqlen = PySequence_Fast_GET_SIZE(fseq);
8395 /* If empty sequence, return u"". */
8396 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008399 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008400 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008401 /* If singleton sequence with an exact Unicode, return that. */
8402 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 item = items[0];
8404 if (PyUnicode_CheckExact(item)) {
8405 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 goto Done;
8408 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008409 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008410 else {
8411 /* Set up sep and seplen */
8412 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 /* fall back to a blank space separator */
8414 sep = PyUnicode_FromOrdinal(' ');
8415 if (!sep || PyUnicode_READY(sep) == -1)
8416 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008417 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008418 else {
8419 if (!PyUnicode_Check(separator)) {
8420 PyErr_Format(PyExc_TypeError,
8421 "separator: expected str instance,"
8422 " %.80s found",
8423 Py_TYPE(separator)->tp_name);
8424 goto onError;
8425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 if (PyUnicode_READY(separator) == -1)
8427 goto onError;
8428 sep = separator;
8429 seplen = PyUnicode_GET_LENGTH(separator);
8430 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8431 /* inc refcount to keep this code path symetric with the
8432 above case of a blank separator */
8433 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008434 }
8435 }
8436
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008437 /* There are at least two things to join, or else we have a subclass
8438 * of str in the sequence.
8439 * Do a pre-pass to figure out the total amount of space we'll
8440 * need (sz), and see whether all argument are strings.
8441 */
8442 sz = 0;
8443 for (i = 0; i < seqlen; i++) {
8444 const Py_ssize_t old_sz = sz;
8445 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 if (!PyUnicode_Check(item)) {
8447 PyErr_Format(PyExc_TypeError,
8448 "sequence item %zd: expected str instance,"
8449 " %.80s found",
8450 i, Py_TYPE(item)->tp_name);
8451 goto onError;
8452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 if (PyUnicode_READY(item) == -1)
8454 goto onError;
8455 sz += PyUnicode_GET_LENGTH(item);
8456 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8457 if (item_maxchar > maxchar)
8458 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008459 if (i != 0)
8460 sz += seplen;
8461 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8462 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008464 goto onError;
8465 }
8466 }
Tim Petersced69f82003-09-16 20:30:58 +00008467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008469 if (res == NULL)
8470 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008471
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008472 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008474 Py_ssize_t itemlen;
8475 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* Copy item, and maybe the separator. */
8478 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008479 if (PyUnicode_CopyCharacters(res, res_offset,
8480 sep, 0, seplen) < 0)
8481 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008484 if (PyUnicode_CopyCharacters(res, res_offset,
8485 item, 0, itemlen) < 0)
8486 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008490
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008492 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 Py_XDECREF(sep);
8494 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008497 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008499 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 return NULL;
8501}
8502
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character width
   (1, 2 or 4 byte storage) and must not be PyUnicode_WCHAR_KIND.
   `kind`, `value` and `length` may be evaluated more than once, so pass
   expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8525
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526static PyUnicodeObject *
8527pad(PyUnicodeObject *self,
8528 Py_ssize_t left,
8529 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 PyObject *u;
8533 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008534 int kind;
8535 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536
8537 if (left < 0)
8538 left = 0;
8539 if (right < 0)
8540 right = 0;
8541
Tim Peters7a29bd52001-09-12 03:03:31 +00008542 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 Py_INCREF(self);
8544 return self;
8545 }
8546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8548 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008549 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8550 return NULL;
8551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8553 if (fill > maxchar)
8554 maxchar = fill;
8555 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008556 if (!u)
8557 return NULL;
8558
8559 kind = PyUnicode_KIND(u);
8560 data = PyUnicode_DATA(u);
8561 if (left)
8562 FILL(kind, data, fill, 0, left);
8563 if (right)
8564 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008565 if (PyUnicode_CopyCharacters(u, left,
8566 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008567 _PyUnicode_LENGTH(self)) < 0)
8568 {
8569 Py_DECREF(u);
8570 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
8572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577PyObject *
8578PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
8582 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 switch(PyUnicode_KIND(string)) {
8587 case PyUnicode_1BYTE_KIND:
8588 list = ucs1lib_splitlines(
8589 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8590 PyUnicode_GET_LENGTH(string), keepends);
8591 break;
8592 case PyUnicode_2BYTE_KIND:
8593 list = ucs2lib_splitlines(
8594 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8595 PyUnicode_GET_LENGTH(string), keepends);
8596 break;
8597 case PyUnicode_4BYTE_KIND:
8598 list = ucs4lib_splitlines(
8599 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8600 PyUnicode_GET_LENGTH(string), keepends);
8601 break;
8602 default:
8603 assert(0);
8604 list = 0;
8605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 Py_DECREF(string);
8607 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608}
8609
Alexander Belopolsky40018472011-02-26 01:02:56 +00008610static PyObject *
8611split(PyUnicodeObject *self,
8612 PyUnicodeObject *substring,
8613 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 int kind1, kind2, kind;
8616 void *buf1, *buf2;
8617 Py_ssize_t len1, len2;
8618 PyObject* out;
8619
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008621 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 if (PyUnicode_READY(self) == -1)
8624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 if (substring == NULL)
8627 switch(PyUnicode_KIND(self)) {
8628 case PyUnicode_1BYTE_KIND:
8629 return ucs1lib_split_whitespace(
8630 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8631 PyUnicode_GET_LENGTH(self), maxcount
8632 );
8633 case PyUnicode_2BYTE_KIND:
8634 return ucs2lib_split_whitespace(
8635 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8636 PyUnicode_GET_LENGTH(self), maxcount
8637 );
8638 case PyUnicode_4BYTE_KIND:
8639 return ucs4lib_split_whitespace(
8640 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8641 PyUnicode_GET_LENGTH(self), maxcount
8642 );
8643 default:
8644 assert(0);
8645 return NULL;
8646 }
8647
8648 if (PyUnicode_READY(substring) == -1)
8649 return NULL;
8650
8651 kind1 = PyUnicode_KIND(self);
8652 kind2 = PyUnicode_KIND(substring);
8653 kind = kind1 > kind2 ? kind1 : kind2;
8654 buf1 = PyUnicode_DATA(self);
8655 buf2 = PyUnicode_DATA(substring);
8656 if (kind1 != kind)
8657 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8658 if (!buf1)
8659 return NULL;
8660 if (kind2 != kind)
8661 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8662 if (!buf2) {
8663 if (kind1 != kind) PyMem_Free(buf1);
8664 return NULL;
8665 }
8666 len1 = PyUnicode_GET_LENGTH(self);
8667 len2 = PyUnicode_GET_LENGTH(substring);
8668
8669 switch(kind) {
8670 case PyUnicode_1BYTE_KIND:
8671 out = ucs1lib_split(
8672 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8673 break;
8674 case PyUnicode_2BYTE_KIND:
8675 out = ucs2lib_split(
8676 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8677 break;
8678 case PyUnicode_4BYTE_KIND:
8679 out = ucs4lib_split(
8680 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8681 break;
8682 default:
8683 out = NULL;
8684 }
8685 if (kind1 != kind)
8686 PyMem_Free(buf1);
8687 if (kind2 != kind)
8688 PyMem_Free(buf2);
8689 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690}
8691
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692static PyObject *
8693rsplit(PyUnicodeObject *self,
8694 PyUnicodeObject *substring,
8695 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 int kind1, kind2, kind;
8698 void *buf1, *buf2;
8699 Py_ssize_t len1, len2;
8700 PyObject* out;
8701
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008702 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008703 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 if (PyUnicode_READY(self) == -1)
8706 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 if (substring == NULL)
8709 switch(PyUnicode_KIND(self)) {
8710 case PyUnicode_1BYTE_KIND:
8711 return ucs1lib_rsplit_whitespace(
8712 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8713 PyUnicode_GET_LENGTH(self), maxcount
8714 );
8715 case PyUnicode_2BYTE_KIND:
8716 return ucs2lib_rsplit_whitespace(
8717 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8718 PyUnicode_GET_LENGTH(self), maxcount
8719 );
8720 case PyUnicode_4BYTE_KIND:
8721 return ucs4lib_rsplit_whitespace(
8722 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8723 PyUnicode_GET_LENGTH(self), maxcount
8724 );
8725 default:
8726 assert(0);
8727 return NULL;
8728 }
8729
8730 if (PyUnicode_READY(substring) == -1)
8731 return NULL;
8732
8733 kind1 = PyUnicode_KIND(self);
8734 kind2 = PyUnicode_KIND(substring);
8735 kind = kind1 > kind2 ? kind1 : kind2;
8736 buf1 = PyUnicode_DATA(self);
8737 buf2 = PyUnicode_DATA(substring);
8738 if (kind1 != kind)
8739 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8740 if (!buf1)
8741 return NULL;
8742 if (kind2 != kind)
8743 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8744 if (!buf2) {
8745 if (kind1 != kind) PyMem_Free(buf1);
8746 return NULL;
8747 }
8748 len1 = PyUnicode_GET_LENGTH(self);
8749 len2 = PyUnicode_GET_LENGTH(substring);
8750
8751 switch(kind) {
8752 case PyUnicode_1BYTE_KIND:
8753 out = ucs1lib_rsplit(
8754 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8755 break;
8756 case PyUnicode_2BYTE_KIND:
8757 out = ucs2lib_rsplit(
8758 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8759 break;
8760 case PyUnicode_4BYTE_KIND:
8761 out = ucs4lib_rsplit(
8762 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8763 break;
8764 default:
8765 out = NULL;
8766 }
8767 if (kind1 != kind)
8768 PyMem_Free(buf1);
8769 if (kind2 != kind)
8770 PyMem_Free(buf2);
8771 return out;
8772}
8773
8774static Py_ssize_t
8775anylib_find(int kind, void *buf1, Py_ssize_t len1,
8776 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8777{
8778 switch(kind) {
8779 case PyUnicode_1BYTE_KIND:
8780 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8781 case PyUnicode_2BYTE_KIND:
8782 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8783 case PyUnicode_4BYTE_KIND:
8784 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8785 }
8786 assert(0);
8787 return -1;
8788}
8789
8790static Py_ssize_t
8791anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8792 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8793{
8794 switch(kind) {
8795 case PyUnicode_1BYTE_KIND:
8796 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8797 case PyUnicode_2BYTE_KIND:
8798 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8799 case PyUnicode_4BYTE_KIND:
8800 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8801 }
8802 assert(0);
8803 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804}
8805
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807replace(PyObject *self, PyObject *str1,
8808 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 PyObject *u;
8811 char *sbuf = PyUnicode_DATA(self);
8812 char *buf1 = PyUnicode_DATA(str1);
8813 char *buf2 = PyUnicode_DATA(str2);
8814 int srelease = 0, release1 = 0, release2 = 0;
8815 int skind = PyUnicode_KIND(self);
8816 int kind1 = PyUnicode_KIND(str1);
8817 int kind2 = PyUnicode_KIND(str2);
8818 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8819 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8820 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821
8822 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008825 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 if (skind < kind1)
8828 /* substring too wide to be present */
8829 goto nothing;
8830
8831 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008832 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008833 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008835 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008837 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 Py_UCS4 u1, u2, maxchar;
8839 int mayshrink, rkind;
8840 u1 = PyUnicode_READ_CHAR(str1, 0);
8841 if (!findchar(sbuf, PyUnicode_KIND(self),
8842 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008843 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 u2 = PyUnicode_READ_CHAR(str2, 0);
8845 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8846 /* Replacing u1 with u2 may cause a maxchar reduction in the
8847 result string. */
8848 mayshrink = maxchar > 127;
8849 if (u2 > maxchar) {
8850 maxchar = u2;
8851 mayshrink = 0;
8852 }
8853 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008854 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008856 if (PyUnicode_CopyCharacters(u, 0,
8857 (PyObject*)self, 0, slen) < 0)
8858 {
8859 Py_DECREF(u);
8860 return NULL;
8861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 rkind = PyUnicode_KIND(u);
8863 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8864 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008865 if (--maxcount < 0)
8866 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 if (mayshrink) {
8870 PyObject *tmp = u;
8871 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8872 PyUnicode_GET_LENGTH(tmp));
8873 Py_DECREF(tmp);
8874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 int rkind = skind;
8877 char *res;
8878 if (kind1 < rkind) {
8879 /* widen substring */
8880 buf1 = _PyUnicode_AsKind(str1, rkind);
8881 if (!buf1) goto error;
8882 release1 = 1;
8883 }
8884 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008885 if (i < 0)
8886 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 if (rkind > kind2) {
8888 /* widen replacement */
8889 buf2 = _PyUnicode_AsKind(str2, rkind);
8890 if (!buf2) goto error;
8891 release2 = 1;
8892 }
8893 else if (rkind < kind2) {
8894 /* widen self and buf1 */
8895 rkind = kind2;
8896 if (release1) PyMem_Free(buf1);
8897 sbuf = _PyUnicode_AsKind(self, rkind);
8898 if (!sbuf) goto error;
8899 srelease = 1;
8900 buf1 = _PyUnicode_AsKind(str1, rkind);
8901 if (!buf1) goto error;
8902 release1 = 1;
8903 }
8904 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8905 if (!res) {
8906 PyErr_NoMemory();
8907 goto error;
8908 }
8909 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008910 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8912 buf2,
8913 PyUnicode_KIND_SIZE(rkind, len2));
8914 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008915
8916 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8918 slen-i,
8919 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008920 if (i == -1)
8921 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8923 buf2,
8924 PyUnicode_KIND_SIZE(rkind, len2));
8925 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927
8928 u = PyUnicode_FromKindAndData(rkind, res, slen);
8929 PyMem_Free(res);
8930 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 Py_ssize_t n, i, j, ires;
8935 Py_ssize_t product, new_size;
8936 int rkind = skind;
8937 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 if (kind1 < rkind) {
8940 buf1 = _PyUnicode_AsKind(str1, rkind);
8941 if (!buf1) goto error;
8942 release1 = 1;
8943 }
8944 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945 if (n == 0)
8946 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 if (kind2 < rkind) {
8948 buf2 = _PyUnicode_AsKind(str2, rkind);
8949 if (!buf2) goto error;
8950 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 else if (kind2 > rkind) {
8953 rkind = kind2;
8954 sbuf = _PyUnicode_AsKind(self, rkind);
8955 if (!sbuf) goto error;
8956 srelease = 1;
8957 if (release1) PyMem_Free(buf1);
8958 buf1 = _PyUnicode_AsKind(str1, rkind);
8959 if (!buf1) goto error;
8960 release1 = 1;
8961 }
8962 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8963 PyUnicode_GET_LENGTH(str1))); */
8964 product = n * (len2-len1);
8965 if ((product / (len2-len1)) != n) {
8966 PyErr_SetString(PyExc_OverflowError,
8967 "replace string is too long");
8968 goto error;
8969 }
8970 new_size = slen + product;
8971 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8972 PyErr_SetString(PyExc_OverflowError,
8973 "replace string is too long");
8974 goto error;
8975 }
8976 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8977 if (!res)
8978 goto error;
8979 ires = i = 0;
8980 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008981 while (n-- > 0) {
8982 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 j = anylib_find(rkind,
8984 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8985 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008986 if (j == -1)
8987 break;
8988 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008989 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8991 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8992 PyUnicode_KIND_SIZE(rkind, j-i));
8993 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008994 }
8995 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 if (len2 > 0) {
8997 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8998 buf2,
8999 PyUnicode_KIND_SIZE(rkind, len2));
9000 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9007 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9008 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009 } else {
9010 /* interleave */
9011 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9013 buf2,
9014 PyUnicode_KIND_SIZE(rkind, len2));
9015 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009016 if (--n <= 0)
9017 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9019 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9020 PyUnicode_KIND_SIZE(rkind, 1));
9021 ires++;
9022 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9025 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9026 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (srelease)
9031 PyMem_FREE(sbuf);
9032 if (release1)
9033 PyMem_FREE(buf1);
9034 if (release2)
9035 PyMem_FREE(buf2);
9036 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009037
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009039 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 if (srelease)
9041 PyMem_FREE(sbuf);
9042 if (release1)
9043 PyMem_FREE(buf1);
9044 if (release2)
9045 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009046 if (PyUnicode_CheckExact(self)) {
9047 Py_INCREF(self);
9048 return (PyObject *) self;
9049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9051 PyUnicode_DATA(self),
9052 PyUnicode_GET_LENGTH(self));
9053 error:
9054 if (srelease && sbuf)
9055 PyMem_FREE(sbuf);
9056 if (release1 && buf1)
9057 PyMem_FREE(buf1);
9058 if (release2 && buf2)
9059 PyMem_FREE(buf2);
9060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061}
9062
9063/* --- Unicode Object Methods --------------------------------------------- */
9064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009065PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067\n\
9068Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009069characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070
9071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009072unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074 return fixup(self, fixtitle);
9075}
9076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009077PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079\n\
9080Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009081have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082
9083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009084unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 return fixup(self, fixcapitalize);
9087}
9088
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into a list of words, replaces each list
   entry by its fixcapitalize'd version, and joins the list again.

   Returns the joined string, or NULL when splitting, capitalizing or
   joining fails.  The temporary list is always released. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* joined is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9126
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009127/* Argument converter. Coerces to a single unicode character */
9128
9129static int
9130convert_uc(PyObject *obj, void *addr)
9131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009134
Benjamin Peterson14339b62009-01-31 16:36:08 +00009135 uniobj = PyUnicode_FromObject(obj);
9136 if (uniobj == NULL) {
9137 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009139 return 0;
9140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009142 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009144 Py_DECREF(uniobj);
9145 return 0;
9146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 if (PyUnicode_READY(uniobj)) {
9148 Py_DECREF(uniobj);
9149 return 0;
9150 }
9151 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009152 Py_DECREF(uniobj);
9153 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009154}
9155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009156PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009159Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009160done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161
9162static PyObject *
9163unicode_center(PyUnicodeObject *self, PyObject *args)
9164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009165 Py_ssize_t marg, left;
9166 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 Py_UCS4 fillchar = ' ';
9168
9169 if (PyUnicode_READY(self) == -1)
9170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171
Thomas Woutersde017742006-02-16 19:34:37 +00009172 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 return NULL;
9174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 Py_INCREF(self);
9177 return (PyObject*) self;
9178 }
9179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 left = marg / 2 + (marg & width & 1);
9182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009183 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184}
9185
Marc-André Lemburge5034372000-08-08 08:04:29 +00009186#if 0
9187
9188/* This code should go into some future Unicode collation support
9189 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009190 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009191
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009192/* speedy UTF-16 code point order comparison */
9193/* gleaned from: */
9194/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9195
/* Adjustment table indexed by the top five bits of a 16-bit code unit
   (c >> 11); each entry therefore covers a run of 0x800 code units.
   Only the last five entries are non-zero. */
static short utf16Fixup[32] =
{
    /* 0x0000 - 0x1FFF */ 0, 0, 0, 0,
    /* 0x2000 - 0x3FFF */ 0, 0, 0, 0,
    /* 0x4000 - 0x5FFF */ 0, 0, 0, 0,
    /* 0x6000 - 0x7FFF */ 0, 0, 0, 0,
    /* 0x8000 - 0x9FFF */ 0, 0, 0, 0,
    /* 0xA000 - 0xBFFF */ 0, 0, 0, 0,
    /* 0xC000 - 0xD7FF */ 0, 0, 0,
    /* 0xD800 - 0xDFFF */ 0x2000,
    /* 0xE000 - 0xFFFF */ -0x800, -0x800, -0x800, -0x800
};
9203
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204static int
9205unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009207 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009208
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209 Py_UNICODE *s1 = str1->str;
9210 Py_UNICODE *s2 = str2->str;
9211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 len1 = str1->_base._base.length;
9213 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009214
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009216 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009217
9218 c1 = *s1++;
9219 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009220
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 if (c1 > (1<<11) * 26)
9222 c1 += utf16Fixup[c1>>11];
9223 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009224 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009225 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009226
9227 if (c1 != c2)
9228 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009229
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009230 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 }
9232
9233 return (len1 < len2) ? -1 : (len1 != len2);
9234}
9235
Marc-André Lemburge5034372000-08-08 08:04:29 +00009236#else
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238/* This function assumes that str1 and str2 are readied by the caller. */
9239
Marc-André Lemburge5034372000-08-08 08:04:29 +00009240static int
9241unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 int kind1, kind2;
9244 void *data1, *data2;
9245 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 kind1 = PyUnicode_KIND(str1);
9248 kind2 = PyUnicode_KIND(str2);
9249 data1 = PyUnicode_DATA(str1);
9250 data2 = PyUnicode_DATA(str2);
9251 len1 = PyUnicode_GET_LENGTH(str1);
9252 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 for (i = 0; i < len1 && i < len2; ++i) {
9255 Py_UCS4 c1, c2;
9256 c1 = PyUnicode_READ(kind1, data1, i);
9257 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009258
9259 if (c1 != c2)
9260 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009261 }
9262
9263 return (len1 < len2) ? -1 : (len1 != len2);
9264}
9265
9266#endif
9267
Alexander Belopolsky40018472011-02-26 01:02:56 +00009268int
9269PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9272 if (PyUnicode_READY(left) == -1 ||
9273 PyUnicode_READY(right) == -1)
9274 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009275 return unicode_compare((PyUnicodeObject *)left,
9276 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009278 PyErr_Format(PyExc_TypeError,
9279 "Can't compare %.100s and %.100s",
9280 left->ob_type->tp_name,
9281 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 return -1;
9283}
9284
Martin v. Löwis5b222132007-06-10 09:51:05 +00009285int
9286PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 Py_ssize_t i;
9289 int kind;
9290 void *data;
9291 Py_UCS4 chr;
9292
Martin v. Löwis5b222132007-06-10 09:51:05 +00009293 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 if (PyUnicode_READY(uni) == -1)
9295 return -1;
9296 kind = PyUnicode_KIND(uni);
9297 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009298 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9300 if (chr != str[i])
9301 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009302 /* This check keeps Python strings that end in '\0' from comparing equal
9303 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009306 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009308 return 0;
9309}
9310
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009311
/* Select Py_True or Py_False from a C truth value.  The macro only
   yields the pointer; it does not adjust any reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009314
Alexander Belopolsky40018472011-02-26 01:02:56 +00009315PyObject *
9316PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009317{
9318 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009319
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009320 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9321 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 if (PyUnicode_READY(left) == -1 ||
9323 PyUnicode_READY(right) == -1)
9324 return NULL;
9325 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9326 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009327 if (op == Py_EQ) {
9328 Py_INCREF(Py_False);
9329 return Py_False;
9330 }
9331 if (op == Py_NE) {
9332 Py_INCREF(Py_True);
9333 return Py_True;
9334 }
9335 }
9336 if (left == right)
9337 result = 0;
9338 else
9339 result = unicode_compare((PyUnicodeObject *)left,
9340 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009342 /* Convert the return value to a Boolean */
9343 switch (op) {
9344 case Py_EQ:
9345 v = TEST_COND(result == 0);
9346 break;
9347 case Py_NE:
9348 v = TEST_COND(result != 0);
9349 break;
9350 case Py_LE:
9351 v = TEST_COND(result <= 0);
9352 break;
9353 case Py_GE:
9354 v = TEST_COND(result >= 0);
9355 break;
9356 case Py_LT:
9357 v = TEST_COND(result == -1);
9358 break;
9359 case Py_GT:
9360 v = TEST_COND(result == 1);
9361 break;
9362 default:
9363 PyErr_BadArgument();
9364 return NULL;
9365 }
9366 Py_INCREF(v);
9367 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009369
Brian Curtindfc80e32011-08-10 20:28:54 -05009370 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009371}
9372
Alexander Belopolsky40018472011-02-26 01:02:56 +00009373int
9374PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009375{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009376 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 int kind1, kind2, kind;
9378 void *buf1, *buf2;
9379 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009380 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009381
9382 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009383 sub = PyUnicode_FromObject(element);
9384 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 PyErr_Format(PyExc_TypeError,
9386 "'in <string>' requires string as left operand, not %s",
9387 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009388 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (PyUnicode_READY(sub) == -1)
9391 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009392
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395 Py_DECREF(sub);
9396 return -1;
9397 }
9398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 kind1 = PyUnicode_KIND(str);
9400 kind2 = PyUnicode_KIND(sub);
9401 kind = kind1 > kind2 ? kind1 : kind2;
9402 buf1 = PyUnicode_DATA(str);
9403 buf2 = PyUnicode_DATA(sub);
9404 if (kind1 != kind)
9405 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9406 if (!buf1) {
9407 Py_DECREF(sub);
9408 return -1;
9409 }
9410 if (kind2 != kind)
9411 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9412 if (!buf2) {
9413 Py_DECREF(sub);
9414 if (kind1 != kind) PyMem_Free(buf1);
9415 return -1;
9416 }
9417 len1 = PyUnicode_GET_LENGTH(str);
9418 len2 = PyUnicode_GET_LENGTH(sub);
9419
9420 switch(kind) {
9421 case PyUnicode_1BYTE_KIND:
9422 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9423 break;
9424 case PyUnicode_2BYTE_KIND:
9425 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9426 break;
9427 case PyUnicode_4BYTE_KIND:
9428 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9429 break;
9430 default:
9431 result = -1;
9432 assert(0);
9433 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009434
9435 Py_DECREF(str);
9436 Py_DECREF(sub);
9437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 if (kind1 != kind)
9439 PyMem_Free(buf1);
9440 if (kind2 != kind)
9441 PyMem_Free(buf2);
9442
Guido van Rossum403d68b2000-03-13 15:55:09 +00009443 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009444}
9445
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446/* Concat to string or Unicode object giving a new Unicode object. */
9447
Alexander Belopolsky40018472011-02-26 01:02:56 +00009448PyObject *
9449PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 PyObject *u = NULL, *v = NULL, *w;
9452 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453
9454 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461
9462 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009464 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9473 goto onError;
9474
9475 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009476 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 w = PyUnicode_New(
9480 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9481 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009484 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9485 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009486 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009487 v, 0,
9488 PyUnicode_GET_LENGTH(v)) < 0)
9489 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 Py_DECREF(u);
9491 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 Py_XDECREF(u);
9496 Py_XDECREF(v);
9497 return NULL;
9498}
9499
Walter Dörwald1ab83302007-05-18 17:15:44 +00009500void
9501PyUnicode_Append(PyObject **pleft, PyObject *right)
9502{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009503 PyObject *new;
9504 if (*pleft == NULL)
9505 return;
9506 if (right == NULL || !PyUnicode_Check(*pleft)) {
9507 Py_DECREF(*pleft);
9508 *pleft = NULL;
9509 return;
9510 }
9511 new = PyUnicode_Concat(*pleft, right);
9512 Py_DECREF(*pleft);
9513 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009514}
9515
9516void
9517PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9518{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009519 PyUnicode_Append(pleft, right);
9520 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009521}
9522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009523PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009526Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009527string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009528interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529
9530static PyObject *
9531unicode_count(PyUnicodeObject *self, PyObject *args)
9532{
9533 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009534 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009535 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 int kind1, kind2, kind;
9538 void *buf1, *buf2;
9539 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540
Jesus Ceaac451502011-04-20 17:09:23 +02009541 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9542 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 kind1 = PyUnicode_KIND(self);
9546 kind2 = PyUnicode_KIND(substring);
9547 kind = kind1 > kind2 ? kind1 : kind2;
9548 buf1 = PyUnicode_DATA(self);
9549 buf2 = PyUnicode_DATA(substring);
9550 if (kind1 != kind)
9551 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9552 if (!buf1) {
9553 Py_DECREF(substring);
9554 return NULL;
9555 }
9556 if (kind2 != kind)
9557 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9558 if (!buf2) {
9559 Py_DECREF(substring);
9560 if (kind1 != kind) PyMem_Free(buf1);
9561 return NULL;
9562 }
9563 len1 = PyUnicode_GET_LENGTH(self);
9564 len2 = PyUnicode_GET_LENGTH(substring);
9565
9566 ADJUST_INDICES(start, end, len1);
9567 switch(kind) {
9568 case PyUnicode_1BYTE_KIND:
9569 iresult = ucs1lib_count(
9570 ((Py_UCS1*)buf1) + start, end - start,
9571 buf2, len2, PY_SSIZE_T_MAX
9572 );
9573 break;
9574 case PyUnicode_2BYTE_KIND:
9575 iresult = ucs2lib_count(
9576 ((Py_UCS2*)buf1) + start, end - start,
9577 buf2, len2, PY_SSIZE_T_MAX
9578 );
9579 break;
9580 case PyUnicode_4BYTE_KIND:
9581 iresult = ucs4lib_count(
9582 ((Py_UCS4*)buf1) + start, end - start,
9583 buf2, len2, PY_SSIZE_T_MAX
9584 );
9585 break;
9586 default:
9587 assert(0); iresult = 0;
9588 }
9589
9590 result = PyLong_FromSsize_t(iresult);
9591
9592 if (kind1 != kind)
9593 PyMem_Free(buf1);
9594 if (kind2 != kind)
9595 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596
9597 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009598
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599 return result;
9600}
9601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009602PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009603 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009605Encode S using the codec registered for encoding. Default encoding\n\
9606is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009607handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009608a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9609'xmlcharrefreplace' as well as any other name registered with\n\
9610codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611
9612static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009613unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009615 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616 char *encoding = NULL;
9617 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009618
Benjamin Peterson308d6372009-09-18 21:42:35 +00009619 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9620 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009622 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009623}
9624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009625PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627\n\
9628Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009629If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630
9631static PyObject*
9632unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9633{
9634 Py_UNICODE *e;
9635 Py_UNICODE *p;
9636 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009637 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 PyUnicodeObject *u;
9640 int tabsize = 8;
9641
9642 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9646 return NULL;
9647
Thomas Wouters7e474022000-07-16 12:04:32 +00009648 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009649 i = 0; /* chars up to and including most recent \n or \r */
9650 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9652 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 if (tabsize > 0) {
9655 incr = tabsize - (j % tabsize); /* cannot overflow */
9656 if (j > PY_SSIZE_T_MAX - incr)
9657 goto overflow1;
9658 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009659 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 if (j > PY_SSIZE_T_MAX - 1)
9663 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664 j++;
9665 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009666 if (i > PY_SSIZE_T_MAX - j)
9667 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009669 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670 }
9671 }
9672
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009673 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009675
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676 /* Second pass: create output string and fill it */
9677 u = _PyUnicode_New(i + j);
9678 if (!u)
9679 return NULL;
9680
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009681 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 q = _PyUnicode_WSTR(u); /* next output char */
9683 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 if (tabsize > 0) {
9688 i = tabsize - (j % tabsize);
9689 j += i;
9690 while (i--) {
9691 if (q >= qe)
9692 goto overflow2;
9693 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 else {
9698 if (q >= qe)
9699 goto overflow2;
9700 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009701 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 if (*p == '\n' || *p == '\r')
9703 j = 0;
9704 }
9705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 if (PyUnicode_READY(u) == -1) {
9707 Py_DECREF(u);
9708 return NULL;
9709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009711
9712 overflow2:
9713 Py_DECREF(u);
9714 overflow1:
9715 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717}
9718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009719PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721\n\
9722Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009723such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724arguments start and end are interpreted as in slice notation.\n\
9725\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009726Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
9728static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730{
Jesus Ceaac451502011-04-20 17:09:23 +02009731 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009732 Py_ssize_t start;
9733 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009734 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735
Jesus Ceaac451502011-04-20 17:09:23 +02009736 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9737 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 if (PyUnicode_READY(self) == -1)
9741 return NULL;
9742 if (PyUnicode_READY(substring) == -1)
9743 return NULL;
9744
9745 result = any_find_slice(
9746 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9747 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009748 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749
9750 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (result == -2)
9753 return NULL;
9754
Christian Heimes217cfd12007-12-02 14:31:20 +00009755 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756}
9757
9758static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009759unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 Py_UCS4 ch;
9762
9763 if (PyUnicode_READY(self) == -1)
9764 return NULL;
9765 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766 PyErr_SetString(PyExc_IndexError, "string index out of range");
9767 return NULL;
9768 }
9769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9771 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772}
9773
Guido van Rossumc2504932007-09-18 19:42:40 +00009774/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009775 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009776static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009777unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778{
Guido van Rossumc2504932007-09-18 19:42:40 +00009779 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009780 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 if (_PyUnicode_HASH(self) != -1)
9783 return _PyUnicode_HASH(self);
9784 if (PyUnicode_READY(self) == -1)
9785 return -1;
9786 len = PyUnicode_GET_LENGTH(self);
9787
9788 /* The hash function as a macro, gets expanded three times below. */
9789#define HASH(P) \
9790 x = (Py_uhash_t)*P << 7; \
9791 while (--len >= 0) \
9792 x = (1000003*x) ^ (Py_uhash_t)*P++;
9793
9794 switch (PyUnicode_KIND(self)) {
9795 case PyUnicode_1BYTE_KIND: {
9796 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9797 HASH(c);
9798 break;
9799 }
9800 case PyUnicode_2BYTE_KIND: {
9801 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9802 HASH(s);
9803 break;
9804 }
9805 default: {
9806 Py_UCS4 *l;
9807 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9808 "Impossible switch case in unicode_hash");
9809 l = PyUnicode_4BYTE_DATA(self);
9810 HASH(l);
9811 break;
9812 }
9813 }
9814 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9815
Guido van Rossumc2504932007-09-18 19:42:40 +00009816 if (x == -1)
9817 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009819 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009823PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009824 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009826Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
9828static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009831 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009832 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009833 Py_ssize_t start;
9834 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
Jesus Ceaac451502011-04-20 17:09:23 +02009836 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9837 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 if (PyUnicode_READY(self) == -1)
9841 return NULL;
9842 if (PyUnicode_READY(substring) == -1)
9843 return NULL;
9844
9845 result = any_find_slice(
9846 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9847 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009848 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
9850 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (result == -2)
9853 return NULL;
9854
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 if (result < 0) {
9856 PyErr_SetString(PyExc_ValueError, "substring not found");
9857 return NULL;
9858 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009859
Christian Heimes217cfd12007-12-02 14:31:20 +00009860 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
9862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009863PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009866Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009867at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
9869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009870unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 Py_ssize_t i, length;
9873 int kind;
9874 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 int cased;
9876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 if (PyUnicode_READY(self) == -1)
9878 return NULL;
9879 length = PyUnicode_GET_LENGTH(self);
9880 kind = PyUnicode_KIND(self);
9881 data = PyUnicode_DATA(self);
9882
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 if (length == 1)
9885 return PyBool_FromLong(
9886 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009888 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009891
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 for (i = 0; i < length; i++) {
9894 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009895
Benjamin Peterson29060642009-01-31 22:14:21 +00009896 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9897 return PyBool_FromLong(0);
9898 else if (!cased && Py_UNICODE_ISLOWER(ch))
9899 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009901 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902}
9903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009907Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009908at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
9910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009911unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 Py_ssize_t i, length;
9914 int kind;
9915 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 int cased;
9917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (PyUnicode_READY(self) == -1)
9919 return NULL;
9920 length = PyUnicode_GET_LENGTH(self);
9921 kind = PyUnicode_KIND(self);
9922 data = PyUnicode_DATA(self);
9923
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (length == 1)
9926 return PyBool_FromLong(
9927 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009932
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 for (i = 0; i < length; i++) {
9935 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009936
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9938 return PyBool_FromLong(0);
9939 else if (!cased && Py_UNICODE_ISUPPER(ch))
9940 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009942 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943}
9944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009945PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009948Return True if S is a titlecased string and there is at least one\n\
9949character in S, i.e. upper- and titlecase characters may only\n\
9950follow uncased characters and lowercase characters only cased ones.\n\
9951Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
9953static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009954unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 Py_ssize_t i, length;
9957 int kind;
9958 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 int cased, previous_is_cased;
9960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 if (PyUnicode_READY(self) == -1)
9962 return NULL;
9963 length = PyUnicode_GET_LENGTH(self);
9964 kind = PyUnicode_KIND(self);
9965 data = PyUnicode_DATA(self);
9966
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 if (length == 1) {
9969 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9970 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9971 (Py_UNICODE_ISUPPER(ch) != 0));
9972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009976 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009977
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978 cased = 0;
9979 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 for (i = 0; i < length; i++) {
9981 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009982
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9984 if (previous_is_cased)
9985 return PyBool_FromLong(0);
9986 previous_is_cased = 1;
9987 cased = 1;
9988 }
9989 else if (Py_UNICODE_ISLOWER(ch)) {
9990 if (!previous_is_cased)
9991 return PyBool_FromLong(0);
9992 previous_is_cased = 1;
9993 cased = 1;
9994 }
9995 else
9996 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009998 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999}
10000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010001PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010004Return True if all characters in S are whitespace\n\
10005and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006
10007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010008unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 Py_ssize_t i, length;
10011 int kind;
10012 void *data;
10013
10014 if (PyUnicode_READY(self) == -1)
10015 return NULL;
10016 length = PyUnicode_GET_LENGTH(self);
10017 kind = PyUnicode_KIND(self);
10018 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (length == 1)
10022 return PyBool_FromLong(
10023 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010025 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 for (i = 0; i < length; i++) {
10030 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010031 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035}
10036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010037PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010040Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010041and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010042
10043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010044unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 Py_ssize_t i, length;
10047 int kind;
10048 void *data;
10049
10050 if (PyUnicode_READY(self) == -1)
10051 return NULL;
10052 length = PyUnicode_GET_LENGTH(self);
10053 kind = PyUnicode_KIND(self);
10054 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010055
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010056 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (length == 1)
10058 return PyBool_FromLong(
10059 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010060
10061 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 for (i = 0; i < length; i++) {
10066 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010069 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010070}
10071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010072PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010074\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010075Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010076and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010077
10078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010079unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 int kind;
10082 void *data;
10083 Py_ssize_t len, i;
10084
10085 if (PyUnicode_READY(self) == -1)
10086 return NULL;
10087
10088 kind = PyUnicode_KIND(self);
10089 data = PyUnicode_DATA(self);
10090 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010092 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 if (len == 1) {
10094 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10095 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10096 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097
10098 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010100 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 for (i = 0; i < len; i++) {
10103 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010104 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010105 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010106 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010107 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010108}
10109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010110PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010113Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010114False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
10116static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010117unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 Py_ssize_t i, length;
10120 int kind;
10121 void *data;
10122
10123 if (PyUnicode_READY(self) == -1)
10124 return NULL;
10125 length = PyUnicode_GET_LENGTH(self);
10126 kind = PyUnicode_KIND(self);
10127 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (length == 1)
10131 return PyBool_FromLong(
10132 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010134 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 for (i = 0; i < length; i++) {
10139 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010142 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143}
10144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010145PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010148Return True if all characters in S are digits\n\
10149and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
10151static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010152unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 Py_ssize_t i, length;
10155 int kind;
10156 void *data;
10157
10158 if (PyUnicode_READY(self) == -1)
10159 return NULL;
10160 length = PyUnicode_GET_LENGTH(self);
10161 kind = PyUnicode_KIND(self);
10162 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (length == 1) {
10166 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10167 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010170 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 for (i = 0; i < length; i++) {
10175 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010176 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010178 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179}
10180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010181PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010182 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010184Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010185False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
10187static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010188unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 Py_ssize_t i, length;
10191 int kind;
10192 void *data;
10193
10194 if (PyUnicode_READY(self) == -1)
10195 return NULL;
10196 length = PyUnicode_GET_LENGTH(self);
10197 kind = PyUnicode_KIND(self);
10198 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 if (length == 1)
10202 return PyBool_FromLong(
10203 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010205 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010207 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 for (i = 0; i < length; i++) {
10210 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010213 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214}
10215
Martin v. Löwis47383402007-08-15 07:32:56 +000010216int
10217PyUnicode_IsIdentifier(PyObject *self)
10218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 int kind;
10220 void *data;
10221 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010222 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 if (PyUnicode_READY(self) == -1) {
10225 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010226 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 }
10228
10229 /* Special case for empty strings */
10230 if (PyUnicode_GET_LENGTH(self) == 0)
10231 return 0;
10232 kind = PyUnicode_KIND(self);
10233 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010234
10235 /* PEP 3131 says that the first character must be in
10236 XID_Start and subsequent characters in XID_Continue,
10237 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010239 letters, digits, underscore). However, given the current
10240 definition of XID_Start and XID_Continue, it is sufficient
10241 to check just for these, except that _ must be allowed
10242 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010244 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010245 return 0;
10246
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010247 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010249 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010250 return 1;
10251}
10252
10253PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010255\n\
10256Return True if S is a valid identifier according\n\
10257to the language definition.");
10258
10259static PyObject*
10260unicode_isidentifier(PyObject *self)
10261{
10262 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10263}
10264
Georg Brandl559e5d72008-06-11 18:37:52 +000010265PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010266 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010267\n\
10268Return True if all characters in S are considered\n\
10269printable in repr() or S is empty, False otherwise.");
10270
10271static PyObject*
10272unicode_isprintable(PyObject *self)
10273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 Py_ssize_t i, length;
10275 int kind;
10276 void *data;
10277
10278 if (PyUnicode_READY(self) == -1)
10279 return NULL;
10280 length = PyUnicode_GET_LENGTH(self);
10281 kind = PyUnicode_KIND(self);
10282 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010283
10284 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (length == 1)
10286 return PyBool_FromLong(
10287 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 for (i = 0; i < length; i++) {
10290 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010291 Py_RETURN_FALSE;
10292 }
10293 }
10294 Py_RETURN_TRUE;
10295}
10296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010297PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010298 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299\n\
10300Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010301iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302
10303static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010304unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010306 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Martin v. Löwis18e16552006-02-15 17:27:45 +000010309static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310unicode_length(PyUnicodeObject *self)
10311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (PyUnicode_READY(self) == -1)
10313 return -1;
10314 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315}
10316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010317PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010320Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010321done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322
10323static PyObject *
10324unicode_ljust(PyUnicodeObject *self, PyObject *args)
10325{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010326 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 Py_UCS4 fillchar = ' ';
10328
10329 if (PyUnicode_READY(self) == -1)
10330 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010331
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010332 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333 return NULL;
10334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336 Py_INCREF(self);
10337 return (PyObject*) self;
10338 }
10339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341}
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347
10348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010349unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 return fixup(self, fixlower);
10352}
10353
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
10362
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010363/* externally visible for str.strip(unicode) */
10364PyObject *
10365_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 void *data;
10368 int kind;
10369 Py_ssize_t i, j, len;
10370 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10373 return NULL;
10374
10375 kind = PyUnicode_KIND(self);
10376 data = PyUnicode_DATA(self);
10377 len = PyUnicode_GET_LENGTH(self);
10378 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10379 PyUnicode_DATA(sepobj),
10380 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010381
Benjamin Peterson14339b62009-01-31 16:36:08 +000010382 i = 0;
10383 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 while (i < len &&
10385 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010386 i++;
10387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010388 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010389
Benjamin Peterson14339b62009-01-31 16:36:08 +000010390 j = len;
10391 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 do {
10393 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 } while (j >= i &&
10395 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010397 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010398
Benjamin Peterson14339b62009-01-31 16:36:08 +000010399 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010400 Py_INCREF(self);
10401 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010402 }
10403 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010405}
10406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407/* Assumes an already ready self string. */
10408
10409static PyObject *
10410substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10411{
10412 const int kind = PyUnicode_KIND(self);
10413 void *data = PyUnicode_DATA(self);
10414 Py_UCS4 maxchar = 0;
10415 Py_ssize_t i;
10416 PyObject *unicode;
10417
10418 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10419 PyErr_BadInternalCall();
10420 return NULL;
10421 }
10422
10423 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10424 Py_INCREF(self);
10425 return (PyObject*)self;
10426 }
10427
10428 for (i = 0; i < len; ++i) {
10429 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10430 if (ch > maxchar)
10431 maxchar = ch;
10432 }
10433
10434 unicode = PyUnicode_New(len, maxchar);
10435 if (unicode == NULL)
10436 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010437 if (PyUnicode_CopyCharacters(unicode, 0,
10438 (PyObject*)self, start, len) < 0)
10439 {
10440 Py_DECREF(unicode);
10441 return NULL;
10442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 return unicode;
10444}
10445
10446PyObject*
10447PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10448{
10449 unsigned char *data;
10450 int kind;
10451
10452 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10453 && PyUnicode_CheckExact(self))
10454 {
10455 Py_INCREF(self);
10456 return (PyObject *)self;
10457 }
10458
10459 if ((end - start) == 1)
10460 return unicode_getitem((PyUnicodeObject*)self, start);
10461
10462 if (PyUnicode_READY(self) == -1)
10463 return NULL;
10464 kind = PyUnicode_KIND(self);
10465 data = PyUnicode_1BYTE_DATA(self);
10466 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10467 end-start);
10468}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469
10470static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010471do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 int kind;
10474 void *data;
10475 Py_ssize_t len, i, j;
10476
10477 if (PyUnicode_READY(self) == -1)
10478 return NULL;
10479
10480 kind = PyUnicode_KIND(self);
10481 data = PyUnicode_DATA(self);
10482 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010483
Benjamin Peterson14339b62009-01-31 16:36:08 +000010484 i = 0;
10485 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010487 i++;
10488 }
10489 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010490
Benjamin Peterson14339b62009-01-31 16:36:08 +000010491 j = len;
10492 if (striptype != LEFTSTRIP) {
10493 do {
10494 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010496 j++;
10497 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010498
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10500 Py_INCREF(self);
10501 return (PyObject*)self;
10502 }
10503 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505}
10506
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010507
10508static PyObject *
10509do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10510{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010512
Benjamin Peterson14339b62009-01-31 16:36:08 +000010513 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10514 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010515
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 if (sep != NULL && sep != Py_None) {
10517 if (PyUnicode_Check(sep))
10518 return _PyUnicode_XStrip(self, striptype, sep);
10519 else {
10520 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 "%s arg must be None or str",
10522 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 return NULL;
10524 }
10525 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010528}
10529
10530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010533\n\
10534Return a copy of the string S with leading and trailing\n\
10535whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010536If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010537
10538static PyObject *
10539unicode_strip(PyUnicodeObject *self, PyObject *args)
10540{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 if (PyTuple_GET_SIZE(args) == 0)
10542 return do_strip(self, BOTHSTRIP); /* Common case */
10543 else
10544 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010545}
10546
10547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010548PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010549 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010550\n\
10551Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010552If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553
10554static PyObject *
10555unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10556{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010557 if (PyTuple_GET_SIZE(args) == 0)
10558 return do_strip(self, LEFTSTRIP); /* Common case */
10559 else
10560 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010561}
10562
10563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010564PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010565 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010566\n\
10567Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010568If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010569
10570static PyObject *
10571unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10572{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 if (PyTuple_GET_SIZE(args) == 0)
10574 return do_strip(self, RIGHTSTRIP); /* Common case */
10575 else
10576 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010577}
10578
10579
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010581unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582{
10583 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 Py_ssize_t nchars, n;
10585 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
Georg Brandl222de0f2009-04-12 12:01:50 +000010587 if (len < 1) {
10588 Py_INCREF(unicode_empty);
10589 return (PyObject *)unicode_empty;
10590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
Tim Peters7a29bd52001-09-12 03:03:31 +000010592 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 /* no repeat, return original string */
10594 Py_INCREF(str);
10595 return (PyObject*) str;
10596 }
Tim Peters8f422462000-09-09 06:13:41 +000010597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (PyUnicode_READY(str) == -1)
10599 return NULL;
10600
Tim Peters8f422462000-09-09 06:13:41 +000010601 /* ensure # of chars needed doesn't overflow int and # of bytes
10602 * needed doesn't overflow size_t
10603 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 nchars = len * PyUnicode_GET_LENGTH(str);
10605 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010606 PyErr_SetString(PyExc_OverflowError,
10607 "repeated string is too long");
10608 return NULL;
10609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 char_size = PyUnicode_CHARACTER_SIZE(str);
10611 nbytes = (nchars + 1) * char_size;
10612 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010613 PyErr_SetString(PyExc_OverflowError,
10614 "repeated string is too long");
10615 return NULL;
10616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 if (!u)
10619 return NULL;
10620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (PyUnicode_GET_LENGTH(str) == 1) {
10622 const int kind = PyUnicode_KIND(str);
10623 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10624 void *to = PyUnicode_DATA(u);
10625 for (n = 0; n < len; ++n)
10626 PyUnicode_WRITE(kind, to, n, fill_char);
10627 }
10628 else {
10629 /* number of characters copied this far */
10630 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10631 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10632 char *to = (char *) PyUnicode_DATA(u);
10633 Py_MEMCPY(to, PyUnicode_DATA(str),
10634 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 n = (done <= nchars-done) ? done : nchars-done;
10637 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640 }
10641
10642 return (PyObject*) u;
10643}
10644
Alexander Belopolsky40018472011-02-26 01:02:56 +000010645PyObject *
10646PyUnicode_Replace(PyObject *obj,
10647 PyObject *subobj,
10648 PyObject *replobj,
10649 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650{
10651 PyObject *self;
10652 PyObject *str1;
10653 PyObject *str2;
10654 PyObject *result;
10655
10656 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010661 Py_DECREF(self);
10662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663 }
10664 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 Py_DECREF(self);
10667 Py_DECREF(str1);
10668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 Py_DECREF(self);
10672 Py_DECREF(str1);
10673 Py_DECREF(str2);
10674 return result;
10675}
10676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010677PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010678 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679\n\
10680Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010681old replaced by new. If the optional argument count is\n\
10682given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
10684static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 PyObject *str1;
10688 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010689 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 PyObject *result;
10691
Martin v. Löwis18e16552006-02-15 17:27:45 +000010692 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 str1 = PyUnicode_FromObject(str1);
10697 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10698 return NULL;
10699 str2 = PyUnicode_FromObject(str2);
10700 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 Py_DECREF(str1);
10702 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704
10705 result = replace(self, str1, str2, maxcount);
10706
10707 Py_DECREF(str1);
10708 Py_DECREF(str2);
10709 return result;
10710}
10711
/* repr() of a string: return a new string holding the quoted, escaped
   form of unicode.

   Works in two passes over the input.  The first pass computes the exact
   output length, counts single and double quotes, and tracks the largest
   character that will be copied unescaped; the second pass writes into a
   string allocated with exactly that size.  The two passes must classify
   every character identically, otherwise the second pass would write
   past (or stop short of) the allocated length. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted as one character here; the extra backslash
           is added after the loop once the quote style is known. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    /* Choose the quote character: single quotes unless the string contains
       single quotes and no double quotes. */
    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Opening and closing quote are both written up front */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    /* i indexes the input, o the next free slot in the output */
    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ch >= 0x10000) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    return repr;
}
10861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010862PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010863 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864\n\
10865Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010866such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867arguments start and end are interpreted as in slice notation.\n\
10868\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870
10871static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873{
Jesus Ceaac451502011-04-20 17:09:23 +020010874 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010875 Py_ssize_t start;
10876 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010877 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878
Jesus Ceaac451502011-04-20 17:09:23 +020010879 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10880 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (PyUnicode_READY(self) == -1)
10884 return NULL;
10885 if (PyUnicode_READY(substring) == -1)
10886 return NULL;
10887
10888 result = any_find_slice(
10889 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10890 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
10893 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (result == -2)
10896 return NULL;
10897
Christian Heimes217cfd12007-12-02 14:31:20 +000010898 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899}
10900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010901PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010904Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
10906static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908{
Jesus Ceaac451502011-04-20 17:09:23 +020010909 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010910 Py_ssize_t start;
10911 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010912 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
Jesus Ceaac451502011-04-20 17:09:23 +020010914 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10915 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (PyUnicode_READY(self) == -1)
10919 return NULL;
10920 if (PyUnicode_READY(substring) == -1)
10921 return NULL;
10922
10923 result = any_find_slice(
10924 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10925 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
10928 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (result == -2)
10931 return NULL;
10932
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933 if (result < 0) {
10934 PyErr_SetString(PyExc_ValueError, "substring not found");
10935 return NULL;
10936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937
Christian Heimes217cfd12007-12-02 14:31:20 +000010938 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
10940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010944Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010945done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject *
10948unicode_rjust(PyUnicodeObject *self, PyObject *args)
10949{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010950 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 Py_UCS4 fillchar = ' ';
10952
10953 if (PyUnicode_READY(self) == -1)
10954 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010955
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010956 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 return NULL;
10958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 Py_INCREF(self);
10961 return (PyObject*) self;
10962 }
10963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965}
10966
Alexander Belopolsky40018472011-02-26 01:02:56 +000010967PyObject *
10968PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
10970 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010971
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 s = PyUnicode_FromObject(s);
10973 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010974 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 if (sep != NULL) {
10976 sep = PyUnicode_FromObject(sep);
10977 if (sep == NULL) {
10978 Py_DECREF(s);
10979 return NULL;
10980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 }
10982
10983 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10984
10985 Py_DECREF(s);
10986 Py_XDECREF(sep);
10987 return result;
10988}
10989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010990PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992\n\
10993Return a list of the words in S, using sep as the\n\
10994delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010995splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010996whitespace string is a separator and empty strings are\n\
10997removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
10999static PyObject*
11000unicode_split(PyUnicodeObject *self, PyObject *args)
11001{
11002 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011003 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004
Martin v. Löwis18e16552006-02-15 17:27:45 +000011005 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 return NULL;
11007
11008 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014}
11015
Thomas Wouters477c8d52006-05-27 19:21:47 +000011016PyObject *
11017PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11018{
11019 PyObject* str_obj;
11020 PyObject* sep_obj;
11021 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 int kind1, kind2, kind;
11023 void *buf1 = NULL, *buf2 = NULL;
11024 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011025
11026 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011031 Py_DECREF(str_obj);
11032 return NULL;
11033 }
11034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 kind1 = PyUnicode_KIND(str_in);
11036 kind2 = PyUnicode_KIND(sep_obj);
11037 kind = kind1 > kind2 ? kind1 : kind2;
11038 buf1 = PyUnicode_DATA(str_in);
11039 if (kind1 != kind)
11040 buf1 = _PyUnicode_AsKind(str_in, kind);
11041 if (!buf1)
11042 goto onError;
11043 buf2 = PyUnicode_DATA(sep_obj);
11044 if (kind2 != kind)
11045 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11046 if (!buf2)
11047 goto onError;
11048 len1 = PyUnicode_GET_LENGTH(str_obj);
11049 len2 = PyUnicode_GET_LENGTH(sep_obj);
11050
11051 switch(PyUnicode_KIND(str_in)) {
11052 case PyUnicode_1BYTE_KIND:
11053 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11054 break;
11055 case PyUnicode_2BYTE_KIND:
11056 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11057 break;
11058 case PyUnicode_4BYTE_KIND:
11059 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11060 break;
11061 default:
11062 assert(0);
11063 out = 0;
11064 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011065
11066 Py_DECREF(sep_obj);
11067 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 if (kind1 != kind)
11069 PyMem_Free(buf1);
11070 if (kind2 != kind)
11071 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072
11073 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 onError:
11075 Py_DECREF(sep_obj);
11076 Py_DECREF(str_obj);
11077 if (kind1 != kind && buf1)
11078 PyMem_Free(buf1);
11079 if (kind2 != kind && buf2)
11080 PyMem_Free(buf2);
11081 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011082}
11083
11084
11085PyObject *
11086PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11087{
11088 PyObject* str_obj;
11089 PyObject* sep_obj;
11090 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 int kind1, kind2, kind;
11092 void *buf1 = NULL, *buf2 = NULL;
11093 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094
11095 str_obj = PyUnicode_FromObject(str_in);
11096 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098 sep_obj = PyUnicode_FromObject(sep_in);
11099 if (!sep_obj) {
11100 Py_DECREF(str_obj);
11101 return NULL;
11102 }
11103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 kind1 = PyUnicode_KIND(str_in);
11105 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011106 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 buf1 = PyUnicode_DATA(str_in);
11108 if (kind1 != kind)
11109 buf1 = _PyUnicode_AsKind(str_in, kind);
11110 if (!buf1)
11111 goto onError;
11112 buf2 = PyUnicode_DATA(sep_obj);
11113 if (kind2 != kind)
11114 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11115 if (!buf2)
11116 goto onError;
11117 len1 = PyUnicode_GET_LENGTH(str_obj);
11118 len2 = PyUnicode_GET_LENGTH(sep_obj);
11119
11120 switch(PyUnicode_KIND(str_in)) {
11121 case PyUnicode_1BYTE_KIND:
11122 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11123 break;
11124 case PyUnicode_2BYTE_KIND:
11125 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11126 break;
11127 case PyUnicode_4BYTE_KIND:
11128 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11129 break;
11130 default:
11131 assert(0);
11132 out = 0;
11133 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134
11135 Py_DECREF(sep_obj);
11136 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (kind1 != kind)
11138 PyMem_Free(buf1);
11139 if (kind2 != kind)
11140 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141
11142 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 onError:
11144 Py_DECREF(sep_obj);
11145 Py_DECREF(str_obj);
11146 if (kind1 != kind && buf1)
11147 PyMem_Free(buf1);
11148 if (kind2 != kind && buf2)
11149 PyMem_Free(buf2);
11150 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011151}
11152
11153PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011156Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011158found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011159
11160static PyObject*
11161unicode_partition(PyUnicodeObject *self, PyObject *separator)
11162{
11163 return PyUnicode_Partition((PyObject *)self, separator);
11164}
11165
11166PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011167 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011169Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011171separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011172
11173static PyObject*
11174unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11175{
11176 return PyUnicode_RPartition((PyObject *)self, separator);
11177}
11178
Alexander Belopolsky40018472011-02-26 01:02:56 +000011179PyObject *
11180PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011181{
11182 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011183
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011184 s = PyUnicode_FromObject(s);
11185 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011186 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 if (sep != NULL) {
11188 sep = PyUnicode_FromObject(sep);
11189 if (sep == NULL) {
11190 Py_DECREF(s);
11191 return NULL;
11192 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011193 }
11194
11195 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11196
11197 Py_DECREF(s);
11198 Py_XDECREF(sep);
11199 return result;
11200}
11201
11202PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011203 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011204\n\
11205Return a list of the words in S, using sep as the\n\
11206delimiter string, starting at the end of the string and\n\
11207working to the front. If maxsplit is given, at most maxsplit\n\
11208splits are done. If sep is not specified, any whitespace string\n\
11209is a separator.");
11210
11211static PyObject*
11212unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11213{
11214 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011215 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216
Martin v. Löwis18e16552006-02-15 17:27:45 +000011217 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011218 return NULL;
11219
11220 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011222 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011224 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011226}
11227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011228PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230\n\
11231Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011232Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
11235static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011236unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011238 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011239 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011241 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11242 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 return NULL;
11244
Guido van Rossum86662912000-04-11 15:38:46 +000011245 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246}
11247
11248static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011249PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250{
Walter Dörwald346737f2007-05-31 10:44:43 +000011251 if (PyUnicode_CheckExact(self)) {
11252 Py_INCREF(self);
11253 return self;
11254 } else
11255 /* Subtype -- return genuine unicode string with the same value. */
11256 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11257 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
11263Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011267unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 return fixup(self, fixswapcase);
11270}
11271
Georg Brandlceee0772007-11-27 23:48:05 +000011272PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011274\n\
11275Return a translation table usable for str.translate().\n\
11276If there is only one argument, it must be a dictionary mapping Unicode\n\
11277ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011278Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011279If there are two arguments, they must be strings of equal length, and\n\
11280in the resulting dictionary, each character in x will be mapped to the\n\
11281character at the same position in y. If there is a third argument, it\n\
11282must be a string, whose characters will be mapped to None in the result.");
11283
11284static PyObject*
11285unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11286{
11287 PyObject *x, *y = NULL, *z = NULL;
11288 PyObject *new = NULL, *key, *value;
11289 Py_ssize_t i = 0;
11290 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291
Georg Brandlceee0772007-11-27 23:48:05 +000011292 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11293 return NULL;
11294 new = PyDict_New();
11295 if (!new)
11296 return NULL;
11297 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 int x_kind, y_kind, z_kind;
11299 void *x_data, *y_data, *z_data;
11300
Georg Brandlceee0772007-11-27 23:48:05 +000011301 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011302 if (!PyUnicode_Check(x)) {
11303 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11304 "be a string if there is a second argument");
11305 goto err;
11306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011308 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11309 "arguments must have equal length");
11310 goto err;
11311 }
11312 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 x_kind = PyUnicode_KIND(x);
11314 y_kind = PyUnicode_KIND(y);
11315 x_data = PyUnicode_DATA(x);
11316 y_data = PyUnicode_DATA(y);
11317 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11318 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11319 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011320 if (!key || !value)
11321 goto err;
11322 res = PyDict_SetItem(new, key, value);
11323 Py_DECREF(key);
11324 Py_DECREF(value);
11325 if (res < 0)
11326 goto err;
11327 }
11328 /* create entries for deleting chars in z */
11329 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 z_kind = PyUnicode_KIND(z);
11331 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011332 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011334 if (!key)
11335 goto err;
11336 res = PyDict_SetItem(new, key, Py_None);
11337 Py_DECREF(key);
11338 if (res < 0)
11339 goto err;
11340 }
11341 }
11342 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 int kind;
11344 void *data;
11345
Georg Brandlceee0772007-11-27 23:48:05 +000011346 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011347 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011348 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11349 "to maketrans it must be a dict");
11350 goto err;
11351 }
11352 /* copy entries into the new dict, converting string keys to int keys */
11353 while (PyDict_Next(x, &i, &key, &value)) {
11354 if (PyUnicode_Check(key)) {
11355 /* convert string keys to integer keys */
11356 PyObject *newkey;
11357 if (PyUnicode_GET_SIZE(key) != 1) {
11358 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11359 "table must be of length 1");
11360 goto err;
11361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 kind = PyUnicode_KIND(key);
11363 data = PyUnicode_DATA(key);
11364 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011365 if (!newkey)
11366 goto err;
11367 res = PyDict_SetItem(new, newkey, value);
11368 Py_DECREF(newkey);
11369 if (res < 0)
11370 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011371 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011372 /* just keep integer keys */
11373 if (PyDict_SetItem(new, key, value) < 0)
11374 goto err;
11375 } else {
11376 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11377 "be strings or integers");
11378 goto err;
11379 }
11380 }
11381 }
11382 return new;
11383 err:
11384 Py_DECREF(new);
11385 return NULL;
11386}
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390\n\
11391Return a copy of the string S, where all characters have been mapped\n\
11392through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011393Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011394Unmapped characters are left untouched. Characters mapped to None\n\
11395are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
11397static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401}
11402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011409unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 return fixup(self, fixupper);
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011417Pad a numeric string S with zeros on the left, to fill a field\n\
11418of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject *
11421unicode_zfill(PyUnicodeObject *self, PyObject *args)
11422{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011423 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011425 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 int kind;
11427 void *data;
11428 Py_UCS4 chr;
11429
11430 if (PyUnicode_READY(self) == -1)
11431 return NULL;
11432
Martin v. Löwis18e16552006-02-15 17:27:45 +000011433 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 return NULL;
11435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011437 if (PyUnicode_CheckExact(self)) {
11438 Py_INCREF(self);
11439 return (PyObject*) self;
11440 }
11441 else
11442 return PyUnicode_FromUnicode(
11443 PyUnicode_AS_UNICODE(self),
11444 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 }
11447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450 u = pad(self, fill, 0, '0');
11451
Walter Dörwald068325e2002-04-15 13:36:47 +000011452 if (u == NULL)
11453 return NULL;
11454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 kind = PyUnicode_KIND(u);
11456 data = PyUnicode_DATA(u);
11457 chr = PyUnicode_READ(kind, data, fill);
11458
11459 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 PyUnicode_WRITE(kind, data, 0, chr);
11462 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 }
11464
11465 return (PyObject*) u;
11466}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper that returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011476PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011479Return True if S starts with the specified prefix, False otherwise.\n\
11480With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011481With optional end, stop comparing S at that position.\n\
11482prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484static PyObject *
11485unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011488 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011490 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011491 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011492 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493
Jesus Ceaac451502011-04-20 17:09:23 +020011494 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011496 if (PyTuple_Check(subobj)) {
11497 Py_ssize_t i;
11498 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11499 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011501 if (substring == NULL)
11502 return NULL;
11503 result = tailmatch(self, substring, start, end, -1);
11504 Py_DECREF(substring);
11505 if (result) {
11506 Py_RETURN_TRUE;
11507 }
11508 }
11509 /* nothing matched */
11510 Py_RETURN_FALSE;
11511 }
11512 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011513 if (substring == NULL) {
11514 if (PyErr_ExceptionMatches(PyExc_TypeError))
11515 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11516 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011518 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011519 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011521 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011528Return True if S ends with the specified suffix, False otherwise.\n\
11529With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011530With optional end, stop comparing S at that position.\n\
11531suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
11533static PyObject *
11534unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011537 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011539 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011540 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011541 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542
Jesus Ceaac451502011-04-20 17:09:23 +020011543 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011545 if (PyTuple_Check(subobj)) {
11546 Py_ssize_t i;
11547 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11548 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011550 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011552 result = tailmatch(self, substring, start, end, +1);
11553 Py_DECREF(substring);
11554 if (result) {
11555 Py_RETURN_TRUE;
11556 }
11557 }
11558 Py_RETURN_FALSE;
11559 }
11560 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011561 if (substring == NULL) {
11562 if (PyErr_ExceptionMatches(PyExc_TypeError))
11563 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11564 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011566 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011567 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011569 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570}
11571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011573
11574PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011576\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011577Return a formatted version of S, using substitutions from args and kwargs.\n\
11578The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011579
Eric Smith27bbca62010-11-04 17:06:58 +000011580PyDoc_STRVAR(format_map__doc__,
11581 "S.format_map(mapping) -> str\n\
11582\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011583Return a formatted version of S, using substitutions from mapping.\n\
11584The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011585
Eric Smith4a7d76d2008-05-30 18:10:19 +000011586static PyObject *
11587unicode__format__(PyObject* self, PyObject* args)
11588{
11589 PyObject *format_spec;
11590
11591 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11592 return NULL;
11593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11595 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011596}
11597
Eric Smith8c663262007-08-25 02:26:07 +000011598PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011600\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011601Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011602
11603static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011604unicode__sizeof__(PyUnicodeObject *v)
11605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 Py_ssize_t size;
11607
11608 /* If it's a compact object, account for base structure +
11609 character data. */
11610 if (PyUnicode_IS_COMPACT_ASCII(v))
11611 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11612 else if (PyUnicode_IS_COMPACT(v))
11613 size = sizeof(PyCompactUnicodeObject) +
11614 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11615 else {
11616 /* If it is a two-block object, account for base object, and
11617 for character block if present. */
11618 size = sizeof(PyUnicodeObject);
11619 if (v->data.any)
11620 size += (PyUnicode_GET_LENGTH(v) + 1) *
11621 PyUnicode_CHARACTER_SIZE(v);
11622 }
11623 /* If the wstr pointer is present, account for it unless it is shared
11624 with the data pointer. Since PyUnicode_DATA will crash if the object
11625 is not ready, check whether it's either not ready (in which case the
11626 data is entirely in wstr) or if the data is not shared. */
11627 if (_PyUnicode_WSTR(v) &&
11628 (!PyUnicode_IS_READY(v) ||
11629 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11630 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11631 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11632 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11633
11634 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011635}
11636
11637PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011639
11640static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011641unicode_getnewargs(PyUnicodeObject *v)
11642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 PyObject *copy;
11644 unsigned char *data;
11645 int kind;
11646 if (PyUnicode_READY(v) == -1)
11647 return NULL;
11648 kind = PyUnicode_KIND(v);
11649 data = PyUnicode_1BYTE_DATA(v);
11650 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11651 if (!copy)
11652 return NULL;
11653 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011654}
11655
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656static PyMethodDef unicode_methods[] = {
11657
11658 /* Order is according to common usage: often used methods should
11659 appear first, since lookup is done sequentially. */
11660
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011661 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011662 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11663 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011664 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011665 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11666 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11667 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11668 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11669 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11670 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11671 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011672 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011673 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11674 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11675 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011677 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11678 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11679 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011681 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011682 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011684 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11685 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11686 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11687 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11688 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11689 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11690 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11691 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11692 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11693 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11694 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11695 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11696 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11697 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011698 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011699 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011700 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011701 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011702 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011703 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011704 {"maketrans", (PyCFunction) unicode_maketrans,
11705 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011706 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011707#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011708 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709#endif
11710
11711#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011712 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011713 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714#endif
11715
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 {NULL, NULL}
11718};
11719
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011720static PyObject *
11721unicode_mod(PyObject *v, PyObject *w)
11722{
Brian Curtindfc80e32011-08-10 20:28:54 -050011723 if (!PyUnicode_Check(v))
11724 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011726}
11727
/* Number-protocol slot table for str.

   Only nb_remainder is populated (with unicode_mod); the three slots
   before it are explicitly empty, and the slots after it are omitted from
   the initializer, which leaves them zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
11734
/* Sequence-protocol slot table for str.

   Length, concatenation, repetition, item access and containment are
   provided; the slice and assignment slots are left empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
11745
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011746static PyObject*
11747unicode_subscript(PyUnicodeObject* self, PyObject* item)
11748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (PyUnicode_READY(self) == -1)
11750 return NULL;
11751
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011752 if (PyIndex_Check(item)) {
11753 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011754 if (i == -1 && PyErr_Occurred())
11755 return NULL;
11756 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011758 return unicode_getitem(self, i);
11759 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011760 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011762 Py_UNICODE* result_buf;
11763 PyObject* result;
11764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011767 return NULL;
11768 }
11769
11770 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 return PyUnicode_New(0, 0);
11772 } else if (start == 0 && step == 1 &&
11773 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011774 PyUnicode_CheckExact(self)) {
11775 Py_INCREF(self);
11776 return (PyObject *)self;
11777 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011779 } else {
11780 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011781 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11782 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 if (result_buf == NULL)
11785 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011786
11787 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11788 result_buf[i] = source_buf[cur];
11789 }
Tim Petersced69f82003-09-16 20:30:58 +000011790
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011791 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011792 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793 return result;
11794 }
11795 } else {
11796 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11797 return NULL;
11798 }
11799}
11800
/* Mapping-protocol slot table for str.

   Provides length and subscripting (unicode_subscript handles both index
   and slice objects); the assignment slot is empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
11806
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808/* Helpers for PyUnicode_Format() */
11809
11810static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011811getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011813 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 (*p_argidx)++;
11816 if (arglen < 0)
11817 return args;
11818 else
11819 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 }
11821 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 return NULL;
11824}
11825
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011826/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011828static PyObject *
11829formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011831 char *p;
11832 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 x = PyFloat_AsDouble(v);
11836 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011837 return NULL;
11838
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011841
Eric Smith0923d1d2009-04-16 20:16:10 +000011842 p = PyOS_double_to_string(x, type, prec,
11843 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011844 if (p == NULL)
11845 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011847 PyMem_Free(p);
11848 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849}
11850
Tim Peters38fd5b62000-09-21 05:43:11 +000011851static PyObject*
11852formatlong(PyObject *val, int flags, int prec, int type)
11853{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011854 char *buf;
11855 int len;
11856 PyObject *str; /* temporary string object. */
11857 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011858
Benjamin Peterson14339b62009-01-31 16:36:08 +000011859 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11860 if (!str)
11861 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011863 Py_DECREF(str);
11864 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011865}
11866
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011869 size_t buflen,
11870 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011872 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011873 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (PyUnicode_GET_LENGTH(v) == 1) {
11875 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 buf[1] = '\0';
11877 return 1;
11878 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 goto onError;
11880 }
11881 else {
11882 /* Integer input truncated to a character */
11883 long x;
11884 x = PyLong_AsLong(v);
11885 if (x == -1 && PyErr_Occurred())
11886 goto onError;
11887
11888 if (x < 0 || x > 0x10ffff) {
11889 PyErr_SetString(PyExc_OverflowError,
11890 "%c arg not in range(0x110000)");
11891 return -1;
11892 }
11893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011895 buf[1] = '\0';
11896 return 1;
11897 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011898
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011900 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011902 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).

   FORMATBUFLEN is the number of elements in the small scratch buffer
   (formatbuf in PyUnicode_Format()) in which the single-character
   conversions, '%%' and '%c', are formatted.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011909
Alexander Belopolsky40018472011-02-26 01:02:56 +000011910PyObject *
11911PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 void *fmt;
11914 int fmtkind;
11915 PyObject *result;
11916 Py_UCS4 *res, *res0;
11917 Py_UCS4 max;
11918 int kind;
11919 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 PyErr_BadInternalCall();
11926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11929 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 fmt = PyUnicode_DATA(uformat);
11932 fmtkind = PyUnicode_KIND(uformat);
11933 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11934 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
11936 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11938 if (res0 == NULL) {
11939 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942
11943 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 arglen = PyTuple_Size(args);
11945 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
11947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 arglen = -1;
11949 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011951 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011952 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
11955 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 if (--rescnt < 0) {
11958 rescnt = fmtcnt + 100;
11959 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11961 if (res0 == NULL){
11962 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 }
11965 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011969 }
11970 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 /* Got a format specifier */
11972 int flags = 0;
11973 Py_ssize_t width = -1;
11974 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 Py_UCS4 c = '\0';
11976 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 int isnumok;
11978 PyObject *v = NULL;
11979 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 void *pbuf;
11981 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 Py_ssize_t len, len1;
11984 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 fmtpos++;
11987 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11988 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 Py_ssize_t keylen;
11990 PyObject *key;
11991 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011992
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 if (dict == NULL) {
11994 PyErr_SetString(PyExc_TypeError,
11995 "format requires a mapping");
11996 goto onError;
11997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 /* Skip over balanced parentheses */
12002 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 if (fmtcnt < 0 || pcount > 0) {
12011 PyErr_SetString(PyExc_ValueError,
12012 "incomplete format key");
12013 goto onError;
12014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 if (key == NULL)
12017 goto onError;
12018 if (args_owned) {
12019 Py_DECREF(args);
12020 args_owned = 0;
12021 }
12022 args = PyObject_GetItem(dict, key);
12023 Py_DECREF(key);
12024 if (args == NULL) {
12025 goto onError;
12026 }
12027 args_owned = 1;
12028 arglen = -1;
12029 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012030 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012031 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 case '-': flags |= F_LJUST; continue;
12034 case '+': flags |= F_SIGN; continue;
12035 case ' ': flags |= F_BLANK; continue;
12036 case '#': flags |= F_ALT; continue;
12037 case '0': flags |= F_ZERO; continue;
12038 }
12039 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012040 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 if (c == '*') {
12042 v = getnextarg(args, arglen, &argidx);
12043 if (v == NULL)
12044 goto onError;
12045 if (!PyLong_Check(v)) {
12046 PyErr_SetString(PyExc_TypeError,
12047 "* wants int");
12048 goto onError;
12049 }
12050 width = PyLong_AsLong(v);
12051 if (width == -1 && PyErr_Occurred())
12052 goto onError;
12053 if (width < 0) {
12054 flags |= F_LJUST;
12055 width = -width;
12056 }
12057 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 }
12060 else if (c >= '0' && c <= '9') {
12061 width = c - '0';
12062 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 if (c < '0' || c > '9')
12065 break;
12066 if ((width*10) / 10 != width) {
12067 PyErr_SetString(PyExc_ValueError,
12068 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012069 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 }
12071 width = width*10 + (c - '0');
12072 }
12073 }
12074 if (c == '.') {
12075 prec = 0;
12076 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 if (c == '*') {
12079 v = getnextarg(args, arglen, &argidx);
12080 if (v == NULL)
12081 goto onError;
12082 if (!PyLong_Check(v)) {
12083 PyErr_SetString(PyExc_TypeError,
12084 "* wants int");
12085 goto onError;
12086 }
12087 prec = PyLong_AsLong(v);
12088 if (prec == -1 && PyErr_Occurred())
12089 goto onError;
12090 if (prec < 0)
12091 prec = 0;
12092 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 }
12095 else if (c >= '0' && c <= '9') {
12096 prec = c - '0';
12097 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 if (c < '0' || c > '9')
12100 break;
12101 if ((prec*10) / 10 != prec) {
12102 PyErr_SetString(PyExc_ValueError,
12103 "prec too big");
12104 goto onError;
12105 }
12106 prec = prec*10 + (c - '0');
12107 }
12108 }
12109 } /* prec */
12110 if (fmtcnt >= 0) {
12111 if (c == 'h' || c == 'l' || c == 'L') {
12112 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 }
12115 }
12116 if (fmtcnt < 0) {
12117 PyErr_SetString(PyExc_ValueError,
12118 "incomplete format");
12119 goto onError;
12120 }
12121 if (c != '%') {
12122 v = getnextarg(args, arglen, &argidx);
12123 if (v == NULL)
12124 goto onError;
12125 }
12126 sign = 0;
12127 fill = ' ';
12128 switch (c) {
12129
12130 case '%':
12131 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 len = 1;
12136 break;
12137
12138 case 's':
12139 case 'r':
12140 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012141 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 temp = v;
12143 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 }
12145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 if (c == 's')
12147 temp = PyObject_Str(v);
12148 else if (c == 'r')
12149 temp = PyObject_Repr(v);
12150 else
12151 temp = PyObject_ASCII(v);
12152 if (temp == NULL)
12153 goto onError;
12154 if (PyUnicode_Check(temp))
12155 /* nothing to do */;
12156 else {
12157 Py_DECREF(temp);
12158 PyErr_SetString(PyExc_TypeError,
12159 "%s argument has non-string str()");
12160 goto onError;
12161 }
12162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 if (PyUnicode_READY(temp) == -1) {
12164 Py_CLEAR(temp);
12165 goto onError;
12166 }
12167 pbuf = PyUnicode_DATA(temp);
12168 kind = PyUnicode_KIND(temp);
12169 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 if (prec >= 0 && len > prec)
12171 len = prec;
12172 break;
12173
12174 case 'i':
12175 case 'd':
12176 case 'u':
12177 case 'o':
12178 case 'x':
12179 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 isnumok = 0;
12181 if (PyNumber_Check(v)) {
12182 PyObject *iobj=NULL;
12183
12184 if (PyLong_Check(v)) {
12185 iobj = v;
12186 Py_INCREF(iobj);
12187 }
12188 else {
12189 iobj = PyNumber_Long(v);
12190 }
12191 if (iobj!=NULL) {
12192 if (PyLong_Check(iobj)) {
12193 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012194 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 Py_DECREF(iobj);
12196 if (!temp)
12197 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (PyUnicode_READY(temp) == -1) {
12199 Py_CLEAR(temp);
12200 goto onError;
12201 }
12202 pbuf = PyUnicode_DATA(temp);
12203 kind = PyUnicode_KIND(temp);
12204 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012205 sign = 1;
12206 }
12207 else {
12208 Py_DECREF(iobj);
12209 }
12210 }
12211 }
12212 if (!isnumok) {
12213 PyErr_Format(PyExc_TypeError,
12214 "%%%c format: a number is required, "
12215 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12216 goto onError;
12217 }
12218 if (flags & F_ZERO)
12219 fill = '0';
12220 break;
12221
12222 case 'e':
12223 case 'E':
12224 case 'f':
12225 case 'F':
12226 case 'g':
12227 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012228 temp = formatfloat(v, flags, prec, c);
12229 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (PyUnicode_READY(temp) == -1) {
12232 Py_CLEAR(temp);
12233 goto onError;
12234 }
12235 pbuf = PyUnicode_DATA(temp);
12236 kind = PyUnicode_KIND(temp);
12237 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012238 sign = 1;
12239 if (flags & F_ZERO)
12240 fill = '0';
12241 break;
12242
12243 case 'c':
12244 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12247 if (len < 0)
12248 goto onError;
12249 break;
12250
12251 default:
12252 PyErr_Format(PyExc_ValueError,
12253 "unsupported format character '%c' (0x%x) "
12254 "at index %zd",
12255 (31<=c && c<=126) ? (char)c : '?',
12256 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 goto onError;
12259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 /* pbuf is initialized here. */
12261 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12264 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12265 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 len--;
12267 }
12268 else if (flags & F_SIGN)
12269 sign = '+';
12270 else if (flags & F_BLANK)
12271 sign = ' ';
12272 else
12273 sign = 0;
12274 }
12275 if (width < len)
12276 width = len;
12277 if (rescnt - (sign != 0) < width) {
12278 reslen -= rescnt;
12279 rescnt = width + fmtcnt + 100;
12280 reslen += rescnt;
12281 if (reslen < 0) {
12282 Py_XDECREF(temp);
12283 PyErr_NoMemory();
12284 goto onError;
12285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12287 if (res0 == 0) {
12288 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 Py_XDECREF(temp);
12290 goto onError;
12291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 }
12294 if (sign) {
12295 if (fill != ' ')
12296 *res++ = sign;
12297 rescnt--;
12298 if (width > len)
12299 width--;
12300 }
12301 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12303 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12306 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 }
12308 rescnt -= 2;
12309 width -= 2;
12310 if (width < 0)
12311 width = 0;
12312 len -= 2;
12313 }
12314 if (width > len && !(flags & F_LJUST)) {
12315 do {
12316 --rescnt;
12317 *res++ = fill;
12318 } while (--width > len);
12319 }
12320 if (fill == ' ') {
12321 if (sign)
12322 *res++ = sign;
12323 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12325 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12326 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12327 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 }
12329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 /* Copy all characters, preserving len */
12331 len1 = len;
12332 while (len1--) {
12333 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12334 rescnt--;
12335 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 while (--width >= len) {
12337 --rescnt;
12338 *res++ = ' ';
12339 }
12340 if (dict && (argidx < arglen) && c != '%') {
12341 PyErr_SetString(PyExc_TypeError,
12342 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012343 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 goto onError;
12345 }
12346 Py_XDECREF(temp);
12347 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348 } /* until end */
12349 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 PyErr_SetString(PyExc_TypeError,
12351 "not all arguments converted during string formatting");
12352 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353 }
12354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355
12356 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12357 if (*res > max)
12358 max = *res;
12359 result = PyUnicode_New(reslen - rescnt, max);
12360 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 kind = PyUnicode_KIND(result);
12363 for (res = res0; res < res0+reslen-rescnt; res++)
12364 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12365 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 }
12369 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370 return (PyObject *)result;
12371
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374 Py_DECREF(uformat);
12375 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377 }
12378 return NULL;
12379}
12380
Jeremy Hylton938ace62002-07-17 16:30:39 +000012381static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012382unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12383
Tim Peters6d6c1a32001-08-02 04:15:00 +000012384static PyObject *
12385unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12386{
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 static char *kwlist[] = {"object", "encoding", "errors", 0};
12389 char *encoding = NULL;
12390 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012391
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 if (type != &PyUnicode_Type)
12393 return unicode_subtype_new(type, args, kwds);
12394 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012396 return NULL;
12397 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 if (encoding == NULL && errors == NULL)
12400 return PyObject_Str(x);
12401 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012403}
12404
Guido van Rossume023fe02001-08-30 03:12:59 +000012405static PyObject *
12406unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 PyUnicodeObject *tmp, *pnew;
12409 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012411
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12413 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12414 if (tmp == NULL)
12415 return NULL;
12416 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12418 // it seems kind of strange that tp_alloc gets passed the size
12419 // of the unicode string because there will follow another
12420 // malloc.
12421 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12422 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 if (pnew == NULL) {
12424 Py_DECREF(tmp);
12425 return NULL;
12426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12428 if (_PyUnicode_WSTR(pnew) == NULL) {
12429 err = PyErr_NoMemory();
12430 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12433 _PyUnicode_WSTR_LENGTH(pnew) = n;
12434 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12435 _PyUnicode_STATE(pnew).interned = 0;
12436 _PyUnicode_STATE(pnew).kind = 0;
12437 _PyUnicode_STATE(pnew).compact = 0;
12438 _PyUnicode_STATE(pnew).ready = 0;
12439 _PyUnicode_STATE(pnew).ascii = 0;
12440 pnew->data.any = NULL;
12441 _PyUnicode_LENGTH(pnew) = 0;
12442 pnew->_base.utf8 = NULL;
12443 pnew->_base.utf8_length = 0;
12444
12445 if (PyUnicode_READY(pnew) == -1) {
12446 PyObject_FREE(_PyUnicode_WSTR(pnew));
12447 goto onError;
12448 }
12449
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 Py_DECREF(tmp);
12451 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452
12453 onError:
12454 _Py_ForgetReference((PyObject *)pnew);
12455 PyObject_Del(pnew);
12456 Py_DECREF(tmp);
12457 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012458}
12459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012460PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012461 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012462\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012463Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012464encoding defaults to the current default string encoding.\n\
12465errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012466
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012467static PyObject *unicode_iter(PyObject *seq);
12468
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012470 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 "str", /* tp_name */
12472 sizeof(PyUnicodeObject), /* tp_size */
12473 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 (destructor)unicode_dealloc, /* tp_dealloc */
12476 0, /* tp_print */
12477 0, /* tp_getattr */
12478 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012479 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012480 unicode_repr, /* tp_repr */
12481 &unicode_as_number, /* tp_as_number */
12482 &unicode_as_sequence, /* tp_as_sequence */
12483 &unicode_as_mapping, /* tp_as_mapping */
12484 (hashfunc) unicode_hash, /* tp_hash*/
12485 0, /* tp_call*/
12486 (reprfunc) unicode_str, /* tp_str */
12487 PyObject_GenericGetAttr, /* tp_getattro */
12488 0, /* tp_setattro */
12489 0, /* tp_as_buffer */
12490 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 unicode_doc, /* tp_doc */
12493 0, /* tp_traverse */
12494 0, /* tp_clear */
12495 PyUnicode_RichCompare, /* tp_richcompare */
12496 0, /* tp_weaklistoffset */
12497 unicode_iter, /* tp_iter */
12498 0, /* tp_iternext */
12499 unicode_methods, /* tp_methods */
12500 0, /* tp_members */
12501 0, /* tp_getset */
12502 &PyBaseObject_Type, /* tp_base */
12503 0, /* tp_dict */
12504 0, /* tp_descr_get */
12505 0, /* tp_descr_set */
12506 0, /* tp_dictoffset */
12507 0, /* tp_init */
12508 0, /* tp_alloc */
12509 unicode_new, /* tp_new */
12510 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511};
12512
12513/* Initialize the Unicode implementation */
12514
Thomas Wouters78890102000-07-22 19:25:51 +000012515void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012517 int i;
12518
Thomas Wouters477c8d52006-05-27 19:21:47 +000012519 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012521 0x000A, /* LINE FEED */
12522 0x000D, /* CARRIAGE RETURN */
12523 0x001C, /* FILE SEPARATOR */
12524 0x001D, /* GROUP SEPARATOR */
12525 0x001E, /* RECORD SEPARATOR */
12526 0x0085, /* NEXT LINE */
12527 0x2028, /* LINE SEPARATOR */
12528 0x2029, /* PARAGRAPH SEPARATOR */
12529 };
12530
Fred Drakee4315f52000-05-09 19:53:39 +000012531 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012533 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012535
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012536 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012537 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012538 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012540
12541 /* initialize the linebreak bloom filter */
12542 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 PyUnicode_2BYTE_KIND, linebreak,
12544 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012545
12546 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547}
12548
12549/* Finalize the Unicode implementation */
12550
/* This implementation has nothing to release here; the function
   unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12556
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557void
Thomas Wouters78890102000-07-22 19:25:51 +000012558_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012560 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012562 Py_XDECREF(unicode_empty);
12563 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012564
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012565 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 if (unicode_latin1[i]) {
12567 Py_DECREF(unicode_latin1[i]);
12568 unicode_latin1[i] = NULL;
12569 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012570 }
Christian Heimesa156e092008-02-16 07:38:31 +000012571 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012573
Walter Dörwald16807132007-05-25 13:52:07 +000012574void
12575PyUnicode_InternInPlace(PyObject **p)
12576{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12578 PyObject *t;
12579 if (s == NULL || !PyUnicode_Check(s))
12580 Py_FatalError(
12581 "PyUnicode_InternInPlace: unicode strings only please!");
12582 /* If it's a subclass, we don't really know what putting
12583 it in the interned dict might do. */
12584 if (!PyUnicode_CheckExact(s))
12585 return;
12586 if (PyUnicode_CHECK_INTERNED(s))
12587 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 if (PyUnicode_READY(s) == -1) {
12589 assert(0 && "ready fail in intern...");
12590 return;
12591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 if (interned == NULL) {
12593 interned = PyDict_New();
12594 if (interned == NULL) {
12595 PyErr_Clear(); /* Don't leave an exception */
12596 return;
12597 }
12598 }
12599 /* It might be that the GetItem call fails even
12600 though the key is present in the dictionary,
12601 namely when this happens during a stack overflow. */
12602 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012605
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 if (t) {
12607 Py_INCREF(t);
12608 Py_DECREF(*p);
12609 *p = t;
12610 return;
12611 }
Walter Dörwald16807132007-05-25 13:52:07 +000012612
Benjamin Peterson14339b62009-01-31 16:36:08 +000012613 PyThreadState_GET()->recursion_critical = 1;
12614 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12615 PyErr_Clear();
12616 PyThreadState_GET()->recursion_critical = 0;
12617 return;
12618 }
12619 PyThreadState_GET()->recursion_critical = 0;
12620 /* The two references in interned are not counted by refcnt.
12621 The deallocator will take care of this */
12622 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012624}
12625
12626void
12627PyUnicode_InternImmortal(PyObject **p)
12628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12630
Benjamin Peterson14339b62009-01-31 16:36:08 +000012631 PyUnicode_InternInPlace(p);
12632 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012634 Py_INCREF(*p);
12635 }
Walter Dörwald16807132007-05-25 13:52:07 +000012636}
12637
12638PyObject *
12639PyUnicode_InternFromString(const char *cp)
12640{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012641 PyObject *s = PyUnicode_FromString(cp);
12642 if (s == NULL)
12643 return NULL;
12644 PyUnicode_InternInPlace(&s);
12645 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012646}
12647
Alexander Belopolsky40018472011-02-26 01:02:56 +000012648void
12649_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012650{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012651 PyObject *keys;
12652 PyUnicodeObject *s;
12653 Py_ssize_t i, n;
12654 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012655
Benjamin Peterson14339b62009-01-31 16:36:08 +000012656 if (interned == NULL || !PyDict_Check(interned))
12657 return;
12658 keys = PyDict_Keys(interned);
12659 if (keys == NULL || !PyList_Check(keys)) {
12660 PyErr_Clear();
12661 return;
12662 }
Walter Dörwald16807132007-05-25 13:52:07 +000012663
Benjamin Peterson14339b62009-01-31 16:36:08 +000012664 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12665 detector, interned unicode strings are not forcibly deallocated;
12666 rather, we give them their stolen references back, and then clear
12667 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012668
Benjamin Peterson14339b62009-01-31 16:36:08 +000012669 n = PyList_GET_SIZE(keys);
12670 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012672 for (i = 0; i < n; i++) {
12673 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 if (PyUnicode_READY(s) == -1)
12675 fprintf(stderr, "could not ready string\n");
12676 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012677 case SSTATE_NOT_INTERNED:
12678 /* XXX Shouldn't happen */
12679 break;
12680 case SSTATE_INTERNED_IMMORTAL:
12681 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 break;
12684 case SSTATE_INTERNED_MORTAL:
12685 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012687 break;
12688 default:
12689 Py_FatalError("Inconsistent interned string state.");
12690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 }
12693 fprintf(stderr, "total size of all interned strings: "
12694 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12695 "mortal/immortal\n", mortal_size, immortal_size);
12696 Py_DECREF(keys);
12697 PyDict_Clear(interned);
12698 Py_DECREF(interned);
12699 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012700}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012701
12702
12703/********************* Unicode Iterator **************************/
12704
12705typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 PyObject_HEAD
12707 Py_ssize_t it_index;
12708 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012709} unicodeiterobject;
12710
12711static void
12712unicodeiter_dealloc(unicodeiterobject *it)
12713{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 _PyObject_GC_UNTRACK(it);
12715 Py_XDECREF(it->it_seq);
12716 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012717}
12718
12719static int
12720unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12721{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012722 Py_VISIT(it->it_seq);
12723 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012724}
12725
12726static PyObject *
12727unicodeiter_next(unicodeiterobject *it)
12728{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 PyUnicodeObject *seq;
12730 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012731
Benjamin Peterson14339b62009-01-31 16:36:08 +000012732 assert(it != NULL);
12733 seq = it->it_seq;
12734 if (seq == NULL)
12735 return NULL;
12736 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12739 int kind = PyUnicode_KIND(seq);
12740 void *data = PyUnicode_DATA(seq);
12741 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12742 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 if (item != NULL)
12744 ++it->it_index;
12745 return item;
12746 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012747
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 Py_DECREF(seq);
12749 it->it_seq = NULL;
12750 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012751}
12752
12753static PyObject *
12754unicodeiter_len(unicodeiterobject *it)
12755{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 Py_ssize_t len = 0;
12757 if (it->it_seq)
12758 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12759 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012760}
12761
12762PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12763
12764static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012765 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012768};
12769
12770PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12772 "str_iterator", /* tp_name */
12773 sizeof(unicodeiterobject), /* tp_basicsize */
12774 0, /* tp_itemsize */
12775 /* methods */
12776 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12777 0, /* tp_print */
12778 0, /* tp_getattr */
12779 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012780 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 0, /* tp_repr */
12782 0, /* tp_as_number */
12783 0, /* tp_as_sequence */
12784 0, /* tp_as_mapping */
12785 0, /* tp_hash */
12786 0, /* tp_call */
12787 0, /* tp_str */
12788 PyObject_GenericGetAttr, /* tp_getattro */
12789 0, /* tp_setattro */
12790 0, /* tp_as_buffer */
12791 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12792 0, /* tp_doc */
12793 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12794 0, /* tp_clear */
12795 0, /* tp_richcompare */
12796 0, /* tp_weaklistoffset */
12797 PyObject_SelfIter, /* tp_iter */
12798 (iternextfunc)unicodeiter_next, /* tp_iternext */
12799 unicodeiter_methods, /* tp_methods */
12800 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012801};
12802
12803static PyObject *
12804unicode_iter(PyObject *seq)
12805{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012807
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 if (!PyUnicode_Check(seq)) {
12809 PyErr_BadInternalCall();
12810 return NULL;
12811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 if (PyUnicode_READY(seq) == -1)
12813 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012814 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12815 if (it == NULL)
12816 return NULL;
12817 it->it_index = 0;
12818 Py_INCREF(seq);
12819 it->it_seq = (PyUnicodeObject *)seq;
12820 _PyObject_GC_TRACK(it);
12821 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012822}
12823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824#define UNIOP(x) Py_UNICODE_##x
12825#define UNIOP_t Py_UNICODE
12826#include "uniops.h"
12827#undef UNIOP
12828#undef UNIOP_t
12829#define UNIOP(x) Py_UCS4_##x
12830#define UNIOP_t Py_UCS4
12831#include "uniops.h"
12832#undef UNIOP
12833#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012834
Victor Stinner71133ff2010-09-01 23:43:53 +000012835Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012836PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012837{
12838 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12839 Py_UNICODE *copy;
12840 Py_ssize_t size;
12841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 if (!PyUnicode_Check(unicode)) {
12843 PyErr_BadArgument();
12844 return NULL;
12845 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012846 /* Ensure we won't overflow the size. */
12847 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12848 PyErr_NoMemory();
12849 return NULL;
12850 }
12851 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12852 size *= sizeof(Py_UNICODE);
12853 copy = PyMem_Malloc(size);
12854 if (copy == NULL) {
12855 PyErr_NoMemory();
12856 return NULL;
12857 }
12858 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12859 return copy;
12860}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012861
Georg Brandl66c221e2010-10-14 07:04:07 +000012862/* A _string module, to export formatter_parser and formatter_field_name_split
12863 to the string.Formatter class implemented in Python. */
12864
12865static PyMethodDef _string_methods[] = {
12866 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12867 METH_O, PyDoc_STR("split the argument as a field name")},
12868 {"formatter_parser", (PyCFunction) formatter_parser,
12869 METH_O, PyDoc_STR("parse the argument as a format string")},
12870 {NULL, NULL}
12871};
12872
12873static struct PyModuleDef _string_module = {
12874 PyModuleDef_HEAD_INIT,
12875 "_string",
12876 PyDoc_STR("string helper module"),
12877 0,
12878 _string_methods,
12879 NULL,
12880 NULL,
12881 NULL,
12882 NULL
12883};
12884
12885PyMODINIT_FUNC
12886PyInit__string(void)
12887{
12888 return PyModule_Create(&_string_module);
12889}
12890
12891
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012892#ifdef __cplusplus
12893}
12894#endif