blob: ff3d4b6e0a6aa03f317942328589f37053eb3c1a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Upper bound on the number of entries in the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "keep alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves

       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)

   bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type with a plain cast.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters (end is
   exclusive).  to is a pointer to the destination buffer; it is cast to
   "to_type *" and receives one converted character per source character.

   begin and to are evaluated once; end is re-evaluated on every
   iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *iter_ = (begin);                            \
        to_type *to_ = (to_type *)(to);                              \
        while (iter_ < (end)) {                                      \
            *to_++ = (to_type)*iter_++;                              \
        }                                                            \
    } while (0)
107
/* Raw field accessors.  Each one casts op to the struct that declares the
   field and names the member directly, so the result can be read or
   assigned to. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked read accessors: assert that op passes PyUnicode_Check() before
   yielding the kind / length field. */
#define _PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), _PyUnicode_STATE(op).kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), _PyUnicode_LENGTH(op))
119
120
Walter Dörwald16807132007-05-25 13:52:07 +0000121/* This dictionary holds all interned unicode strings. Note that references
122 to strings in this dictionary are *not* counted in the string's ob_refcnt.
123 When the interned string reaches a refcnt of 0 the string deallocation
124 function will delete the reference from this dictionary.
125
126 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000127 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000128*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by ASCII code; an entry is 1 when the
   character is treated as whitespace and 0 otherwise.  The entries set
   to 1 are:

     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Same idea as the whitespace table, but for line breaks.

   128-entry lookup table indexed by ASCII code; an entry is 1 when the
   character is treated as a line break and 0 otherwise.  The entries set
   to 1 are:

     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
225/* stuff to implement simple "bloom filters" for Unicode characters.
226 to keep things simple, we use a single bitmask, using the least 5
227 bits from each unicode characters as the bit index. */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241#define BLOOM_MASK unsigned long
242
243static BLOOM_MASK bloom_linebreak;
244
Antoine Pitrouf068f942010-01-13 14:19:12 +0000245#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
246#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247
Benjamin Peterson29060642009-01-31 22:14:21 +0000248#define BLOOM_LINEBREAK(ch) \
249 ((ch) < 128U ? ascii_linebreak[(ch)] : \
250 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
/* Membership test backed by a bloom mask: the mask is checked first, and
   only when the bit for chr is set is str actually searched with
   PyUnicode_FindChar() over its full length.  Note that chr and str are
   each expanded more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                      \
    (BLOOM(mask, chr) &&                                                  \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
363/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000364 Ux0000 terminated; some code (e.g. new_identifier)
365 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372#ifdef Py_DEBUG
373int unicode_old_new_calls = 0;
374#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
/* Number of objects created through PyUnicode_New() (debug builds only). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout details of a unicode object to stdout, then return
   PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    /* Addresses immediately following each possible header struct. */
    void *after_ascii = (void *)((PyASCIIObject *)(unicode) + 1);
    void *after_compact = (void *)((PyCompactUnicodeObject *)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) to UCS4, writing into the
   4-byte data of unicode.  A high surrogate (0xD800-0xDBFF) directly
   followed, inside the range, by a low surrogate (0xDC00-0xDFFF) is
   combined into one code point; every other unit is copied unchanged.
   The remaining conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* The output must never run past the object's length. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Surrogate pair: 10 payload bits from each half. */
            *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* Exactly length code points must have been produced. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 unsigned int from_kind;
615 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200625 how_many = PY_MIN(PyUnicode_GET_LENGTH(from), how_many);
626 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627 PyErr_Format(PyExc_ValueError,
628 "Cannot write %zi characters at %zi "
629 "in a string of %zi characters",
630 how_many, to_start, PyUnicode_GET_LENGTH(to));
631 return -1;
632 }
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 from_kind = PyUnicode_KIND(from);
635 to_kind = PyUnicode_KIND(to);
636
637 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 /* fast path */
639 Py_MEMCPY((char*)PyUnicode_DATA(to)
640 + PyUnicode_KIND_SIZE(to_kind, to_start),
641 (char*)PyUnicode_DATA(from)
642 + PyUnicode_KIND_SIZE(from_kind, from_start),
643 PyUnicode_KIND_SIZE(to_kind, how_many));
644 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200646
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647 if (from_kind > to_kind) {
648 /* slow path to check for character overflow */
649 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650 void *from_data = PyUnicode_DATA(from);
651 void *to_data = PyUnicode_DATA(to);
652 Py_UCS4 ch, maxchar;
653 Py_ssize_t i;
654 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 maxchar = 0;
657 for (i=0; i < how_many; i++) {
658 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
659 if (ch > maxchar) {
660 maxchar = ch;
661 if (maxchar > to_maxchar) {
662 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200666 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
667 }
668 if (!overflow)
669 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
679 return how_many;
680 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200681 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200682 && to_kind == PyUnicode_4BYTE_KIND)
683 {
684 _PyUnicode_CONVERT_BYTES(
685 Py_UCS1, Py_UCS4,
686 PyUnicode_1BYTE_DATA(from) + from_start,
687 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
688 PyUnicode_4BYTE_DATA(to) + to_start
689 );
690 return how_many;
691 }
692 else if (from_kind == PyUnicode_2BYTE_KIND
693 && to_kind == PyUnicode_4BYTE_KIND)
694 {
695 _PyUnicode_CONVERT_BYTES(
696 Py_UCS2, Py_UCS4,
697 PyUnicode_2BYTE_DATA(from) + from_start,
698 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
699 PyUnicode_4BYTE_DATA(to) + to_start
700 );
701 return how_many;
702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200704 "Cannot copy UCS%u characters "
705 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200706 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200707 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 return -1;
709}
710
711int
712_PyUnicode_FindMaxCharAndNumSurrogatePairs(const wchar_t *begin,
713 const wchar_t *end,
714 Py_UCS4 *maxchar,
715 Py_ssize_t *num_surrogates)
716{
717 const wchar_t *iter;
718
719 if (num_surrogates == NULL || maxchar == NULL) {
720 PyErr_SetString(PyExc_SystemError,
721 "unexpected NULL arguments to "
722 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
723 return -1;
724 }
725
726 *num_surrogates = 0;
727 *maxchar = 0;
728
729 for (iter = begin; iter < end; ) {
730 if (*iter > *maxchar)
731 *maxchar = *iter;
732#if SIZEOF_WCHAR_T == 2
733 if (*iter >= 0xD800 && *iter <= 0xDBFF
734 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
735 {
736 Py_UCS4 surrogate_val;
737 surrogate_val = (((iter[0] & 0x3FF)<<10)
738 | (iter[1] & 0x3FF)) + 0x10000;
739 ++(*num_surrogates);
740 if (surrogate_val > *maxchar)
741 *maxchar = surrogate_val;
742 iter += 2;
743 }
744 else
745 iter++;
746#else
747 iter++;
748#endif
749 }
750 return 0;
751}
752
#if defined(Py_DEBUG)
/* Debug-build counter; starts at zero. */
int unicode_ready_calls = 0;
#endif
756
757int
758_PyUnicode_Ready(PyUnicodeObject *unicode)
759{
760 wchar_t *end;
761 Py_UCS4 maxchar = 0;
762 Py_ssize_t num_surrogates;
763#if SIZEOF_WCHAR_T == 2
764 Py_ssize_t length_wo_surrogates;
765#endif
766
767 assert(PyUnicode_Check(unicode));
768
769 if (unicode->data.any != NULL) {
770 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
771 return 0;
772 }
773
774 /* _PyUnicode_Ready() is only intented for old-style API usage where
775 * strings were created using _PyObject_New() and where no canonical
776 * representation (the str field) has been set yet aka strings
777 * which are not yet ready.
778 */
779 assert(_PyUnicode_WSTR(unicode) != NULL);
780 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
781 assert(!PyUnicode_IS_COMPACT(unicode));
782 assert(!PyUnicode_IS_READY(unicode));
783 /* Actually, it should neither be interned nor be anything else: */
784 assert(_PyUnicode_STATE(unicode).interned == 0);
785 assert(unicode->_base.utf8 == NULL);
786
787#ifdef Py_DEBUG
788 ++unicode_ready_calls;
789#endif
790
791 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
792 if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(_PyUnicode_WSTR(unicode), end,
793 &maxchar,
794 &num_surrogates) == -1) {
795 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
796 return -1;
797 }
798
799 if (maxchar < 256) {
800 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
801 if (!unicode->data.any) {
802 PyErr_NoMemory();
803 return -1;
804 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200805 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 _PyUnicode_WSTR(unicode), end,
807 PyUnicode_1BYTE_DATA(unicode));
808 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
809 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
810 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
811 if (maxchar < 128) {
812 unicode->_base.utf8 = unicode->data.any;
813 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
814 }
815 else {
816 unicode->_base.utf8 = NULL;
817 unicode->_base.utf8_length = 0;
818 }
819 PyObject_FREE(_PyUnicode_WSTR(unicode));
820 _PyUnicode_WSTR(unicode) = NULL;
821 _PyUnicode_WSTR_LENGTH(unicode) = 0;
822 }
823 /* In this case we might have to convert down from 4-byte native
824 wchar_t to 2-byte unicode. */
825 else if (maxchar < 65536) {
826 assert(num_surrogates == 0 &&
827 "FindMaxCharAndNumSurrogatePairs() messed up");
828
829 if (sizeof(wchar_t) == 2) {
830 /* We can share representations and are done. */
831 unicode->data.any = _PyUnicode_WSTR(unicode);
832 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
834 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
835 unicode->_base.utf8 = NULL;
836 unicode->_base.utf8_length = 0;
837 }
838 else {
839 assert(sizeof(wchar_t) == 4);
840
841 unicode->data.any = PyObject_MALLOC(
842 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
843 if (!unicode->data.any) {
844 PyErr_NoMemory();
845 return -1;
846 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200847 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode), end,
849 PyUnicode_2BYTE_DATA(unicode));
850 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
851 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
852 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
853 unicode->_base.utf8 = NULL;
854 unicode->_base.utf8_length = 0;
855 PyObject_FREE(_PyUnicode_WSTR(unicode));
856 _PyUnicode_WSTR(unicode) = NULL;
857 _PyUnicode_WSTR_LENGTH(unicode) = 0;
858 }
859 }
860 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
861 else {
862#if SIZEOF_WCHAR_T == 2
863 /* in case the native representation is 2-bytes, we need to allocate a
864 new normalized 4-byte version. */
865 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
866 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
867 if (!unicode->data.any) {
868 PyErr_NoMemory();
869 return -1;
870 }
871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
873 unicode->_base.utf8 = NULL;
874 unicode->_base.utf8_length = 0;
875 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
876 unicode) < 0) {
877 assert(0 && "ConvertWideCharToUCS4 failed");
878 return -1;
879 }
880 PyObject_FREE(_PyUnicode_WSTR(unicode));
881 _PyUnicode_WSTR(unicode) = NULL;
882 _PyUnicode_WSTR_LENGTH(unicode) = 0;
883#else
884 assert(num_surrogates == 0);
885
886 unicode->data.any = _PyUnicode_WSTR(unicode);
887 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
888 unicode->_base.utf8 = NULL;
889 unicode->_base.utf8_length = 0;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891#endif
892 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
893 }
894 _PyUnicode_STATE(unicode).ready = 1;
895 return 0;
896}
897
Alexander Belopolsky40018472011-02-26 01:02:56 +0000898static void
899unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900{
Walter Dörwald16807132007-05-25 13:52:07 +0000901 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000902 case SSTATE_NOT_INTERNED:
903 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000904
Benjamin Peterson29060642009-01-31 22:14:21 +0000905 case SSTATE_INTERNED_MORTAL:
906 /* revive dead object temporarily for DelItem */
907 Py_REFCNT(unicode) = 3;
908 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
909 Py_FatalError(
910 "deletion of interned string failed");
911 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000912
Benjamin Peterson29060642009-01-31 22:14:21 +0000913 case SSTATE_INTERNED_IMMORTAL:
914 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000915
Benjamin Peterson29060642009-01-31 22:14:21 +0000916 default:
917 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000918 }
919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 if (_PyUnicode_WSTR(unicode) &&
921 (!PyUnicode_IS_READY(unicode) ||
922 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
923 PyObject_DEL(_PyUnicode_WSTR(unicode));
924 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
925 PyObject_DEL(unicode->_base.utf8);
926
927 if (PyUnicode_IS_COMPACT(unicode)) {
928 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000929 }
930 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 if (unicode->data.any)
932 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 }
935}
936
Alexander Belopolsky40018472011-02-26 01:02:56 +0000937static int
938_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000939{
940 register PyUnicodeObject *v;
941
942 /* Argument checks */
943 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 PyErr_BadInternalCall();
945 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000946 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000947 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200948 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
949 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000950 PyErr_BadInternalCall();
951 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000952 }
953
954 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 possible since these are being shared.
956 The same goes for new-representation unicode objects or objects which
957 have already been readied.
958 For these, we simply return a fresh copy with the same Unicode content.
959 */
960 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
961 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
962 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000963 PyUnicodeObject *w = _PyUnicode_New(length);
964 if (w == NULL)
965 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
967 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 Py_DECREF(*unicode);
969 *unicode = w;
970 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000971 }
972
973 /* Note that we don't have to modify *unicode for unshared Unicode
974 objects, since we can modify them in-place. */
975 return unicode_resize(v, length);
976}
977
Alexander Belopolsky40018472011-02-26 01:02:56 +0000978int
979PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000980{
981 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
982}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984static PyObject*
985get_latin1_char(unsigned char ch)
986{
987 PyUnicodeObject *unicode = unicode_latin1[ch];
988 if (!unicode) {
989 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
990 if (!unicode)
991 return NULL;
992 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
993 unicode_latin1[ch] = unicode;
994 }
995 Py_INCREF(unicode);
996 return (PyObject *)unicode;
997}
998
Alexander Belopolsky40018472011-02-26 01:02:56 +0000999PyObject *
1000PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001{
1002 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 Py_UCS4 maxchar = 0;
1004 Py_ssize_t num_surrogates;
1005
1006 if (u == NULL)
1007 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001009 /* If the Unicode data is known at construction time, we can apply
1010 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012 /* Optimization for empty strings */
1013 if (size == 0 && unicode_empty != NULL) {
1014 Py_INCREF(unicode_empty);
1015 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016 }
Tim Petersced69f82003-09-16 20:30:58 +00001017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 /* Single character Unicode objects in the Latin-1 range are
1019 shared when using this constructor */
1020 if (size == 1 && *u < 256)
1021 return get_latin1_char((unsigned char)*u);
1022
1023 /* If not empty and not single character, copy the Unicode data
1024 into the new object */
1025 if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(u, u + size, &maxchar,
1026 &num_surrogates) == -1)
1027 return NULL;
1028
1029 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1030 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 if (!unicode)
1032 return NULL;
1033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 switch (PyUnicode_KIND(unicode)) {
1035 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001036 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1038 break;
1039 case PyUnicode_2BYTE_KIND:
1040#if Py_UNICODE_SIZE == 2
1041 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1042#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001043 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1045#endif
1046 break;
1047 case PyUnicode_4BYTE_KIND:
1048#if SIZEOF_WCHAR_T == 2
1049 /* This is the only case which has to process surrogates, thus
1050 a simple copy loop is not enough and we need a function. */
1051 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1052 Py_DECREF(unicode);
1053 return NULL;
1054 }
1055#else
1056 assert(num_surrogates == 0);
1057 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1058#endif
1059 break;
1060 default:
1061 assert(0 && "Impossible state");
1062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064 return (PyObject *)unicode;
1065}
1066
Alexander Belopolsky40018472011-02-26 01:02:56 +00001067PyObject *
1068PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001069{
1070 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001071
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 if (size < 0) {
1073 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001074 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001075 return NULL;
1076 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001077
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001078 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001079 some optimizations which share commonly used objects.
1080 Also, this means the input must be UTF-8, so fall back to the
1081 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001082 if (u != NULL) {
1083
Benjamin Peterson29060642009-01-31 22:14:21 +00001084 /* Optimization for empty strings */
1085 if (size == 0 && unicode_empty != NULL) {
1086 Py_INCREF(unicode_empty);
1087 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001088 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001089
1090 /* Single characters are shared when using this constructor.
1091 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001092 if (size == 1 && Py_CHARMASK(*u) < 128)
1093 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001094
1095 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 }
1097
Walter Dörwald55507312007-05-18 13:12:10 +00001098 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001099 if (!unicode)
1100 return NULL;
1101
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 return (PyObject *)unicode;
1103}
1104
Alexander Belopolsky40018472011-02-26 01:02:56 +00001105PyObject *
1106PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001107{
1108 size_t size = strlen(u);
1109 if (size > PY_SSIZE_T_MAX) {
1110 PyErr_SetString(PyExc_OverflowError, "input too long");
1111 return NULL;
1112 }
1113
1114 return PyUnicode_FromStringAndSize(u, size);
1115}
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117PyObject*
1118PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 PyObject *res;
1121 unsigned char max = 127;
1122 Py_ssize_t i;
1123 for (i = 0; i < size; i++) {
1124 if (u[i] & 0x80) {
1125 max = 255;
1126 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001127 }
1128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 res = PyUnicode_New(size, max);
1130 if (!res)
1131 return NULL;
1132 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1133 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001134}
1135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136PyObject*
1137PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1138{
1139 PyObject *res;
1140 Py_UCS2 max = 0;
1141 Py_ssize_t i;
1142 for (i = 0; i < size; i++)
1143 if (u[i] > max)
1144 max = u[i];
1145 res = PyUnicode_New(size, max);
1146 if (!res)
1147 return NULL;
1148 if (max >= 256)
1149 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1150 else
1151 for (i = 0; i < size; i++)
1152 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1153 return res;
1154}
1155
1156PyObject*
1157PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1158{
1159 PyObject *res;
1160 Py_UCS4 max = 0;
1161 Py_ssize_t i;
1162 for (i = 0; i < size; i++)
1163 if (u[i] > max)
1164 max = u[i];
1165 res = PyUnicode_New(size, max);
1166 if (!res)
1167 return NULL;
1168 if (max >= 0x10000)
1169 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1170 else {
1171 int kind = PyUnicode_KIND(res);
1172 void *data = PyUnicode_DATA(res);
1173 for (i = 0; i < size; i++)
1174 PyUnicode_WRITE(kind, data, i, u[i]);
1175 }
1176 return res;
1177}
1178
1179PyObject*
1180PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1181{
1182 switch(kind) {
1183 case PyUnicode_1BYTE_KIND:
1184 return PyUnicode_FromUCS1(buffer, size);
1185 case PyUnicode_2BYTE_KIND:
1186 return PyUnicode_FromUCS2(buffer, size);
1187 case PyUnicode_4BYTE_KIND:
1188 return PyUnicode_FromUCS4(buffer, size);
1189 }
1190 assert(0);
1191 return NULL;
1192}
1193
1194
1195/* Widen Unicode objects to larger buffers.
1196 Return NULL if the string is too wide already. */
1197
1198void*
1199_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1200{
1201 Py_ssize_t i;
1202 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1203 void *d = PyUnicode_DATA(s);
1204 unsigned int skind = PyUnicode_KIND(s);
1205 if (PyUnicode_KIND(s) >= kind) {
1206 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1207 return NULL;
1208 }
1209 switch(kind) {
1210 case PyUnicode_2BYTE_KIND: {
1211 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1212 if (!result) {
1213 PyErr_NoMemory();
1214 return 0;
1215 }
1216 for (i = 0; i < len; i++)
1217 result[i] = ((Py_UCS1*)d)[i];
1218 return result;
1219 }
1220 case PyUnicode_4BYTE_KIND: {
1221 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1222 if (!result) {
1223 PyErr_NoMemory();
1224 return 0;
1225 }
1226 for (i = 0; i < len; i++)
1227 result[i] = PyUnicode_READ(skind, d, i);
1228 return result;
1229 }
1230 }
1231 Py_FatalError("invalid kind");
1232 return NULL;
1233}
1234
1235static Py_UCS4*
1236as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1237 int copy_null)
1238{
1239 int kind;
1240 void *data;
1241 Py_ssize_t len, targetlen;
1242 if (PyUnicode_READY(string) == -1)
1243 return NULL;
1244 kind = PyUnicode_KIND(string);
1245 data = PyUnicode_DATA(string);
1246 len = PyUnicode_GET_LENGTH(string);
1247 targetlen = len;
1248 if (copy_null)
1249 targetlen++;
1250 if (!target) {
1251 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1252 PyErr_NoMemory();
1253 return NULL;
1254 }
1255 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1256 if (!target) {
1257 PyErr_NoMemory();
1258 return NULL;
1259 }
1260 }
1261 else {
1262 if (targetsize < targetlen) {
1263 PyErr_Format(PyExc_SystemError,
1264 "string is longer than the buffer");
1265 if (copy_null && 0 < targetsize)
1266 target[0] = 0;
1267 return NULL;
1268 }
1269 }
1270 if (kind != PyUnicode_4BYTE_KIND) {
1271 Py_ssize_t i;
1272 for (i = 0; i < len; i++)
1273 target[i] = PyUnicode_READ(kind, data, i);
1274 }
1275 else
1276 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1277 if (copy_null)
1278 target[len] = 0;
1279 return target;
1280}
1281
1282Py_UCS4*
1283PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1284 int copy_null)
1285{
1286 if (target == NULL || targetsize < 1) {
1287 PyErr_BadInternalCall();
1288 return NULL;
1289 }
1290 return as_ucs4(string, target, targetsize, copy_null);
1291}
1292
1293Py_UCS4*
1294PyUnicode_AsUCS4Copy(PyObject *string)
1295{
1296 return as_ucs4(string, NULL, 0, 1);
1297}
1298
1299#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001300
Alexander Belopolsky40018472011-02-26 01:02:56 +00001301PyObject *
1302PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001305 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 PyErr_BadInternalCall();
1308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309 }
1310
Martin v. Löwis790465f2008-04-05 20:41:37 +00001311 if (size == -1) {
1312 size = wcslen(w);
1313 }
1314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316}
1317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001319
Walter Dörwald346737f2007-05-31 10:44:43 +00001320static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001321makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1322 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001323{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001324 *fmt++ = '%';
1325 if (width) {
1326 if (zeropad)
1327 *fmt++ = '0';
1328 fmt += sprintf(fmt, "%d", width);
1329 }
1330 if (precision)
1331 fmt += sprintf(fmt, ".%d", precision);
1332 if (longflag)
1333 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001334 else if (longlongflag) {
1335 /* longlongflag should only ever be nonzero on machines with
1336 HAVE_LONG_LONG defined */
1337#ifdef HAVE_LONG_LONG
1338 char *f = PY_FORMAT_LONG_LONG;
1339 while (*f)
1340 *fmt++ = *f++;
1341#else
1342 /* we shouldn't ever get here */
1343 assert(0);
1344 *fmt++ = 'l';
1345#endif
1346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001347 else if (size_tflag) {
1348 char *f = PY_FORMAT_SIZE_T;
1349 while (*f)
1350 *fmt++ = *f++;
1351 }
1352 *fmt++ = c;
1353 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001354}
1355
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows its first character (the callers in this file
   pass a pointer to the '%').

   Reads an optional decimal width, an optional ".precision", and an
   optional length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or
   "z"); a modifier is only accepted when directly followed by 'd', 'u' or
   'i'.  Each result is stored through the matching output pointer unless
   that pointer is NULL.

   Returns a pointer to the character that follows the parsed part.  For
   the malformed inputs noted below the pointer is moved back by one
   character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1420
/* Maximum number of characters required for the output of %ld.
   21 characters is enough for a 64-bit integer in decimal together with
   an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, and 53/22
   is an upper bound for log10(256).  (SIZEOF_LONG_LONG*53-1)/22 + 1 is that
   ceiling in integer arithmetic; one more is added for the sign, which
   gives the leading "2 +". */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1428
Walter Dörwaldd2034312007-05-18 16:29:38 +00001429PyObject *
1430PyUnicode_FromFormatV(const char *format, va_list vargs)
1431{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 va_list count;
1433 Py_ssize_t callcount = 0;
1434 PyObject **callresults = NULL;
1435 PyObject **callresult = NULL;
1436 Py_ssize_t n = 0;
1437 int width = 0;
1438 int precision = 0;
1439 int zeropad;
1440 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001442 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001443 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1445 Py_UCS4 argmaxchar;
1446 Py_ssize_t numbersize = 0;
1447 char *numberresults = NULL;
1448 char *numberresult = NULL;
1449 Py_ssize_t i;
1450 int kind;
1451 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001452
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001453 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001454 /* step 1: count the number of %S/%R/%A/%s format specifications
1455 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1456 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 * result in an array)
1458 * also esimate a upper bound for all the number formats in the string,
1459 * numbers will be formated in step 3 and be keept in a '\0'-separated
1460 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001461 for (f = format; *f; f++) {
1462 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001463 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1465 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1466 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1467 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001470#ifdef HAVE_LONG_LONG
1471 if (longlongflag) {
1472 if (width < MAX_LONG_LONG_CHARS)
1473 width = MAX_LONG_LONG_CHARS;
1474 }
1475 else
1476#endif
1477 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1478 including sign. Decimal takes the most space. This
1479 isn't enough for octal. If a width is specified we
1480 need more (which we allocate later). */
1481 if (width < MAX_LONG_CHARS)
1482 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483
1484 /* account for the size + '\0' to separate numbers
1485 inside of the numberresults buffer */
1486 numbersize += (width + 1);
1487 }
1488 }
1489 else if ((unsigned char)*f > 127) {
1490 PyErr_Format(PyExc_ValueError,
1491 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1492 "string, got a non-ASCII byte: 0x%02x",
1493 (unsigned char)*f);
1494 return NULL;
1495 }
1496 }
1497 /* step 2: allocate memory for the results of
1498 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1499 if (callcount) {
1500 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1501 if (!callresults) {
1502 PyErr_NoMemory();
1503 return NULL;
1504 }
1505 callresult = callresults;
1506 }
1507 /* step 2.5: allocate memory for the results of formating numbers */
1508 if (numbersize) {
1509 numberresults = PyObject_Malloc(numbersize);
1510 if (!numberresults) {
1511 PyErr_NoMemory();
1512 goto fail;
1513 }
1514 numberresult = numberresults;
1515 }
1516
1517 /* step 3: format numbers and figure out how large a buffer we need */
1518 for (f = format; *f; f++) {
1519 if (*f == '%') {
1520 const char* p;
1521 int longflag;
1522 int longlongflag;
1523 int size_tflag;
1524 int numprinted;
1525
1526 p = f;
1527 zeropad = (f[1] == '0');
1528 f = parse_format_flags(f, &width, &precision,
1529 &longflag, &longlongflag, &size_tflag);
1530 switch (*f) {
1531 case 'c':
1532 {
1533 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001534 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 n++;
1536 break;
1537 }
1538 case '%':
1539 n++;
1540 break;
1541 case 'i':
1542 case 'd':
1543 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1544 width, precision, *f);
1545 if (longflag)
1546 numprinted = sprintf(numberresult, fmt,
1547 va_arg(count, long));
1548#ifdef HAVE_LONG_LONG
1549 else if (longlongflag)
1550 numprinted = sprintf(numberresult, fmt,
1551 va_arg(count, PY_LONG_LONG));
1552#endif
1553 else if (size_tflag)
1554 numprinted = sprintf(numberresult, fmt,
1555 va_arg(count, Py_ssize_t));
1556 else
1557 numprinted = sprintf(numberresult, fmt,
1558 va_arg(count, int));
1559 n += numprinted;
1560 /* advance by +1 to skip over the '\0' */
1561 numberresult += (numprinted + 1);
1562 assert(*(numberresult - 1) == '\0');
1563 assert(*(numberresult - 2) != '\0');
1564 assert(numprinted >= 0);
1565 assert(numberresult <= numberresults + numbersize);
1566 break;
1567 case 'u':
1568 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1569 width, precision, 'u');
1570 if (longflag)
1571 numprinted = sprintf(numberresult, fmt,
1572 va_arg(count, unsigned long));
1573#ifdef HAVE_LONG_LONG
1574 else if (longlongflag)
1575 numprinted = sprintf(numberresult, fmt,
1576 va_arg(count, unsigned PY_LONG_LONG));
1577#endif
1578 else if (size_tflag)
1579 numprinted = sprintf(numberresult, fmt,
1580 va_arg(count, size_t));
1581 else
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, unsigned int));
1584 n += numprinted;
1585 numberresult += (numprinted + 1);
1586 assert(*(numberresult - 1) == '\0');
1587 assert(*(numberresult - 2) != '\0');
1588 assert(numprinted >= 0);
1589 assert(numberresult <= numberresults + numbersize);
1590 break;
1591 case 'x':
1592 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1593 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1594 n += numprinted;
1595 numberresult += (numprinted + 1);
1596 assert(*(numberresult - 1) == '\0');
1597 assert(*(numberresult - 2) != '\0');
1598 assert(numprinted >= 0);
1599 assert(numberresult <= numberresults + numbersize);
1600 break;
1601 case 'p':
1602 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1603 /* %p is ill-defined: ensure leading 0x. */
1604 if (numberresult[1] == 'X')
1605 numberresult[1] = 'x';
1606 else if (numberresult[1] != 'x') {
1607 memmove(numberresult + 2, numberresult,
1608 strlen(numberresult) + 1);
1609 numberresult[0] = '0';
1610 numberresult[1] = 'x';
1611 numprinted += 2;
1612 }
1613 n += numprinted;
1614 numberresult += (numprinted + 1);
1615 assert(*(numberresult - 1) == '\0');
1616 assert(*(numberresult - 2) != '\0');
1617 assert(numprinted >= 0);
1618 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001619 break;
1620 case 's':
1621 {
1622 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001623 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001624 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1625 if (!str)
1626 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 /* since PyUnicode_DecodeUTF8 returns already flexible
1628 unicode objects, there is no need to call ready on them */
1629 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001630 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001632 /* Remember the str and switch to the next slot */
1633 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001634 break;
1635 }
1636 case 'U':
1637 {
1638 PyObject *obj = va_arg(count, PyObject *);
1639 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (PyUnicode_READY(obj) == -1)
1641 goto fail;
1642 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001643 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001645 break;
1646 }
1647 case 'V':
1648 {
1649 PyObject *obj = va_arg(count, PyObject *);
1650 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001651 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 assert(obj || str);
1653 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001654 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 if (PyUnicode_READY(obj) == -1)
1656 goto fail;
1657 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001658 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001660 *callresult++ = NULL;
1661 }
1662 else {
1663 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1664 if (!str_obj)
1665 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001667 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001669 *callresult++ = str_obj;
1670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001671 break;
1672 }
1673 case 'S':
1674 {
1675 PyObject *obj = va_arg(count, PyObject *);
1676 PyObject *str;
1677 assert(obj);
1678 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001682 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 /* Remember the str and switch to the next slot */
1685 *callresult++ = str;
1686 break;
1687 }
1688 case 'R':
1689 {
1690 PyObject *obj = va_arg(count, PyObject *);
1691 PyObject *repr;
1692 assert(obj);
1693 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 /* Remember the repr and switch to the next slot */
1700 *callresult++ = repr;
1701 break;
1702 }
1703 case 'A':
1704 {
1705 PyObject *obj = va_arg(count, PyObject *);
1706 PyObject *ascii;
1707 assert(obj);
1708 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001710 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001712 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001714 /* Remember the repr and switch to the next slot */
1715 *callresult++ = ascii;
1716 break;
1717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 default:
1719 /* if we stumble upon an unknown
1720 formatting code, copy the rest of
1721 the format string to the output
1722 string. (we cannot just skip the
1723 code, since there's no way to know
1724 what's in the argument list) */
1725 n += strlen(p);
1726 goto expand;
1727 }
1728 } else
1729 n++;
1730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001731 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001732 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 we don't have to resize the string.
1735 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 if (!string)
1738 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 kind = PyUnicode_KIND(string);
1740 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001741 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001745 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001746 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001747
1748 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1750 /* checking for == because the last argument could be a empty
1751 string, which causes i to point to end, the assert at the end of
1752 the loop */
1753 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001754
Benjamin Peterson14339b62009-01-31 16:36:08 +00001755 switch (*f) {
1756 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001757 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 const int ordinal = va_arg(vargs, int);
1759 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001760 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001761 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001762 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001763 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001765 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 case 'p':
1767 /* unused, since we already have the result */
1768 if (*f == 'p')
1769 (void) va_arg(vargs, void *);
1770 else
1771 (void) va_arg(vargs, int);
1772 /* extract the result from numberresults and append. */
1773 for (; *numberresult; ++i, ++numberresult)
1774 PyUnicode_WRITE(kind, data, i, *numberresult);
1775 /* skip over the separating '\0' */
1776 assert(*numberresult == '\0');
1777 numberresult++;
1778 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 break;
1780 case 's':
1781 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001782 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001784 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 size = PyUnicode_GET_LENGTH(*callresult);
1786 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001787 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1788 *callresult, 0,
1789 size) < 0)
1790 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001792 /* We're done with the unicode()/repr() => forget it */
1793 Py_DECREF(*callresult);
1794 /* switch to next unicode()/repr() result */
1795 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001796 break;
1797 }
1798 case 'U':
1799 {
1800 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 Py_ssize_t size;
1802 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1803 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001804 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1805 obj, 0,
1806 size) < 0)
1807 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 break;
1810 }
1811 case 'V':
1812 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001815 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001816 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 size = PyUnicode_GET_LENGTH(obj);
1818 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001819 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1820 obj, 0,
1821 size) < 0)
1822 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001824 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 size = PyUnicode_GET_LENGTH(*callresult);
1826 assert(PyUnicode_KIND(*callresult) <=
1827 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001828 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1829 *callresult,
1830 0, size) < 0)
1831 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001833 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001834 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001835 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 break;
1837 }
1838 case 'S':
1839 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001840 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001841 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 /* unused, since we already have the result */
1843 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001845 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1846 *callresult, 0,
1847 PyUnicode_GET_LENGTH(*callresult)) < 0)
1848 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 /* We're done with the unicode()/repr() => forget it */
1851 Py_DECREF(*callresult);
1852 /* switch to next unicode()/repr() result */
1853 ++callresult;
1854 break;
1855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001856 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 break;
1859 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 for (; *p; ++p, ++i)
1861 PyUnicode_WRITE(kind, data, i, *p);
1862 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 goto end;
1864 }
Victor Stinner1205f272010-09-11 00:54:47 +00001865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 else {
1867 assert(i < PyUnicode_GET_LENGTH(string));
1868 PyUnicode_WRITE(kind, data, i++, *f);
1869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001872
Benjamin Peterson29060642009-01-31 22:14:21 +00001873 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 if (callresults)
1875 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 if (numberresults)
1877 PyObject_Free(numberresults);
1878 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001880 if (callresults) {
1881 PyObject **callresult2 = callresults;
1882 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001883 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001884 ++callresult2;
1885 }
1886 PyObject_Free(callresults);
1887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 if (numberresults)
1889 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001891}
1892
Walter Dörwaldd2034312007-05-18 16:29:38 +00001893PyObject *
1894PyUnicode_FromFormat(const char *format, ...)
1895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 PyObject* ret;
1897 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001898
1899#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001900 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001901#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001903#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 ret = PyUnicode_FromFormatV(format, vargs);
1905 va_end(vargs);
1906 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001907}
1908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909#ifdef HAVE_WCHAR_H
1910
Victor Stinner5593d8a2010-10-02 11:11:27 +00001911/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1912 convert a Unicode object to a wide character string.
1913
Victor Stinnerd88d9832011-09-06 02:00:05 +02001914 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001915 character) required to convert the unicode object. Ignore size argument.
1916
Victor Stinnerd88d9832011-09-06 02:00:05 +02001917 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001918 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001919 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001920static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001921unicode_aswidechar(PyUnicodeObject *unicode,
1922 wchar_t *w,
1923 Py_ssize_t size)
1924{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001925 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 const wchar_t *wstr;
1927
1928 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1929 if (wstr == NULL)
1930 return -1;
1931
Victor Stinner5593d8a2010-10-02 11:11:27 +00001932 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001933 if (size > res)
1934 size = res + 1;
1935 else
1936 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001938 return res;
1939 }
1940 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001942}
1943
1944Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001945PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001946 wchar_t *w,
1947 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948{
1949 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 PyErr_BadInternalCall();
1951 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001953 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954}
1955
Victor Stinner137c34c2010-09-29 10:25:54 +00001956wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001957PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001958 Py_ssize_t *size)
1959{
1960 wchar_t* buffer;
1961 Py_ssize_t buflen;
1962
1963 if (unicode == NULL) {
1964 PyErr_BadInternalCall();
1965 return NULL;
1966 }
1967
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001968 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (buflen == -1)
1970 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001971 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001972 PyErr_NoMemory();
1973 return NULL;
1974 }
1975
Victor Stinner137c34c2010-09-29 10:25:54 +00001976 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1977 if (buffer == NULL) {
1978 PyErr_NoMemory();
1979 return NULL;
1980 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001981 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 if (buflen == -1)
1983 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001984 if (size != NULL)
1985 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001986 return buffer;
1987}
1988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990
Alexander Belopolsky40018472011-02-26 01:02:56 +00001991PyObject *
1992PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001995 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001996 PyErr_SetString(PyExc_ValueError,
1997 "chr() arg not in range(0x110000)");
1998 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001999 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 if (ordinal < 256)
2002 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 v = PyUnicode_New(1, ordinal);
2005 if (v == NULL)
2006 return NULL;
2007 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2008 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002009}
2010
Alexander Belopolsky40018472011-02-26 01:02:56 +00002011PyObject *
2012PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002014 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002015 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002016 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002017 Py_INCREF(obj);
2018 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002019 }
2020 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 /* For a Unicode subtype that's not a Unicode object,
2022 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 if (PyUnicode_READY(obj) == -1)
2024 return NULL;
2025 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002026 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002027 PyErr_Format(PyExc_TypeError,
2028 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002029 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002030 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002031}
2032
Alexander Belopolsky40018472011-02-26 01:02:56 +00002033PyObject *
2034PyUnicode_FromEncodedObject(register PyObject *obj,
2035 const char *encoding,
2036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002037{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002038 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002039 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002042 PyErr_BadInternalCall();
2043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002045
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002046 /* Decoding bytes objects is the most common case and should be fast */
2047 if (PyBytes_Check(obj)) {
2048 if (PyBytes_GET_SIZE(obj) == 0) {
2049 Py_INCREF(unicode_empty);
2050 v = (PyObject *) unicode_empty;
2051 }
2052 else {
2053 v = PyUnicode_Decode(
2054 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2055 encoding, errors);
2056 }
2057 return v;
2058 }
2059
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002060 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002061 PyErr_SetString(PyExc_TypeError,
2062 "decoding str is not supported");
2063 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002064 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002065
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002066 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2067 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2068 PyErr_Format(PyExc_TypeError,
2069 "coercing to str: need bytes, bytearray "
2070 "or buffer-like object, %.80s found",
2071 Py_TYPE(obj)->tp_name);
2072 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002073 }
Tim Petersced69f82003-09-16 20:30:58 +00002074
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002075 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002077 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 }
Tim Petersced69f82003-09-16 20:30:58 +00002079 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002080 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002081
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002082 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002083 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   `lower_len` is the size of the `lower` buffer, terminating '\0' included.

   Return 1 on success, 0 if `encoding` is longer than lower_len-1 (in which
   case `lower` is left without a terminating '\0'). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Last slot is reserved for the terminating '\0'. */
    const size_t capacity = lower_len - 1;
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        char ch;

        if (pos == capacity)
            return 0;

        ch = encoding[pos];
        if (Py_ISUPPER(ch))
            lower[pos] = Py_TOLOWER(ch);
        else if (ch == '_')
            lower[pos] = '-';
        else
            lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
2118
Alexander Belopolsky40018472011-02-26 01:02:56 +00002119PyObject *
2120PyUnicode_Decode(const char *s,
2121 Py_ssize_t size,
2122 const char *encoding,
2123 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002124{
2125 PyObject *buffer = NULL, *unicode;
2126 Py_buffer info;
2127 char lower[11]; /* Enough for any encoding shortcut */
2128
2129 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002130 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002131
2132 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002133 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002134 if ((strcmp(lower, "utf-8") == 0) ||
2135 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002136 return PyUnicode_DecodeUTF8(s, size, errors);
2137 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002138 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002139 (strcmp(lower, "iso-8859-1") == 0))
2140 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002141#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002142 else if (strcmp(lower, "mbcs") == 0)
2143 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002144#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002145 else if (strcmp(lower, "ascii") == 0)
2146 return PyUnicode_DecodeASCII(s, size, errors);
2147 else if (strcmp(lower, "utf-16") == 0)
2148 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2149 else if (strcmp(lower, "utf-32") == 0)
2150 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152
2153 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002154 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002155 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002156 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002157 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 if (buffer == NULL)
2159 goto onError;
2160 unicode = PyCodec_Decode(buffer, encoding, errors);
2161 if (unicode == NULL)
2162 goto onError;
2163 if (!PyUnicode_Check(unicode)) {
2164 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002165 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002166 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 Py_DECREF(unicode);
2168 goto onError;
2169 }
2170 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 if (PyUnicode_READY(unicode)) {
2172 Py_DECREF(unicode);
2173 return NULL;
2174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002176
Benjamin Peterson29060642009-01-31 22:14:21 +00002177 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 Py_XDECREF(buffer);
2179 return NULL;
2180}
2181
Alexander Belopolsky40018472011-02-26 01:02:56 +00002182PyObject *
2183PyUnicode_AsDecodedObject(PyObject *unicode,
2184 const char *encoding,
2185 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002186{
2187 PyObject *v;
2188
2189 if (!PyUnicode_Check(unicode)) {
2190 PyErr_BadArgument();
2191 goto onError;
2192 }
2193
2194 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002195 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002196
2197 /* Decode via the codec registry */
2198 v = PyCodec_Decode(unicode, encoding, errors);
2199 if (v == NULL)
2200 goto onError;
2201 return v;
2202
Benjamin Peterson29060642009-01-31 22:14:21 +00002203 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002204 return NULL;
2205}
2206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_AsDecodedUnicode(PyObject *unicode,
2209 const char *encoding,
2210 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002211{
2212 PyObject *v;
2213
2214 if (!PyUnicode_Check(unicode)) {
2215 PyErr_BadArgument();
2216 goto onError;
2217 }
2218
2219 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002220 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002221
2222 /* Decode via the codec registry */
2223 v = PyCodec_Decode(unicode, encoding, errors);
2224 if (v == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(v)) {
2227 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002228 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002229 Py_TYPE(v)->tp_name);
2230 Py_DECREF(v);
2231 goto onError;
2232 }
2233 return v;
2234
Benjamin Peterson29060642009-01-31 22:14:21 +00002235 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002236 return NULL;
2237}
2238
Alexander Belopolsky40018472011-02-26 01:02:56 +00002239PyObject *
2240PyUnicode_Encode(const Py_UNICODE *s,
2241 Py_ssize_t size,
2242 const char *encoding,
2243 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002244{
2245 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002246
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 unicode = PyUnicode_FromUnicode(s, size);
2248 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2251 Py_DECREF(unicode);
2252 return v;
2253}
2254
Alexander Belopolsky40018472011-02-26 01:02:56 +00002255PyObject *
2256PyUnicode_AsEncodedObject(PyObject *unicode,
2257 const char *encoding,
2258 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002259{
2260 PyObject *v;
2261
2262 if (!PyUnicode_Check(unicode)) {
2263 PyErr_BadArgument();
2264 goto onError;
2265 }
2266
2267 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002268 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002269
2270 /* Encode via the codec registry */
2271 v = PyCodec_Encode(unicode, encoding, errors);
2272 if (v == NULL)
2273 goto onError;
2274 return v;
2275
Benjamin Peterson29060642009-01-31 22:14:21 +00002276 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002277 return NULL;
2278}
2279
Victor Stinnerad158722010-10-27 00:25:46 +00002280PyObject *
2281PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002282{
Victor Stinner99b95382011-07-04 14:23:54 +02002283#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002284 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2285 PyUnicode_GET_SIZE(unicode),
2286 NULL);
2287#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002289#else
Victor Stinner793b5312011-04-27 00:24:21 +02002290 PyInterpreterState *interp = PyThreadState_GET()->interp;
2291 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2292 cannot use it to encode and decode filenames before it is loaded. Load
2293 the Python codec requires to encode at least its own filename. Use the C
2294 version of the locale codec until the codec registry is initialized and
2295 the Python codec is loaded.
2296
2297 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2298 cannot only rely on it: check also interp->fscodec_initialized for
2299 subinterpreters. */
2300 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002301 return PyUnicode_AsEncodedString(unicode,
2302 Py_FileSystemDefaultEncoding,
2303 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002304 }
2305 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002306 /* locale encoding with surrogateescape */
2307 wchar_t *wchar;
2308 char *bytes;
2309 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002310 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002311
2312 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2313 if (wchar == NULL)
2314 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002315 bytes = _Py_wchar2char(wchar, &error_pos);
2316 if (bytes == NULL) {
2317 if (error_pos != (size_t)-1) {
2318 char *errmsg = strerror(errno);
2319 PyObject *exc = NULL;
2320 if (errmsg == NULL)
2321 errmsg = "Py_wchar2char() failed";
2322 raise_encode_exception(&exc,
2323 "filesystemencoding",
2324 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2325 error_pos, error_pos+1,
2326 errmsg);
2327 Py_XDECREF(exc);
2328 }
2329 else
2330 PyErr_NoMemory();
2331 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002332 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002333 }
2334 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002335
2336 bytes_obj = PyBytes_FromString(bytes);
2337 PyMem_Free(bytes);
2338 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002339 }
Victor Stinnerad158722010-10-27 00:25:46 +00002340#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002341}
2342
Alexander Belopolsky40018472011-02-26 01:02:56 +00002343PyObject *
2344PyUnicode_AsEncodedString(PyObject *unicode,
2345 const char *encoding,
2346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347{
2348 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002349 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 if (!PyUnicode_Check(unicode)) {
2352 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 }
Fred Drakee4315f52000-05-09 19:53:39 +00002355
Victor Stinner2f283c22011-03-02 01:21:46 +00002356 if (encoding == NULL) {
2357 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002358 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002359 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002361 }
Fred Drakee4315f52000-05-09 19:53:39 +00002362
2363 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002364 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002365 if ((strcmp(lower, "utf-8") == 0) ||
2366 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002367 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002368 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002372 }
Victor Stinner37296e82010-06-10 13:36:23 +00002373 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002374 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002375 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002377#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002378 else if (strcmp(lower, "mbcs") == 0)
2379 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2380 PyUnicode_GET_SIZE(unicode),
2381 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002382#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002383 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386
2387 /* Encode via the codec registry */
2388 v = PyCodec_Encode(unicode, encoding, errors);
2389 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002390 return NULL;
2391
2392 /* The normal path */
2393 if (PyBytes_Check(v))
2394 return v;
2395
2396 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002397 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002398 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002399 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002400
2401 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2402 "encoder %s returned bytearray instead of bytes",
2403 encoding);
2404 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002405 Py_DECREF(v);
2406 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002407 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002408
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002409 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2410 Py_DECREF(v);
2411 return b;
2412 }
2413
2414 PyErr_Format(PyExc_TypeError,
2415 "encoder did not return a bytes object (type=%.400s)",
2416 Py_TYPE(v)->tp_name);
2417 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002418 return NULL;
2419}
2420
Alexander Belopolsky40018472011-02-26 01:02:56 +00002421PyObject *
2422PyUnicode_AsEncodedUnicode(PyObject *unicode,
2423 const char *encoding,
2424 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002425{
2426 PyObject *v;
2427
2428 if (!PyUnicode_Check(unicode)) {
2429 PyErr_BadArgument();
2430 goto onError;
2431 }
2432
2433 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002435
2436 /* Encode via the codec registry */
2437 v = PyCodec_Encode(unicode, encoding, errors);
2438 if (v == NULL)
2439 goto onError;
2440 if (!PyUnicode_Check(v)) {
2441 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002442 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002443 Py_TYPE(v)->tp_name);
2444 Py_DECREF(v);
2445 goto onError;
2446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002448
Benjamin Peterson29060642009-01-31 22:14:21 +00002449 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450 return NULL;
2451}
2452
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002453PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002454PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002455 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002456 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2457}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002458
Christian Heimes5894ba72007-11-04 11:43:14 +00002459PyObject*
2460PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2461{
Victor Stinner99b95382011-07-04 14:23:54 +02002462#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002463 return PyUnicode_DecodeMBCS(s, size, NULL);
2464#elif defined(__APPLE__)
2465 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2466#else
Victor Stinner793b5312011-04-27 00:24:21 +02002467 PyInterpreterState *interp = PyThreadState_GET()->interp;
2468 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2469 cannot use it to encode and decode filenames before it is loaded. Load
2470 the Python codec requires to encode at least its own filename. Use the C
2471 version of the locale codec until the codec registry is initialized and
2472 the Python codec is loaded.
2473
2474 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2475 cannot only rely on it: check also interp->fscodec_initialized for
2476 subinterpreters. */
2477 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002478 return PyUnicode_Decode(s, size,
2479 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002480 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002481 }
2482 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002483 /* locale encoding with surrogateescape */
2484 wchar_t *wchar;
2485 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002486 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002487
2488 if (s[size] != '\0' || size != strlen(s)) {
2489 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2490 return NULL;
2491 }
2492
Victor Stinner168e1172010-10-16 23:16:16 +00002493 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002494 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002495 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002496
Victor Stinner168e1172010-10-16 23:16:16 +00002497 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498 PyMem_Free(wchar);
2499 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002500 }
Victor Stinnerad158722010-10-27 00:25:46 +00002501#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002502}
2503
Martin v. Löwis011e8422009-05-05 04:43:17 +00002504
2505int
2506PyUnicode_FSConverter(PyObject* arg, void* addr)
2507{
2508 PyObject *output = NULL;
2509 Py_ssize_t size;
2510 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002511 if (arg == NULL) {
2512 Py_DECREF(*(PyObject**)addr);
2513 return 1;
2514 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002515 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002516 output = arg;
2517 Py_INCREF(output);
2518 }
2519 else {
2520 arg = PyUnicode_FromObject(arg);
2521 if (!arg)
2522 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002523 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002524 Py_DECREF(arg);
2525 if (!output)
2526 return 0;
2527 if (!PyBytes_Check(output)) {
2528 Py_DECREF(output);
2529 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2530 return 0;
2531 }
2532 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002533 size = PyBytes_GET_SIZE(output);
2534 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002535 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002536 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002537 Py_DECREF(output);
2538 return 0;
2539 }
2540 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002541 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002542}
2543
2544
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002545int
2546PyUnicode_FSDecoder(PyObject* arg, void* addr)
2547{
2548 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002549 if (arg == NULL) {
2550 Py_DECREF(*(PyObject**)addr);
2551 return 1;
2552 }
2553 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 if (PyUnicode_READY(arg))
2555 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002556 output = arg;
2557 Py_INCREF(output);
2558 }
2559 else {
2560 arg = PyBytes_FromObject(arg);
2561 if (!arg)
2562 return 0;
2563 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2564 PyBytes_GET_SIZE(arg));
2565 Py_DECREF(arg);
2566 if (!output)
2567 return 0;
2568 if (!PyUnicode_Check(output)) {
2569 Py_DECREF(output);
2570 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2571 return 0;
2572 }
2573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2575 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002576 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2577 Py_DECREF(output);
2578 return 0;
2579 }
2580 *(PyObject**)addr = output;
2581 return Py_CLEANUP_SUPPORTED;
2582}
2583
2584
Martin v. Löwis5b222132007-06-10 09:51:05 +00002585char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002587{
Christian Heimesf3863112007-11-22 07:46:41 +00002588 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2590
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002591 if (!PyUnicode_Check(unicode)) {
2592 PyErr_BadArgument();
2593 return NULL;
2594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002596 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597
2598 if (_PyUnicode_UTF8(unicode) == NULL) {
2599 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2600 if (bytes == NULL)
2601 return NULL;
2602 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2603 if (u->_base.utf8 == NULL) {
2604 Py_DECREF(bytes);
2605 return NULL;
2606 }
2607 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2608 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2609 Py_DECREF(bytes);
2610 }
2611
2612 if (psize)
2613 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2614 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002615}
2616
2617char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2621}
2622
2623#ifdef Py_DEBUG
2624int unicode_as_unicode_calls = 0;
2625#endif
2626
2627
2628Py_UNICODE *
2629PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2630{
2631 PyUnicodeObject *u;
2632 const unsigned char *one_byte;
2633#if SIZEOF_WCHAR_T == 4
2634 const Py_UCS2 *two_bytes;
2635#else
2636 const Py_UCS4 *four_bytes;
2637 const Py_UCS4 *ucs4_end;
2638 Py_ssize_t num_surrogates;
2639#endif
2640 wchar_t *w;
2641 wchar_t *wchar_end;
2642
2643 if (!PyUnicode_Check(unicode)) {
2644 PyErr_BadArgument();
2645 return NULL;
2646 }
2647 u = (PyUnicodeObject*)unicode;
2648 if (_PyUnicode_WSTR(u) == NULL) {
2649 /* Non-ASCII compact unicode object */
2650 assert(_PyUnicode_KIND(u) != 0);
2651 assert(PyUnicode_IS_READY(u));
2652
2653#ifdef Py_DEBUG
2654 ++unicode_as_unicode_calls;
2655#endif
2656
2657 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2658#if SIZEOF_WCHAR_T == 2
2659 four_bytes = PyUnicode_4BYTE_DATA(u);
2660 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2661 num_surrogates = 0;
2662
2663 for (; four_bytes < ucs4_end; ++four_bytes) {
2664 if (*four_bytes > 0xFFFF)
2665 ++num_surrogates;
2666 }
2667
2668 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2669 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2670 if (!_PyUnicode_WSTR(u)) {
2671 PyErr_NoMemory();
2672 return NULL;
2673 }
2674 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2675
2676 w = _PyUnicode_WSTR(u);
2677 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2678 four_bytes = PyUnicode_4BYTE_DATA(u);
2679 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2680 if (*four_bytes > 0xFFFF) {
2681 /* encode surrogate pair in this case */
2682 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2683 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2684 }
2685 else
2686 *w = *four_bytes;
2687
2688 if (w > wchar_end) {
2689 assert(0 && "Miscalculated string end");
2690 }
2691 }
2692 *w = 0;
2693#else
2694 /* sizeof(wchar_t) == 4 */
2695 Py_FatalError("Impossible unicode object state, wstr and str "
2696 "should share memory already.");
2697 return NULL;
2698#endif
2699 }
2700 else {
2701 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2702 (_PyUnicode_LENGTH(u) + 1));
2703 if (!_PyUnicode_WSTR(u)) {
2704 PyErr_NoMemory();
2705 return NULL;
2706 }
2707 if (!PyUnicode_IS_COMPACT_ASCII(u))
2708 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2709 w = _PyUnicode_WSTR(u);
2710 wchar_end = w + _PyUnicode_LENGTH(u);
2711
2712 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2713 one_byte = PyUnicode_1BYTE_DATA(u);
2714 for (; w < wchar_end; ++one_byte, ++w)
2715 *w = *one_byte;
2716 /* null-terminate the wstr */
2717 *w = 0;
2718 }
2719 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2720#if SIZEOF_WCHAR_T == 4
2721 two_bytes = PyUnicode_2BYTE_DATA(u);
2722 for (; w < wchar_end; ++two_bytes, ++w)
2723 *w = *two_bytes;
2724 /* null-terminate the wstr */
2725 *w = 0;
2726#else
2727 /* sizeof(wchar_t) == 2 */
2728 PyObject_FREE(_PyUnicode_WSTR(u));
2729 _PyUnicode_WSTR(u) = NULL;
2730 Py_FatalError("Impossible unicode object state, wstr "
2731 "and str should share memory already.");
2732 return NULL;
2733#endif
2734 }
2735 else {
2736 assert(0 && "This should never happen.");
2737 }
2738 }
2739 }
2740 if (size != NULL)
2741 *size = PyUnicode_WSTR_LENGTH(u);
2742 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002743}
2744
Alexander Belopolsky40018472011-02-26 01:02:56 +00002745Py_UNICODE *
2746PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749}
2750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751
Alexander Belopolsky40018472011-02-26 01:02:56 +00002752Py_ssize_t
2753PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754{
2755 if (!PyUnicode_Check(unicode)) {
2756 PyErr_BadArgument();
2757 goto onError;
2758 }
2759 return PyUnicode_GET_SIZE(unicode);
2760
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return -1;
2763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765Py_ssize_t
2766PyUnicode_GetLength(PyObject *unicode)
2767{
2768 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2769 PyErr_BadArgument();
2770 return -1;
2771 }
2772
2773 return PyUnicode_GET_LENGTH(unicode);
2774}
2775
2776Py_UCS4
2777PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2778{
2779 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2780 return PyErr_BadArgument();
2781 return (Py_UCS4)-1;
2782 }
2783 return PyUnicode_READ_CHAR(unicode, index);
2784}
2785
2786int
2787PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2788{
2789 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2790 return PyErr_BadArgument();
2791 return -1;
2792 }
2793
2794 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2795 index, ch);
2796 return 0;
2797}
2798
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2804
Victor Stinner554f3f02010-06-16 23:33:54 +00002805/* create or adjust a UnicodeDecodeError */
2806static void
2807make_decode_exception(PyObject **exceptionObject,
2808 const char *encoding,
2809 const char *input, Py_ssize_t length,
2810 Py_ssize_t startpos, Py_ssize_t endpos,
2811 const char *reason)
2812{
2813 if (*exceptionObject == NULL) {
2814 *exceptionObject = PyUnicodeDecodeError_Create(
2815 encoding, input, length, startpos, endpos, reason);
2816 }
2817 else {
2818 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2819 goto onError;
2820 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2821 goto onError;
2822 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2823 goto onError;
2824 }
2825 return;
2826
2827onError:
2828 Py_DECREF(*exceptionObject);
2829 *exceptionObject = NULL;
2830}
2831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832/* error handling callback helper:
2833 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002834 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 and adjust various state variables.
2836 return 0 on success, -1 on error
2837*/
2838
Alexander Belopolsky40018472011-02-26 01:02:56 +00002839static int
2840unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2841 const char *encoding, const char *reason,
2842 const char **input, const char **inend, Py_ssize_t *startinpos,
2843 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2844 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002846 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847
2848 PyObject *restuple = NULL;
2849 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002850 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002851 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t requiredsize;
2853 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002854 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002855 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002856 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 int res = -1;
2858
2859 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 *errorHandler = PyCodec_LookupError(errors);
2861 if (*errorHandler == NULL)
2862 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 }
2864
Victor Stinner554f3f02010-06-16 23:33:54 +00002865 make_decode_exception(exceptionObject,
2866 encoding,
2867 *input, *inend - *input,
2868 *startinpos, *endinpos,
2869 reason);
2870 if (*exceptionObject == NULL)
2871 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002872
2873 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2874 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002877 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 }
2880 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002882
2883 /* Copy back the bytes variables, which might have been modified by the
2884 callback */
2885 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2886 if (!inputobj)
2887 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002888 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002890 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002891 *input = PyBytes_AS_STRING(inputobj);
2892 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002893 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002894 /* we can DECREF safely, as the exception has another reference,
2895 so the object won't go away. */
2896 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002898 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002900 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2902 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904
2905 /* need more space? (at least enough for what we
2906 have+the replacement+the rest of the string (starting
2907 at the new input position), so we won't have to check space
2908 when there are no errors in the rest of the string) */
2909 repptr = PyUnicode_AS_UNICODE(repunicode);
2910 repsize = PyUnicode_GET_SIZE(repunicode);
2911 requiredsize = *outpos + repsize + insize-newpos;
2912 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 if (requiredsize<2*outsize)
2914 requiredsize = 2*outsize;
2915 if (_PyUnicode_Resize(output, requiredsize) < 0)
2916 goto onError;
2917 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 }
2919 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002920 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 Py_UNICODE_COPY(*outptr, repptr, repsize);
2922 *outptr += repsize;
2923 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925 /* we made it! */
2926 res = 0;
2927
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 Py_XDECREF(restuple);
2930 return res;
2931}
2932
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002933/* --- UTF-7 Codec -------------------------------------------------------- */
2934
Antoine Pitrou244651a2009-05-04 18:56:13 +00002935/* See RFC2152 for details. We encode conservatively and decode liberally. */
2936
2937/* Three simple macros defining base-64. */
2938
/* Range test shared by the base-64 macros below: lo <= c <= hi. */

#define UTF7_B64_BETWEEN(c, lo, hi) \
    ((c) >= (lo) && (c) <= (hi))

/* Is c a base-64 character? */

#define IS_BASE64(c)                        \
    (UTF7_B64_BETWEEN(c, 'A', 'Z') ||       \
     UTF7_B64_BETWEEN(c, 'a', 'z') ||       \
     UTF7_B64_BETWEEN(c, '0', '9') ||       \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, a digit or '+' maps to 63) */

#define FROM_BASE64(c)                                  \
    (UTF7_B64_BETWEEN(c, 'A', 'Z') ? (c) - 'A' :        \
     UTF7_B64_BETWEEN(c, 'a', 'z') ? (c) - 'a' + 26 :   \
     UTF7_B64_BETWEEN(c, '0', '9') ? (c) - '0' + 52 :   \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character to classify; only 1..127 can ever be direct.
 * directO  : non-zero to emit Set O characters as themselves.
 * directWS : non-zero to emit whitespace characters as themselves.
 *
 * Every parameter is parenthesized in the expansion so that an expression
 * argument (e.g. a conditional expression) keeps its own grouping.  `c` is
 * evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_DecodeUTF7(const char *s,
3016 Py_ssize_t size,
3017 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003018{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003019 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3020}
3021
Antoine Pitrou244651a2009-05-04 18:56:13 +00003022/* The decoder. The only state we preserve is our read position,
3023 * i.e. how many characters we have consumed. So if we end in the
3024 * middle of a shift sequence we have to back off the read position
3025 * and the output to the beginning of the sequence, otherwise we lose
3026 * all the shift state (seen bits, number of bits seen, high
3027 * surrogate). */
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_DecodeUTF7Stateful(const char *s,
3031 Py_ssize_t size,
3032 const char *errors,
3033 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003034{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003035 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003036 Py_ssize_t startinpos;
3037 Py_ssize_t endinpos;
3038 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003039 const char *e;
3040 PyUnicodeObject *unicode;
3041 Py_UNICODE *p;
3042 const char *errmsg = "";
3043 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003044 Py_UNICODE *shiftOutStart;
3045 unsigned int base64bits = 0;
3046 unsigned long base64buffer = 0;
3047 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 PyObject *errorHandler = NULL;
3049 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003050
3051 unicode = _PyUnicode_New(size);
3052 if (!unicode)
3053 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003054 if (size == 0) {
3055 if (consumed)
3056 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003057 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003058 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003060 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003061 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003062 e = s + size;
3063
3064 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003067 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003068
Antoine Pitrou244651a2009-05-04 18:56:13 +00003069 if (inShift) { /* in a base-64 section */
3070 if (IS_BASE64(ch)) { /* consume a base-64 character */
3071 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3072 base64bits += 6;
3073 s++;
3074 if (base64bits >= 16) {
3075 /* we have enough bits for a UTF-16 value */
3076 Py_UNICODE outCh = (Py_UNICODE)
3077 (base64buffer >> (base64bits-16));
3078 base64bits -= 16;
3079 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3080 if (surrogate) {
3081 /* expecting a second surrogate */
3082 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3083#ifdef Py_UNICODE_WIDE
3084 *p++ = (((surrogate & 0x3FF)<<10)
3085 | (outCh & 0x3FF)) + 0x10000;
3086#else
3087 *p++ = surrogate;
3088 *p++ = outCh;
3089#endif
3090 surrogate = 0;
3091 }
3092 else {
3093 surrogate = 0;
3094 errmsg = "second surrogate missing";
3095 goto utf7Error;
3096 }
3097 }
3098 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3099 /* first surrogate */
3100 surrogate = outCh;
3101 }
3102 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3103 errmsg = "unexpected second surrogate";
3104 goto utf7Error;
3105 }
3106 else {
3107 *p++ = outCh;
3108 }
3109 }
3110 }
3111 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003112 inShift = 0;
3113 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003114 if (surrogate) {
3115 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003116 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003117 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003118 if (base64bits > 0) { /* left-over bits */
3119 if (base64bits >= 6) {
3120 /* We've seen at least one base-64 character */
3121 errmsg = "partial character in shift sequence";
3122 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003123 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003124 else {
3125 /* Some bits remain; they should be zero */
3126 if (base64buffer != 0) {
3127 errmsg = "non-zero padding bits in shift sequence";
3128 goto utf7Error;
3129 }
3130 }
3131 }
3132 if (ch != '-') {
3133 /* '-' is absorbed; other terminating
3134 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003135 *p++ = ch;
3136 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003137 }
3138 }
3139 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003140 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003141 s++; /* consume '+' */
3142 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003143 s++;
3144 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003145 }
3146 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003147 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003148 shiftOutStart = p;
3149 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150 }
3151 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003152 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 *p++ = ch;
3154 s++;
3155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 else {
3157 startinpos = s-starts;
3158 s++;
3159 errmsg = "unexpected special character";
3160 goto utf7Error;
3161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003162 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003163utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 outpos = p-PyUnicode_AS_UNICODE(unicode);
3165 endinpos = s-starts;
3166 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 errors, &errorHandler,
3168 "utf7", errmsg,
3169 &starts, &e, &startinpos, &endinpos, &exc, &s,
3170 &unicode, &outpos, &p))
3171 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003172 }
3173
Antoine Pitrou244651a2009-05-04 18:56:13 +00003174 /* end of string */
3175
3176 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3177 /* if we're in an inconsistent state, that's an error */
3178 if (surrogate ||
3179 (base64bits >= 6) ||
3180 (base64bits > 0 && base64buffer != 0)) {
3181 outpos = p-PyUnicode_AS_UNICODE(unicode);
3182 endinpos = size;
3183 if (unicode_decode_call_errorhandler(
3184 errors, &errorHandler,
3185 "utf7", "unterminated shift sequence",
3186 &starts, &e, &startinpos, &endinpos, &exc, &s,
3187 &unicode, &outpos, &p))
3188 goto onError;
3189 if (s < e)
3190 goto restart;
3191 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003192 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003193
3194 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003195 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003196 if (inShift) {
3197 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003198 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003199 }
3200 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003201 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003202 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003203 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003204
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003205 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003206 goto onError;
3207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003210 if (PyUnicode_READY(unicode) == -1) {
3211 Py_DECREF(unicode);
3212 return NULL;
3213 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003214 return (PyObject *)unicode;
3215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 Py_XDECREF(errorHandler);
3218 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003219 Py_DECREF(unicode);
3220 return NULL;
3221}
3222
3223
Alexander Belopolsky40018472011-02-26 01:02:56 +00003224PyObject *
3225PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3226 Py_ssize_t size,
3227 int base64SetO,
3228 int base64WhiteSpace,
3229 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003231 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003233 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003235 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003236 unsigned int base64bits = 0;
3237 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238 char * out;
3239 char * start;
3240
3241 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003244 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003245 return PyErr_NoMemory();
3246
Antoine Pitrou244651a2009-05-04 18:56:13 +00003247 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003248 if (v == NULL)
3249 return NULL;
3250
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003252 for (;i < size; ++i) {
3253 Py_UNICODE ch = s[i];
3254
Antoine Pitrou244651a2009-05-04 18:56:13 +00003255 if (inShift) {
3256 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3257 /* shifting out */
3258 if (base64bits) { /* output remaining bits */
3259 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3260 base64buffer = 0;
3261 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 }
3263 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003264 /* Characters not in the BASE64 set implicitly unshift the sequence
3265 so no '-' is required, except if the character is itself a '-' */
3266 if (IS_BASE64(ch) || ch == '-') {
3267 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003269 *out++ = (char) ch;
3270 }
3271 else {
3272 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003273 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003274 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003275 else { /* not in a shift sequence */
3276 if (ch == '+') {
3277 *out++ = '+';
3278 *out++ = '-';
3279 }
3280 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3281 *out++ = (char) ch;
3282 }
3283 else {
3284 *out++ = '+';
3285 inShift = 1;
3286 goto encode_char;
3287 }
3288 }
3289 continue;
3290encode_char:
3291#ifdef Py_UNICODE_WIDE
3292 if (ch >= 0x10000) {
3293 /* code first surrogate */
3294 base64bits += 16;
3295 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3296 while (base64bits >= 6) {
3297 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3298 base64bits -= 6;
3299 }
3300 /* prepare second surrogate */
3301 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3302 }
3303#endif
3304 base64bits += 16;
3305 base64buffer = (base64buffer << 16) | ch;
3306 while (base64bits >= 6) {
3307 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3308 base64bits -= 6;
3309 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003310 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003311 if (base64bits)
3312 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3313 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003314 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 if (_PyBytes_Resize(&v, out - start) < 0)
3316 return NULL;
3317 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003318}
3319
Antoine Pitrou244651a2009-05-04 18:56:13 +00003320#undef IS_BASE64
3321#undef FROM_BASE64
3322#undef TO_BASE64
3323#undef DECODE_DIRECT
3324#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326/* --- UTF-8 Codec -------------------------------------------------------- */
3327
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence it
   starts.  Zero marks a byte that cannot start a sequence (continuation
   bytes 80-BF, and the prefixes C0, C1 and F5-FF).  See RFC 3629 for
   details.  One row per 16 byte values. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3349
Alexander Belopolsky40018472011-02-26 01:02:56 +00003350PyObject *
3351PyUnicode_DecodeUTF8(const char *s,
3352 Py_ssize_t size,
3353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354{
Walter Dörwald69652032004-09-07 20:24:22 +00003355 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3356}
3357
Antoine Pitrouab868312009-01-10 15:40:25 +00003358/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3359#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3360
3361/* Mask to quickly check whether a C 'long' contains a
3362 non-ASCII, UTF8-encoded char. */
3363#if (SIZEOF_LONG == 8)
3364# define ASCII_CHAR_MASK 0x8080808080808080L
3365#elif (SIZEOF_LONG == 4)
3366# define ASCII_CHAR_MASK 0x80808080L
3367#else
3368# error C 'long' size should be either 4 or 8!
3369#endif
3370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003371/* Scans a UTF-8 string and returns the maximum character to be expected,
3372 the size of the decoded unicode string and if any major errors were
3373 encountered.
3374
3375 This function does check basic UTF-8 sanity, it does however NOT CHECK
3376 if the string contains surrogates, and if all continuation bytes are
3377 within the correct ranges, these checks are performed in
3378 PyUnicode_DecodeUTF8Stateful.
3379
3380 If it sets has_errors to 1, it means the value of unicode_size and max_char
3381 will be bogus and you should not rely on useful information in them.
3382 */
3383static Py_UCS4
3384utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3385 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3386 int *has_errors)
3387{
3388 Py_ssize_t n;
3389 Py_ssize_t char_count = 0;
3390 Py_UCS4 max_char = 127, new_max;
3391 Py_UCS4 upper_bound;
3392 const unsigned char *p = (const unsigned char *)s;
3393 const unsigned char *end = p + string_size;
3394 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3395 int err = 0;
3396
3397 for (; p < end && !err; ++p, ++char_count) {
3398 /* Only check value if it's not a ASCII char... */
3399 if (*p < 0x80) {
3400 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3401 an explanation. */
3402 if (!((size_t) p & LONG_PTR_MASK)) {
3403 /* Help register allocation */
3404 register const unsigned char *_p = p;
3405 while (_p < aligned_end) {
3406 unsigned long value = *(unsigned long *) _p;
3407 if (value & ASCII_CHAR_MASK)
3408 break;
3409 _p += SIZEOF_LONG;
3410 char_count += SIZEOF_LONG;
3411 }
3412 p = _p;
3413 if (p == end)
3414 break;
3415 }
3416 }
3417 if (*p >= 0x80) {
3418 n = utf8_code_length[*p];
3419 new_max = max_char;
3420 switch (n) {
3421 /* invalid start byte */
3422 case 0:
3423 err = 1;
3424 break;
3425 case 2:
3426 /* Code points between 0x00FF and 0x07FF inclusive.
3427 Approximate the upper bound of the code point,
3428 if this flips over 255 we can be sure it will be more
3429 than 255 and the string will need 2 bytes per code coint,
3430 if it stays under or equal to 255, we can be sure 1 byte
3431 is enough.
3432 ((*p & 0b00011111) << 6) | 0b00111111 */
3433 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3434 if (max_char < upper_bound)
3435 new_max = upper_bound;
3436 /* Ensure we track at least that we left ASCII space. */
3437 if (new_max < 128)
3438 new_max = 128;
3439 break;
3440 case 3:
3441 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3442 always > 255 and <= 65535 and will always need 2 bytes. */
3443 if (max_char < 65535)
3444 new_max = 65535;
3445 break;
3446 case 4:
3447 /* Code point will be above 0xFFFF for sure in this case. */
3448 new_max = 65537;
3449 break;
3450 /* Internal error, this should be caught by the first if */
3451 case 1:
3452 default:
3453 assert(0 && "Impossible case in utf8_max_char_and_size");
3454 err = 1;
3455 }
3456 /* Instead of number of overall bytes for this code point,
3457 n containts the number of following bytes: */
3458 --n;
3459 /* Check if the follow up chars are all valid continuation bytes */
3460 if (n >= 1) {
3461 const unsigned char *cont;
3462 if ((p + n) >= end) {
3463 if (consumed == 0)
3464 /* incomplete data, non-incremental decoding */
3465 err = 1;
3466 break;
3467 }
3468 for (cont = p + 1; cont < (p + n); ++cont) {
3469 if ((*cont & 0xc0) != 0x80) {
3470 err = 1;
3471 break;
3472 }
3473 }
3474 p += n;
3475 }
3476 else
3477 err = 1;
3478 max_char = new_max;
3479 }
3480 }
3481
3482 if (unicode_size)
3483 *unicode_size = char_count;
3484 if (has_errors)
3485 *has_errors = err;
3486 return max_char;
3487}
3488
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  Like PyUnicode_WRITE, but
   PyUnicode_WCHAR_KIND is accepted as well, so the macro can also write
   into the wstr (Py_UNICODE) buffer of the legacy unicode representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.
   `index` and `value` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3503
Alexander Belopolsky40018472011-02-26 01:02:56 +00003504PyObject *
3505PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003506 Py_ssize_t size,
3507 const char *errors,
3508 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003510 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003512 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003513 Py_ssize_t startinpos;
3514 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003515 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003517 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 PyObject *errorHandler = NULL;
3519 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520 Py_UCS4 maxchar = 0;
3521 Py_ssize_t unicode_size;
3522 Py_ssize_t i;
3523 int kind;
3524 void *data;
3525 int has_errors;
3526 Py_UNICODE *error_outptr;
3527#if SIZEOF_WCHAR_T == 2
3528 Py_ssize_t wchar_offset = 0;
3529#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530
Walter Dörwald69652032004-09-07 20:24:22 +00003531 if (size == 0) {
3532 if (consumed)
3533 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3537 consumed, &has_errors);
3538 if (has_errors) {
3539 unicode = _PyUnicode_New(size);
3540 if (!unicode)
3541 return NULL;
3542 kind = PyUnicode_WCHAR_KIND;
3543 data = PyUnicode_AS_UNICODE(unicode);
3544 assert(data != NULL);
3545 }
3546 else {
3547 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3548 if (!unicode)
3549 return NULL;
3550 /* When the string is ASCII only, just use memcpy and return.
3551 unicode_size may be != size if there is an incomplete UTF-8
3552 sequence at the end of the ASCII block. */
3553 if (maxchar < 128 && size == unicode_size) {
3554 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3555 return (PyObject *)unicode;
3556 }
3557 kind = PyUnicode_KIND(unicode);
3558 data = PyUnicode_DATA(unicode);
3559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003563 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564
3565 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003566 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567
3568 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003569 /* Fast path for runs of ASCII characters. Given that common UTF-8
3570 input will consist of an overwhelming majority of ASCII
3571 characters, we try to optimize for this case by checking
3572 as many characters as a C 'long' can contain.
3573 First, check if we can do an aligned read, as most CPUs have
3574 a penalty for unaligned reads.
3575 */
3576 if (!((size_t) s & LONG_PTR_MASK)) {
3577 /* Help register allocation */
3578 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003580 while (_s < aligned_end) {
3581 /* Read a whole long at a time (either 4 or 8 bytes),
3582 and do a fast unrolled copy if it only contains ASCII
3583 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584 unsigned long value = *(unsigned long *) _s;
3585 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003586 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003587 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3588 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3589 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3590 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003591#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003592 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3593 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3594 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3595 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003596#endif
3597 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003599 }
3600 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003601 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003602 if (s == e)
3603 break;
3604 ch = (unsigned char)*s;
3605 }
3606 }
3607
3608 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 s++;
3611 continue;
3612 }
3613
3614 n = utf8_code_length[ch];
3615
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003616 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 if (consumed)
3618 break;
3619 else {
3620 errmsg = "unexpected end of data";
3621 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003622 endinpos = startinpos+1;
3623 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3624 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 goto utf8Error;
3626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628
3629 switch (n) {
3630
3631 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003632 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 startinpos = s-starts;
3634 endinpos = startinpos+1;
3635 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003638 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 startinpos = s-starts;
3640 endinpos = startinpos+1;
3641 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642
3643 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003644 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003645 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003647 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 goto utf8Error;
3649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003651 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003652 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 break;
3654
3655 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003656 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3657 will result in surrogates in range d800-dfff. Surrogates are
3658 not valid UTF-8 so they are rejected.
3659 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3660 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003661 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003662 (s[2] & 0xc0) != 0x80 ||
3663 ((unsigned char)s[0] == 0xE0 &&
3664 (unsigned char)s[1] < 0xA0) ||
3665 ((unsigned char)s[0] == 0xED &&
3666 (unsigned char)s[1] > 0x9F)) {
3667 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003669 endinpos = startinpos + 1;
3670
3671 /* if s[1] first two bits are 1 and 0, then the invalid
3672 continuation byte is s[2], so increment endinpos by 1,
3673 if not, s[1] is invalid and endinpos doesn't need to
3674 be incremented. */
3675 if ((s[1] & 0xC0) == 0x80)
3676 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto utf8Error;
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003680 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003682 break;
3683
3684 case 4:
3685 if ((s[1] & 0xc0) != 0x80 ||
3686 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003687 (s[3] & 0xc0) != 0x80 ||
3688 ((unsigned char)s[0] == 0xF0 &&
3689 (unsigned char)s[1] < 0x90) ||
3690 ((unsigned char)s[0] == 0xF4 &&
3691 (unsigned char)s[1] > 0x8F)) {
3692 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003694 endinpos = startinpos + 1;
3695 if ((s[1] & 0xC0) == 0x80) {
3696 endinpos++;
3697 if ((s[2] & 0xC0) == 0x80)
3698 endinpos++;
3699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 goto utf8Error;
3701 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003702 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003703 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3704 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 /* If the string is flexible or we have native UCS-4, write
3707 directly.. */
3708 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3709 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 else {
3712 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714 /* translate from 10000..10FFFF to 0..FFFF */
3715 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 /* high surrogate = top 10 bits added to D800 */
3718 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3719 (Py_UNICODE)(0xD800 + (ch >> 10)));
3720
3721 /* low surrogate = bottom 10 bits added to DC00 */
3722 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3723 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3724 }
3725#if SIZEOF_WCHAR_T == 2
3726 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003727#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
3730 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003732
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 /* If this is not yet a resizable string, make it one.. */
3735 if (kind != PyUnicode_WCHAR_KIND) {
3736 const Py_UNICODE *u;
3737 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3738 if (!new_unicode)
3739 goto onError;
3740 u = PyUnicode_AsUnicode((PyObject *)unicode);
3741 if (!u)
3742 goto onError;
3743#if SIZEOF_WCHAR_T == 2
3744 i += wchar_offset;
3745#endif
3746 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3747 Py_DECREF(unicode);
3748 unicode = new_unicode;
3749 kind = 0;
3750 data = PyUnicode_AS_UNICODE(new_unicode);
3751 assert(data != NULL);
3752 }
3753 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 if (unicode_decode_call_errorhandler(
3755 errors, &errorHandler,
3756 "utf8", errmsg,
3757 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 /* Update data because unicode_decode_call_errorhandler might have
3761 re-created or resized the unicode object. */
3762 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 /* Ensure the unicode_size calculation above was correct: */
3766 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3767
Walter Dörwald69652032004-09-07 20:24:22 +00003768 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 /* Adjust length and ready string when it contained errors and
3772 is of the old resizable kind. */
3773 if (kind == PyUnicode_WCHAR_KIND) {
3774 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3775 PyUnicode_READY(unicode) == -1)
3776 goto onError;
3777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 Py_XDECREF(errorHandler);
3780 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 if (PyUnicode_READY(unicode) == -1) {
3782 Py_DECREF(unicode);
3783 return NULL;
3784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return (PyObject *)unicode;
3786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 Py_XDECREF(errorHandler);
3789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 Py_DECREF(unicode);
3791 return NULL;
3792}
3793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003795
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003796#ifdef __APPLE__
3797
3798/* Simplified UTF-8 decoder using surrogateescape error handler,
3799 used to decode the command line arguments on Mac OS X. */
3800
3801wchar_t*
3802_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3803{
3804 int n;
3805 const char *e;
3806 wchar_t *unicode, *p;
3807
3808 /* Note: size will always be longer than the resulting Unicode
3809 character count */
3810 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3811 PyErr_NoMemory();
3812 return NULL;
3813 }
3814 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3815 if (!unicode)
3816 return NULL;
3817
3818 /* Unpack UTF-8 encoded data */
3819 p = unicode;
3820 e = s + size;
3821 while (s < e) {
3822 Py_UCS4 ch = (unsigned char)*s;
3823
3824 if (ch < 0x80) {
3825 *p++ = (wchar_t)ch;
3826 s++;
3827 continue;
3828 }
3829
3830 n = utf8_code_length[ch];
3831 if (s + n > e) {
3832 goto surrogateescape;
3833 }
3834
3835 switch (n) {
3836 case 0:
3837 case 1:
3838 goto surrogateescape;
3839
3840 case 2:
3841 if ((s[1] & 0xc0) != 0x80)
3842 goto surrogateescape;
3843 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3844 assert ((ch > 0x007F) && (ch <= 0x07FF));
3845 *p++ = (wchar_t)ch;
3846 break;
3847
3848 case 3:
3849 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3850 will result in surrogates in range d800-dfff. Surrogates are
3851 not valid UTF-8 so they are rejected.
3852 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3853 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3854 if ((s[1] & 0xc0) != 0x80 ||
3855 (s[2] & 0xc0) != 0x80 ||
3856 ((unsigned char)s[0] == 0xE0 &&
3857 (unsigned char)s[1] < 0xA0) ||
3858 ((unsigned char)s[0] == 0xED &&
3859 (unsigned char)s[1] > 0x9F)) {
3860
3861 goto surrogateescape;
3862 }
3863 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3864 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003866 break;
3867
3868 case 4:
3869 if ((s[1] & 0xc0) != 0x80 ||
3870 (s[2] & 0xc0) != 0x80 ||
3871 (s[3] & 0xc0) != 0x80 ||
3872 ((unsigned char)s[0] == 0xF0 &&
3873 (unsigned char)s[1] < 0x90) ||
3874 ((unsigned char)s[0] == 0xF4 &&
3875 (unsigned char)s[1] > 0x8F)) {
3876 goto surrogateescape;
3877 }
3878 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3879 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3880 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3881
3882#if SIZEOF_WCHAR_T == 4
3883 *p++ = (wchar_t)ch;
3884#else
3885 /* compute and append the two surrogates: */
3886
3887 /* translate from 10000..10FFFF to 0..FFFF */
3888 ch -= 0x10000;
3889
3890 /* high surrogate = top 10 bits added to D800 */
3891 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3892
3893 /* low surrogate = bottom 10 bits added to DC00 */
3894 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3895#endif
3896 break;
3897 }
3898 s += n;
3899 continue;
3900
3901 surrogateescape:
3902 *p++ = 0xDC00 + ch;
3903 s++;
3904 }
3905 *p = L'\0';
3906 return unicode;
3907}
3908
3909#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911/* Primary internal function which creates utf8 encoded bytes objects.
3912
3913 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003914 and allocate exactly as much space needed at the end. Else allocate the
3915 maximum possible needed (4 result bytes per Unicode character), and return
3916 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003917*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003918PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920{
Tim Peters602f7402002-04-27 18:03:26 +00003921#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003922
Guido van Rossum98297ee2007-11-06 21:34:58 +00003923 Py_ssize_t i; /* index into s of next input byte */
3924 PyObject *result; /* result string object */
3925 char *p; /* next free byte in output buffer */
3926 Py_ssize_t nallocated; /* number of result bytes allocated */
3927 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003928 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003929 PyObject *errorHandler = NULL;
3930 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 int kind;
3932 void *data;
3933 Py_ssize_t size;
3934 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3935#if SIZEOF_WCHAR_T == 2
3936 Py_ssize_t wchar_offset = 0;
3937#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 return NULL;
3942 }
3943
3944 if (PyUnicode_READY(unicode) == -1)
3945 return NULL;
3946
3947 if (_PyUnicode_UTF8(unicode))
3948 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3949 _PyUnicode_UTF8_LENGTH(unicode));
3950
3951 kind = PyUnicode_KIND(unicode);
3952 data = PyUnicode_DATA(unicode);
3953 size = PyUnicode_GET_LENGTH(unicode);
3954
Tim Peters602f7402002-04-27 18:03:26 +00003955 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956
Tim Peters602f7402002-04-27 18:03:26 +00003957 if (size <= MAX_SHORT_UNICHARS) {
3958 /* Write into the stack buffer; nallocated can't overflow.
3959 * At the end, we'll allocate exactly as much heap space as it
3960 * turns out we need.
3961 */
3962 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003963 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003964 p = stackbuf;
3965 }
3966 else {
3967 /* Overallocate on the heap, and give the excess back at the end. */
3968 nallocated = size * 4;
3969 if (nallocated / 4 != size) /* overflow! */
3970 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003971 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003972 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003973 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003974 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003975 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003976
Tim Peters602f7402002-04-27 18:03:26 +00003977 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003979
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003980 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003981 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003983
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003985 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003986 *p++ = (char)(0xc0 | (ch >> 6));
3987 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003988 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 Py_ssize_t newpos;
3990 PyObject *rep;
3991 Py_ssize_t repsize, k, startpos;
3992 startpos = i-1;
3993#if SIZEOF_WCHAR_T == 2
3994 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003995#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 rep = unicode_encode_call_errorhandler(
3997 errors, &errorHandler, "utf-8", "surrogates not allowed",
3998 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3999 &exc, startpos, startpos+1, &newpos);
4000 if (!rep)
4001 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 if (PyBytes_Check(rep))
4004 repsize = PyBytes_GET_SIZE(rep);
4005 else
4006 repsize = PyUnicode_GET_SIZE(rep);
4007
4008 if (repsize > 4) {
4009 Py_ssize_t offset;
4010
4011 if (result == NULL)
4012 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004013 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4017 /* integer overflow */
4018 PyErr_NoMemory();
4019 goto error;
4020 }
4021 nallocated += repsize - 4;
4022 if (result != NULL) {
4023 if (_PyBytes_Resize(&result, nallocated) < 0)
4024 goto error;
4025 } else {
4026 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004027 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 goto error;
4029 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4030 }
4031 p = PyBytes_AS_STRING(result) + offset;
4032 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 if (PyBytes_Check(rep)) {
4035 char *prep = PyBytes_AS_STRING(rep);
4036 for(k = repsize; k > 0; k--)
4037 *p++ = *prep++;
4038 } else /* rep is unicode */ {
4039 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4040 Py_UNICODE c;
4041
4042 for(k=0; k<repsize; k++) {
4043 c = prep[k];
4044 if (0x80 <= c) {
4045 raise_encode_exception(&exc, "utf-8",
4046 PyUnicode_AS_UNICODE(unicode),
4047 size, i-1, i,
4048 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004049 goto error;
4050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004052 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004055 } else if (ch < 0x10000) {
4056 *p++ = (char)(0xe0 | (ch >> 12));
4057 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4058 *p++ = (char)(0x80 | (ch & 0x3f));
4059 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004060 /* Encode UCS4 Unicode ordinals */
4061 *p++ = (char)(0xf0 | (ch >> 18));
4062 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4063 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4064 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065#if SIZEOF_WCHAR_T == 2
4066 wchar_offset++;
4067#endif
Tim Peters602f7402002-04-27 18:03:26 +00004068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004070
Guido van Rossum98297ee2007-11-06 21:34:58 +00004071 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004072 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004073 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004074 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004075 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004076 }
4077 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004078 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004079 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004080 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004081 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004084 Py_XDECREF(errorHandler);
4085 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004086 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004087 error:
4088 Py_XDECREF(errorHandler);
4089 Py_XDECREF(exc);
4090 Py_XDECREF(result);
4091 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004092
Tim Peters602f7402002-04-27 18:03:26 +00004093#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4098 Py_ssize_t size,
4099 const char *errors)
4100{
4101 PyObject *v, *unicode;
4102
4103 unicode = PyUnicode_FromUnicode(s, size);
4104 if (unicode == NULL)
4105 return NULL;
4106 v = _PyUnicode_AsUTF8String(unicode, errors);
4107 Py_DECREF(unicode);
4108 return v;
4109}
4110
4111PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004112PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115}
4116
Walter Dörwald41980ca2007-08-16 21:55:45 +00004117/* --- UTF-32 Codec ------------------------------------------------------- */
4118
4119PyObject *
4120PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 Py_ssize_t size,
4122 const char *errors,
4123 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004124{
4125 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4126}
4127
4128PyObject *
4129PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 Py_ssize_t size,
4131 const char *errors,
4132 int *byteorder,
4133 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004134{
4135 const char *starts = s;
4136 Py_ssize_t startinpos;
4137 Py_ssize_t endinpos;
4138 Py_ssize_t outpos;
4139 PyUnicodeObject *unicode;
4140 Py_UNICODE *p;
4141#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004142 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004143 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004144#else
4145 const int pairs = 0;
4146#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004147 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004148 int bo = 0; /* assume native ordering by default */
4149 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004150 /* Offsets from q for retrieving bytes in the right order. */
4151#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4152 int iorder[] = {0, 1, 2, 3};
4153#else
4154 int iorder[] = {3, 2, 1, 0};
4155#endif
4156 PyObject *errorHandler = NULL;
4157 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004158
Walter Dörwald41980ca2007-08-16 21:55:45 +00004159 q = (unsigned char *)s;
4160 e = q + size;
4161
4162 if (byteorder)
4163 bo = *byteorder;
4164
4165 /* Check for BOM marks (U+FEFF) in the input and adjust current
4166 byte order setting accordingly. In native mode, the leading BOM
4167 mark is skipped, in all other modes, it is copied to the output
4168 stream as-is (giving a ZWNBSP character). */
4169 if (bo == 0) {
4170 if (size >= 4) {
4171 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004173#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 if (bom == 0x0000FEFF) {
4175 q += 4;
4176 bo = -1;
4177 }
4178 else if (bom == 0xFFFE0000) {
4179 q += 4;
4180 bo = 1;
4181 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004182#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 if (bom == 0x0000FEFF) {
4184 q += 4;
4185 bo = 1;
4186 }
4187 else if (bom == 0xFFFE0000) {
4188 q += 4;
4189 bo = -1;
4190 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004191#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004193 }
4194
4195 if (bo == -1) {
4196 /* force LE */
4197 iorder[0] = 0;
4198 iorder[1] = 1;
4199 iorder[2] = 2;
4200 iorder[3] = 3;
4201 }
4202 else if (bo == 1) {
4203 /* force BE */
4204 iorder[0] = 3;
4205 iorder[1] = 2;
4206 iorder[2] = 1;
4207 iorder[3] = 0;
4208 }
4209
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004210 /* On narrow builds we split characters outside the BMP into two
4211 codepoints => count how much extra space we need. */
4212#ifndef Py_UNICODE_WIDE
4213 for (qq = q; qq < e; qq += 4)
4214 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4215 pairs++;
4216#endif
4217
4218 /* This might be one to much, because of a BOM */
4219 unicode = _PyUnicode_New((size+3)/4+pairs);
4220 if (!unicode)
4221 return NULL;
4222 if (size == 0)
4223 return (PyObject *)unicode;
4224
4225 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004227
Walter Dörwald41980ca2007-08-16 21:55:45 +00004228 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 Py_UCS4 ch;
4230 /* remaining bytes at the end? (size should be divisible by 4) */
4231 if (e-q<4) {
4232 if (consumed)
4233 break;
4234 errmsg = "truncated data";
4235 startinpos = ((const char *)q)-starts;
4236 endinpos = ((const char *)e)-starts;
4237 goto utf32Error;
4238 /* The remaining input chars are ignored if the callback
4239 chooses to skip the input */
4240 }
4241 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4242 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004243
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 if (ch >= 0x110000)
4245 {
4246 errmsg = "codepoint not in range(0x110000)";
4247 startinpos = ((const char *)q)-starts;
4248 endinpos = startinpos+4;
4249 goto utf32Error;
4250 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004251#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 if (ch >= 0x10000)
4253 {
4254 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4255 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4256 }
4257 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004258#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 *p++ = ch;
4260 q += 4;
4261 continue;
4262 utf32Error:
4263 outpos = p-PyUnicode_AS_UNICODE(unicode);
4264 if (unicode_decode_call_errorhandler(
4265 errors, &errorHandler,
4266 "utf32", errmsg,
4267 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4268 &unicode, &outpos, &p))
4269 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004270 }
4271
4272 if (byteorder)
4273 *byteorder = bo;
4274
4275 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004277
4278 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004280 goto onError;
4281
4282 Py_XDECREF(errorHandler);
4283 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 if (PyUnicode_READY(unicode) == -1) {
4285 Py_DECREF(unicode);
4286 return NULL;
4287 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004288 return (PyObject *)unicode;
4289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291 Py_DECREF(unicode);
4292 Py_XDECREF(errorHandler);
4293 Py_XDECREF(exc);
4294 return NULL;
4295}
4296
4297PyObject *
4298PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 Py_ssize_t size,
4300 const char *errors,
4301 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004302{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004303 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004305 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004307 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004308#else
4309 const int pairs = 0;
4310#endif
4311 /* Offsets from p for storing byte pairs in the right order. */
4312#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4313 int iorder[] = {0, 1, 2, 3};
4314#else
4315 int iorder[] = {3, 2, 1, 0};
4316#endif
4317
Benjamin Peterson29060642009-01-31 22:14:21 +00004318#define STORECHAR(CH) \
4319 do { \
4320 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4321 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4322 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4323 p[iorder[0]] = (CH) & 0xff; \
4324 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325 } while(0)
4326
4327 /* In narrow builds we can output surrogate pairs as one codepoint,
4328 so we need less space. */
4329#ifndef Py_UNICODE_WIDE
4330 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4332 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4333 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004335 nsize = (size - pairs + (byteorder == 0));
4336 bytesize = nsize * 4;
4337 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004339 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340 if (v == NULL)
4341 return NULL;
4342
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004343 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004344 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004346 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004347 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348
4349 if (byteorder == -1) {
4350 /* force LE */
4351 iorder[0] = 0;
4352 iorder[1] = 1;
4353 iorder[2] = 2;
4354 iorder[3] = 3;
4355 }
4356 else if (byteorder == 1) {
4357 /* force BE */
4358 iorder[0] = 3;
4359 iorder[1] = 2;
4360 iorder[2] = 1;
4361 iorder[3] = 0;
4362 }
4363
4364 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4368 Py_UCS4 ch2 = *s;
4369 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4370 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4371 s++;
4372 size--;
4373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004374 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004375#endif
4376 STORECHAR(ch);
4377 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004378
4379 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004380 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004381#undef STORECHAR
4382}
4383
Alexander Belopolsky40018472011-02-26 01:02:56 +00004384PyObject *
4385PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004386{
4387 if (!PyUnicode_Check(unicode)) {
4388 PyErr_BadArgument();
4389 return NULL;
4390 }
4391 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 PyUnicode_GET_SIZE(unicode),
4393 NULL,
4394 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004395}
4396
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397/* --- UTF-16 Codec ------------------------------------------------------- */
4398
Tim Peters772747b2001-08-09 22:21:55 +00004399PyObject *
4400PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 Py_ssize_t size,
4402 const char *errors,
4403 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404{
Walter Dörwald69652032004-09-07 20:24:22 +00004405 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4406}
4407
Antoine Pitrouab868312009-01-10 15:40:25 +00004408/* Two masks for fast checking of whether a C 'long' may contain
4409 UTF16-encoded surrogate characters. This is an efficient heuristic,
4410 assuming that non-surrogate characters with a code point >= 0x8000 are
4411 rare in most input.
4412 FAST_CHAR_MASK is used when the input is in native byte ordering,
4413 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004414*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004415#if (SIZEOF_LONG == 8)
4416# define FAST_CHAR_MASK 0x8000800080008000L
4417# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4418#elif (SIZEOF_LONG == 4)
4419# define FAST_CHAR_MASK 0x80008000L
4420# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4421#else
4422# error C 'long' size should be either 4 or 8!
4423#endif
4424
Walter Dörwald69652032004-09-07 20:24:22 +00004425PyObject *
4426PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 Py_ssize_t size,
4428 const char *errors,
4429 int *byteorder,
4430 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t startinpos;
4434 Py_ssize_t endinpos;
4435 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 PyUnicodeObject *unicode;
4437 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004438 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004439 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004440 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004441 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004442 /* Offsets from q for retrieving byte pairs in the right order. */
4443#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4444 int ihi = 1, ilo = 0;
4445#else
4446 int ihi = 0, ilo = 1;
4447#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 PyObject *errorHandler = NULL;
4449 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
4451 /* Note: size will always be longer than the resulting Unicode
4452 character count */
4453 unicode = _PyUnicode_New(size);
4454 if (!unicode)
4455 return NULL;
4456 if (size == 0)
4457 return (PyObject *)unicode;
4458
4459 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004461 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463
4464 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004465 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004467 /* Check for BOM marks (U+FEFF) in the input and adjust current
4468 byte order setting accordingly. In native mode, the leading BOM
4469 mark is skipped, in all other modes, it is copied to the output
4470 stream as-is (giving a ZWNBSP character). */
4471 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004472 if (size >= 2) {
4473 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 if (bom == 0xFEFF) {
4476 q += 2;
4477 bo = -1;
4478 }
4479 else if (bom == 0xFFFE) {
4480 q += 2;
4481 bo = 1;
4482 }
Tim Petersced69f82003-09-16 20:30:58 +00004483#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 if (bom == 0xFEFF) {
4485 q += 2;
4486 bo = 1;
4487 }
4488 else if (bom == 0xFFFE) {
4489 q += 2;
4490 bo = -1;
4491 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004492#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495
Tim Peters772747b2001-08-09 22:21:55 +00004496 if (bo == -1) {
4497 /* force LE */
4498 ihi = 1;
4499 ilo = 0;
4500 }
4501 else if (bo == 1) {
4502 /* force BE */
4503 ihi = 0;
4504 ilo = 1;
4505 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004506#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4507 native_ordering = ilo < ihi;
4508#else
4509 native_ordering = ilo > ihi;
4510#endif
Tim Peters772747b2001-08-09 22:21:55 +00004511
Antoine Pitrouab868312009-01-10 15:40:25 +00004512 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004513 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004515 /* First check for possible aligned read of a C 'long'. Unaligned
4516 reads are more expensive, better to defer to another iteration. */
4517 if (!((size_t) q & LONG_PTR_MASK)) {
4518 /* Fast path for runs of non-surrogate chars. */
4519 register const unsigned char *_q = q;
4520 Py_UNICODE *_p = p;
4521 if (native_ordering) {
4522 /* Native ordering is simple: as long as the input cannot
4523 possibly contain a surrogate char, do an unrolled copy
4524 of several 16-bit code points to the target object.
4525 The non-surrogate check is done on several input bytes
4526 at a time (as many as a C 'long' can contain). */
4527 while (_q < aligned_end) {
4528 unsigned long data = * (unsigned long *) _q;
4529 if (data & FAST_CHAR_MASK)
4530 break;
4531 _p[0] = ((unsigned short *) _q)[0];
4532 _p[1] = ((unsigned short *) _q)[1];
4533#if (SIZEOF_LONG == 8)
4534 _p[2] = ((unsigned short *) _q)[2];
4535 _p[3] = ((unsigned short *) _q)[3];
4536#endif
4537 _q += SIZEOF_LONG;
4538 _p += SIZEOF_LONG / 2;
4539 }
4540 }
4541 else {
4542 /* Byteswapped ordering is similar, but we must decompose
4543 the copy bytewise, and take care of zero'ing out the
4544 upper bytes if the target object is in 32-bit units
4545 (that is, in UCS-4 builds). */
4546 while (_q < aligned_end) {
4547 unsigned long data = * (unsigned long *) _q;
4548 if (data & SWAPPED_FAST_CHAR_MASK)
4549 break;
4550 /* Zero upper bytes in UCS-4 builds */
4551#if (Py_UNICODE_SIZE > 2)
4552 _p[0] = 0;
4553 _p[1] = 0;
4554#if (SIZEOF_LONG == 8)
4555 _p[2] = 0;
4556 _p[3] = 0;
4557#endif
4558#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004559 /* Issue #4916; UCS-4 builds on big endian machines must
4560 fill the two last bytes of each 4-byte unit. */
4561#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4562# define OFF 2
4563#else
4564# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004565#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004566 ((unsigned char *) _p)[OFF + 1] = _q[0];
4567 ((unsigned char *) _p)[OFF + 0] = _q[1];
4568 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4569 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4570#if (SIZEOF_LONG == 8)
4571 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4572 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4573 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4574 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4575#endif
4576#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004577 _q += SIZEOF_LONG;
4578 _p += SIZEOF_LONG / 2;
4579 }
4580 }
4581 p = _p;
4582 q = _q;
4583 if (q >= e)
4584 break;
4585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587
Benjamin Peterson14339b62009-01-31 16:36:08 +00004588 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004589
4590 if (ch < 0xD800 || ch > 0xDFFF) {
4591 *p++ = ch;
4592 continue;
4593 }
4594
4595 /* UTF-16 code pair: */
4596 if (q > e) {
4597 errmsg = "unexpected end of data";
4598 startinpos = (((const char *)q) - 2) - starts;
4599 endinpos = ((const char *)e) + 1 - starts;
4600 goto utf16Error;
4601 }
4602 if (0xD800 <= ch && ch <= 0xDBFF) {
4603 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4604 q += 2;
4605 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004606#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 *p++ = ch;
4608 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004609#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004611#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 continue;
4613 }
4614 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004615 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 startinpos = (((const char *)q)-4)-starts;
4617 endinpos = startinpos+2;
4618 goto utf16Error;
4619 }
4620
Benjamin Peterson14339b62009-01-31 16:36:08 +00004621 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 errmsg = "illegal encoding";
4623 startinpos = (((const char *)q)-2)-starts;
4624 endinpos = startinpos+2;
4625 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004626
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 utf16Error:
4628 outpos = p - PyUnicode_AS_UNICODE(unicode);
4629 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004630 errors,
4631 &errorHandler,
4632 "utf16", errmsg,
4633 &starts,
4634 (const char **)&e,
4635 &startinpos,
4636 &endinpos,
4637 &exc,
4638 (const char **)&q,
4639 &unicode,
4640 &outpos,
4641 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004644 /* remaining byte at the end? (size should be even) */
4645 if (e == q) {
4646 if (!consumed) {
4647 errmsg = "truncated data";
4648 startinpos = ((const char *)q) - starts;
4649 endinpos = ((const char *)e) + 1 - starts;
4650 outpos = p - PyUnicode_AS_UNICODE(unicode);
4651 if (unicode_decode_call_errorhandler(
4652 errors,
4653 &errorHandler,
4654 "utf16", errmsg,
4655 &starts,
4656 (const char **)&e,
4657 &startinpos,
4658 &endinpos,
4659 &exc,
4660 (const char **)&q,
4661 &unicode,
4662 &outpos,
4663 &p))
4664 goto onError;
4665 /* The remaining input chars are ignored if the callback
4666 chooses to skip the input */
4667 }
4668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669
4670 if (byteorder)
4671 *byteorder = bo;
4672
Walter Dörwald69652032004-09-07 20:24:22 +00004673 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004675
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 goto onError;
4679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 Py_XDECREF(errorHandler);
4681 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 if (PyUnicode_READY(unicode) == -1) {
4683 Py_DECREF(unicode);
4684 return NULL;
4685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 return (PyObject *)unicode;
4687
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_XDECREF(errorHandler);
4691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 return NULL;
4693}
4694
Antoine Pitrouab868312009-01-10 15:40:25 +00004695#undef FAST_CHAR_MASK
4696#undef SWAPPED_FAST_CHAR_MASK
4697
Tim Peters772747b2001-08-09 22:21:55 +00004698PyObject *
4699PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 Py_ssize_t size,
4701 const char *errors,
4702 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004704 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004705 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004706 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004707#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004708 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004709#else
4710 const int pairs = 0;
4711#endif
Tim Peters772747b2001-08-09 22:21:55 +00004712 /* Offsets from p for storing byte pairs in the right order. */
4713#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4714 int ihi = 1, ilo = 0;
4715#else
4716 int ihi = 0, ilo = 1;
4717#endif
4718
Benjamin Peterson29060642009-01-31 22:14:21 +00004719#define STORECHAR(CH) \
4720 do { \
4721 p[ihi] = ((CH) >> 8) & 0xff; \
4722 p[ilo] = (CH) & 0xff; \
4723 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004724 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004726#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004727 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 if (s[i] >= 0x10000)
4729 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004730#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004731 /* 2 * (size + pairs + (byteorder == 0)) */
4732 if (size > PY_SSIZE_T_MAX ||
4733 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004735 nsize = size + pairs + (byteorder == 0);
4736 bytesize = nsize * 2;
4737 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004739 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 if (v == NULL)
4741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004743 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004746 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004747 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004748
4749 if (byteorder == -1) {
4750 /* force LE */
4751 ihi = 1;
4752 ilo = 0;
4753 }
4754 else if (byteorder == 1) {
4755 /* force BE */
4756 ihi = 0;
4757 ilo = 1;
4758 }
4759
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004760 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 Py_UNICODE ch = *s++;
4762 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004763#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 if (ch >= 0x10000) {
4765 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4766 ch = 0xD800 | ((ch-0x10000) >> 10);
4767 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004768#endif
Tim Peters772747b2001-08-09 22:21:55 +00004769 STORECHAR(ch);
4770 if (ch2)
4771 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004773
4774 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004775 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004776#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777}
4778
Alexander Belopolsky40018472011-02-26 01:02:56 +00004779PyObject *
4780PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
4782 if (!PyUnicode_Check(unicode)) {
4783 PyErr_BadArgument();
4784 return NULL;
4785 }
4786 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 PyUnicode_GET_SIZE(unicode),
4788 NULL,
4789 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790}
4791
4792/* --- Unicode Escape Codec ----------------------------------------------- */
4793
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string still decodes to a pure ASCII string and, if so, how
   long that string is.

   Returns the number of characters the decoded string will hold, or -1
   when the length cannot be predicted this way, i.e. when
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
   Unknown escapes such as "\q" are kept verbatim and count as two
   characters; backslash + newline counts as zero characters.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `p + size`: pointer
       arithmetic that leaves the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
4848
/* Similar to PyUnicode_WRITE, but with only two storage layouts: when
   `kind` is PyUnicode_WCHAR_KIND the value is stored as a Py_UNICODE
   element of `buf`, otherwise `buf` is treated as a byte (ASCII) buffer. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE element of `buf`.  Expects a variable
   named `kind` in the expanding scope and asserts that it is
   PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4862
4863
Fredrik Lundh06d12682001-01-24 07:59:11 +00004864static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004865
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyObject *
4867PyUnicode_DecodeUnicodeEscape(const char *s,
4868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004872 Py_ssize_t startinpos;
4873 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004874 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004878 char* message;
4879 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 PyObject *errorHandler = NULL;
4881 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 Py_ssize_t ascii_length;
4883 Py_ssize_t i;
4884 int kind;
4885 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887 ascii_length = length_of_escaped_ascii_string(s, size);
4888
4889 /* After length_of_escaped_ascii_string() there are two alternatives,
4890 either the string is pure ASCII with named escapes like \n, etc.
4891 and we determined it's exact size (common case)
4892 or it contains \x, \u, ... escape sequences. then we create a
4893 legacy wchar string and resize it at the end of this function. */
4894 if (ascii_length >= 0) {
4895 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4896 if (!v)
4897 goto onError;
4898 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4899 kind = PyUnicode_1BYTE_KIND;
4900 data = PyUnicode_DATA(v);
4901 }
4902 else {
4903 /* Escaped strings will always be longer than the resulting
4904 Unicode string, so we start with size here and then reduce the
4905 length after conversion to the true value.
4906 (but if the error callback returns a long replacement string
4907 we'll have to allocate more space) */
4908 v = _PyUnicode_New(size);
4909 if (!v)
4910 goto onError;
4911 kind = PyUnicode_WCHAR_KIND;
4912 data = PyUnicode_AS_UNICODE(v);
4913 }
4914
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 if (size == 0)
4916 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004917 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004919
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 while (s < end) {
4921 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004922 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925 if (kind == PyUnicode_WCHAR_KIND) {
4926 assert(i < _PyUnicode_WSTR_LENGTH(v));
4927 }
4928 else {
4929 /* The only case in which i == ascii_length is a backslash
4930 followed by a newline. */
4931 assert(i <= ascii_length);
4932 }
4933
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 /* Non-escape characters are interpreted as Unicode ordinals */
4935 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 continue;
4938 }
4939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004940 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 /* \ - Escapes */
4942 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004943 c = *s++;
4944 if (s > end)
4945 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946
4947 if (kind == PyUnicode_WCHAR_KIND) {
4948 assert(i < _PyUnicode_WSTR_LENGTH(v));
4949 }
4950 else {
4951 /* The only case in which i == ascii_length is a backslash
4952 followed by a newline. */
4953 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4954 }
4955
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004956 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4961 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4962 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4963 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4964 /* FF */
4965 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4966 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4967 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4968 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4969 /* VT */
4970 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4971 /* BEL, not classic C */
4972 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 case '0': case '1': case '2': case '3':
4976 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004977 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004978 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004979 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004980 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004981 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004983 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 break;
4985
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 /* hex escapes */
4987 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004989 digits = 2;
4990 message = "truncated \\xXX escape";
4991 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004995 digits = 4;
4996 message = "truncated \\uXXXX escape";
4997 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005000 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005001 digits = 8;
5002 message = "truncated \\UXXXXXXXX escape";
5003 hexescape:
5004 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005005 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 if (s+digits>end) {
5007 endinpos = size;
5008 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 errors, &errorHandler,
5010 "unicodeescape", "end of string in escape sequence",
5011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005012 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 goto nextByte;
5016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 for (j = 0; j < digits; ++j) {
5018 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005019 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 endinpos = (s+j+1)-starts;
5021 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 errors, &errorHandler,
5024 "unicodeescape", message,
5025 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005027 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005030 }
5031 chr = (chr<<4) & ~0xF;
5032 if (c >= '0' && c <= '9')
5033 chr += c - '0';
5034 else if (c >= 'a' && c <= 'f')
5035 chr += 10 + c - 'a';
5036 else
5037 chr += 10 + c - 'A';
5038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005039 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005040 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 /* _decoding_error will have already written into the
5042 target buffer. */
5043 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005044 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005045 /* when we get here, chr is a 32-bit unicode character */
5046 if (chr <= 0xffff)
5047 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005049 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005050 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005051 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005052#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005053 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005054#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005055 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005056 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5057 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005058#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005059 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005061 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 errors, &errorHandler,
5064 "unicodeescape", "illegal Unicode character",
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005067 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005069 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005070 break;
5071
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005073 case 'N':
5074 message = "malformed \\N character escape";
5075 if (ucnhash_CAPI == NULL) {
5076 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005077 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5078 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005079 if (ucnhash_CAPI == NULL)
5080 goto ucnhashError;
5081 }
5082 if (*s == '{') {
5083 const char *start = s+1;
5084 /* look for the closing brace */
5085 while (*s != '}' && s < end)
5086 s++;
5087 if (s > start && s < end && *s == '}') {
5088 /* found a name. look it up in the unicode database */
5089 message = "unknown Unicode character name";
5090 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005091 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5092 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005093 goto store;
5094 }
5095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005097 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 errors, &errorHandler,
5100 "unicodeescape", message,
5101 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005103 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005104 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005105 break;
5106
5107 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005108 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 message = "\\ at end of string";
5111 s--;
5112 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 errors, &errorHandler,
5116 "unicodeescape", message,
5117 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005119 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005121 }
5122 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5124 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005125 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005126 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005131 /* Ensure the length prediction worked in case of ASCII strings */
5132 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5133
5134 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5135 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005137 Py_XDECREF(errorHandler);
5138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005140
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005142 PyErr_SetString(
5143 PyExc_UnicodeError,
5144 "\\N escapes not supported (can't load unicodedata module)"
5145 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005146 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 Py_XDECREF(errorHandler);
5148 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005149 return NULL;
5150
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153 Py_XDECREF(errorHandler);
5154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 return NULL;
5156}
5157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005158#undef WRITE_ASCII_OR_WSTR
5159#undef WRITE_WSTR
5160
/* Return a Unicode-Escape string version of the Unicode object.

   The result is not enclosed in quotes: PyUnicode_EncodeUnicodeEscape()
   takes no `quotes` argument.

*/
5167
Walter Dörwald79e913e2007-05-12 11:08:06 +00005168static const char *hexdigits = "0123456789abcdef";
5169
Alexander Belopolsky40018472011-02-26 01:02:56 +00005170PyObject *
5171PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5172 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005174 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005177#ifdef Py_UNICODE_WIDE
5178 const Py_ssize_t expandsize = 10;
5179#else
5180 const Py_ssize_t expandsize = 6;
5181#endif
5182
Thomas Wouters89f507f2006-12-13 04:49:30 +00005183 /* XXX(nnorwitz): rather than over-allocating, it would be
5184 better to choose a different scheme. Perhaps scan the
5185 first N-chars of the string and allocate based on that size.
5186 */
5187 /* Initial allocation is based on the longest-possible unichr
5188 escape.
5189
5190 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5191 unichr, so in this case it's the longest unichr escape. In
5192 narrow (UTF-16) builds this is five chars per source unichr
5193 since there are two unichrs in the surrogate pair, so in narrow
5194 (UTF-16) builds it's not the longest unichr escape.
5195
5196 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5197 so in the narrow (UTF-16) build case it's the longest unichr
5198 escape.
5199 */
5200
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005201 if (size == 0)
5202 return PyBytes_FromStringAndSize(NULL, 0);
5203
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005204 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005206
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005207 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 2
5209 + expandsize*size
5210 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 if (repr == NULL)
5212 return NULL;
5213
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005214 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 while (size-- > 0) {
5217 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005218
Walter Dörwald79e913e2007-05-12 11:08:06 +00005219 /* Escape backslashes */
5220 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 *p++ = '\\';
5222 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005223 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005224 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005225
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005226#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005227 /* Map 21-bit characters to '\U00xxxxxx' */
5228 else if (ch >= 0x10000) {
5229 *p++ = '\\';
5230 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005231 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5232 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5233 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5234 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5235 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5236 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5237 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5238 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005240 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005241#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5243 else if (ch >= 0xD800 && ch < 0xDC00) {
5244 Py_UNICODE ch2;
5245 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005246
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 ch2 = *s++;
5248 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005249 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5251 *p++ = '\\';
5252 *p++ = 'U';
5253 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5254 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5255 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5256 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5257 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5258 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5259 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5260 *p++ = hexdigits[ucs & 0x0000000F];
5261 continue;
5262 }
5263 /* Fall through: isolated surrogates are copied as-is */
5264 s--;
5265 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005266 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005267#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005270 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 *p++ = '\\';
5272 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005273 *p++ = hexdigits[(ch >> 12) & 0x000F];
5274 *p++ = hexdigits[(ch >> 8) & 0x000F];
5275 *p++ = hexdigits[(ch >> 4) & 0x000F];
5276 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005278
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005279 /* Map special whitespace to '\t', \n', '\r' */
5280 else if (ch == '\t') {
5281 *p++ = '\\';
5282 *p++ = 't';
5283 }
5284 else if (ch == '\n') {
5285 *p++ = '\\';
5286 *p++ = 'n';
5287 }
5288 else if (ch == '\r') {
5289 *p++ = '\\';
5290 *p++ = 'r';
5291 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005292
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005293 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005294 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005296 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005297 *p++ = hexdigits[(ch >> 4) & 0x000F];
5298 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005299 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 /* Copy everything else as-is */
5302 else
5303 *p++ = (char) ch;
5304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005306 assert(p - PyBytes_AS_STRING(repr) > 0);
5307 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5308 return NULL;
5309 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310}
5311
Alexander Belopolsky40018472011-02-26 01:02:56 +00005312PyObject *
5313PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005315 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 if (!PyUnicode_Check(unicode)) {
5317 PyErr_BadArgument();
5318 return NULL;
5319 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005320 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5321 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005322 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323}
5324
5325/* --- Raw Unicode Escape Codec ------------------------------------------- */
5326
Alexander Belopolsky40018472011-02-26 01:02:56 +00005327PyObject *
5328PyUnicode_DecodeRawUnicodeEscape(const char *s,
5329 Py_ssize_t size,
5330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005333 Py_ssize_t startinpos;
5334 Py_ssize_t endinpos;
5335 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 const char *end;
5339 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 PyObject *errorHandler = NULL;
5341 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 /* Escaped strings will always be longer than the resulting
5344 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005345 length after conversion to the true value. (But decoding error
5346 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 v = _PyUnicode_New(size);
5348 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 end = s + size;
5354 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 unsigned char c;
5356 Py_UCS4 x;
5357 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005358 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 /* Non-escape characters are interpreted as Unicode ordinals */
5361 if (*s != '\\') {
5362 *p++ = (unsigned char)*s++;
5363 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 startinpos = s-starts;
5366
5367 /* \u-escapes are only interpreted iff the number of leading
5368 backslashes if odd */
5369 bs = s;
5370 for (;s < end;) {
5371 if (*s != '\\')
5372 break;
5373 *p++ = (unsigned char)*s++;
5374 }
5375 if (((s - bs) & 1) == 0 ||
5376 s >= end ||
5377 (*s != 'u' && *s != 'U')) {
5378 continue;
5379 }
5380 p--;
5381 count = *s=='u' ? 4 : 8;
5382 s++;
5383
5384 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5385 outpos = p-PyUnicode_AS_UNICODE(v);
5386 for (x = 0, i = 0; i < count; ++i, ++s) {
5387 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005388 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 endinpos = s-starts;
5390 if (unicode_decode_call_errorhandler(
5391 errors, &errorHandler,
5392 "rawunicodeescape", "truncated \\uXXXX",
5393 &starts, &end, &startinpos, &endinpos, &exc, &s,
5394 &v, &outpos, &p))
5395 goto onError;
5396 goto nextByte;
5397 }
5398 x = (x<<4) & ~0xF;
5399 if (c >= '0' && c <= '9')
5400 x += c - '0';
5401 else if (c >= 'a' && c <= 'f')
5402 x += 10 + c - 'a';
5403 else
5404 x += 10 + c - 'A';
5405 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005406 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 /* UCS-2 character */
5408 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005409 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 /* UCS-4 character. Either store directly, or as
5411 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005412#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005414#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 x -= 0x10000L;
5416 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5417 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005418#endif
5419 } else {
5420 endinpos = s-starts;
5421 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005422 if (unicode_decode_call_errorhandler(
5423 errors, &errorHandler,
5424 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 &starts, &end, &startinpos, &endinpos, &exc, &s,
5426 &v, &outpos, &p))
5427 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 nextByte:
5430 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005432 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 Py_XDECREF(errorHandler);
5435 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 if (PyUnicode_READY(v) == -1) {
5437 Py_DECREF(v);
5438 return NULL;
5439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005441
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 Py_XDECREF(errorHandler);
5445 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 return NULL;
5447}
5448
Alexander Belopolsky40018472011-02-26 01:02:56 +00005449PyObject *
5450PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5451 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005453 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 char *p;
5455 char *q;
5456
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005457#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005458 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005459#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005460 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005461#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005462
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005463 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005465
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005466 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 if (repr == NULL)
5468 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005469 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005470 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 while (size-- > 0) {
5474 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005475#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* Map 32-bit characters to '\Uxxxxxxxx' */
5477 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005478 *p++ = '\\';
5479 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005480 *p++ = hexdigits[(ch >> 28) & 0xf];
5481 *p++ = hexdigits[(ch >> 24) & 0xf];
5482 *p++ = hexdigits[(ch >> 20) & 0xf];
5483 *p++ = hexdigits[(ch >> 16) & 0xf];
5484 *p++ = hexdigits[(ch >> 12) & 0xf];
5485 *p++ = hexdigits[(ch >> 8) & 0xf];
5486 *p++ = hexdigits[(ch >> 4) & 0xf];
5487 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005488 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005489 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005490#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5492 if (ch >= 0xD800 && ch < 0xDC00) {
5493 Py_UNICODE ch2;
5494 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005495
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 ch2 = *s++;
5497 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005498 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5500 *p++ = '\\';
5501 *p++ = 'U';
5502 *p++ = hexdigits[(ucs >> 28) & 0xf];
5503 *p++ = hexdigits[(ucs >> 24) & 0xf];
5504 *p++ = hexdigits[(ucs >> 20) & 0xf];
5505 *p++ = hexdigits[(ucs >> 16) & 0xf];
5506 *p++ = hexdigits[(ucs >> 12) & 0xf];
5507 *p++ = hexdigits[(ucs >> 8) & 0xf];
5508 *p++ = hexdigits[(ucs >> 4) & 0xf];
5509 *p++ = hexdigits[ucs & 0xf];
5510 continue;
5511 }
5512 /* Fall through: isolated surrogates are copied as-is */
5513 s--;
5514 size++;
5515 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005516#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* Map 16-bit characters to '\uxxxx' */
5518 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 *p++ = '\\';
5520 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005521 *p++ = hexdigits[(ch >> 12) & 0xf];
5522 *p++ = hexdigits[(ch >> 8) & 0xf];
5523 *p++ = hexdigits[(ch >> 4) & 0xf];
5524 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 /* Copy everything else as-is */
5527 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 *p++ = (char) ch;
5529 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530 size = p - q;
5531
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005532 assert(size > 0);
5533 if (_PyBytes_Resize(&repr, size) < 0)
5534 return NULL;
5535 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536}
5537
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyObject *
5539PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005541 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005543 PyErr_BadArgument();
5544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005546 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5547 PyUnicode_GET_SIZE(unicode));
5548
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005549 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550}
5551
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005552/* --- Unicode Internal Codec ------------------------------------------- */
5553
Alexander Belopolsky40018472011-02-26 01:02:56 +00005554PyObject *
5555_PyUnicode_DecodeUnicodeInternal(const char *s,
5556 Py_ssize_t size,
5557 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005558{
5559 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005560 Py_ssize_t startinpos;
5561 Py_ssize_t endinpos;
5562 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005563 PyUnicodeObject *v;
5564 Py_UNICODE *p;
5565 const char *end;
5566 const char *reason;
5567 PyObject *errorHandler = NULL;
5568 PyObject *exc = NULL;
5569
Neal Norwitzd43069c2006-01-08 01:12:10 +00005570#ifdef Py_UNICODE_WIDE
5571 Py_UNICODE unimax = PyUnicode_GetMax();
5572#endif
5573
Thomas Wouters89f507f2006-12-13 04:49:30 +00005574 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005575 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5576 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5579 as string was created with the old API. */
5580 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005582 p = PyUnicode_AS_UNICODE(v);
5583 end = s + size;
5584
5585 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005586 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005587 /* We have to sanity check the raw data, otherwise doom looms for
5588 some malformed UCS-4 data. */
5589 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005590#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005591 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005592#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 end-s < Py_UNICODE_SIZE
5594 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005596 startinpos = s - starts;
5597 if (end-s < Py_UNICODE_SIZE) {
5598 endinpos = end-starts;
5599 reason = "truncated input";
5600 }
5601 else {
5602 endinpos = s - starts + Py_UNICODE_SIZE;
5603 reason = "illegal code point (> 0x10FFFF)";
5604 }
5605 outpos = p - PyUnicode_AS_UNICODE(v);
5606 if (unicode_decode_call_errorhandler(
5607 errors, &errorHandler,
5608 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005609 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005610 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611 goto onError;
5612 }
5613 }
5614 else {
5615 p++;
5616 s += Py_UNICODE_SIZE;
5617 }
5618 }
5619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005620 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005621 goto onError;
5622 Py_XDECREF(errorHandler);
5623 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 if (PyUnicode_READY(v) == -1) {
5625 Py_DECREF(v);
5626 return NULL;
5627 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005628 return (PyObject *)v;
5629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005631 Py_XDECREF(v);
5632 Py_XDECREF(errorHandler);
5633 Py_XDECREF(exc);
5634 return NULL;
5635}
5636
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637/* --- Latin-1 Codec ------------------------------------------------------ */
5638
Alexander Belopolsky40018472011-02-26 01:02:56 +00005639PyObject *
5640PyUnicode_DecodeLatin1(const char *s,
5641 Py_ssize_t size,
5642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 return PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646}
5647
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005649static void
5650make_encode_exception(PyObject **exceptionObject,
5651 const char *encoding,
5652 const Py_UNICODE *unicode, Py_ssize_t size,
5653 Py_ssize_t startpos, Py_ssize_t endpos,
5654 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 *exceptionObject = PyUnicodeEncodeError_Create(
5658 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 }
5660 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5662 goto onError;
5663 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5664 goto onError;
5665 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5666 goto onError;
5667 return;
5668 onError:
5669 Py_DECREF(*exceptionObject);
5670 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 }
5672}
5673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005675static void
5676raise_encode_exception(PyObject **exceptionObject,
5677 const char *encoding,
5678 const Py_UNICODE *unicode, Py_ssize_t size,
5679 Py_ssize_t startpos, Py_ssize_t endpos,
5680 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681{
5682 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686}
5687
5688/* error handling callback helper:
5689 build arguments, call the callback and check the arguments,
5690 put the result into newpos and return the replacement string, which
5691 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005692static PyObject *
5693unicode_encode_call_errorhandler(const char *errors,
5694 PyObject **errorHandler,
5695 const char *encoding, const char *reason,
5696 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5697 Py_ssize_t startpos, Py_ssize_t endpos,
5698 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005700 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701
5702 PyObject *restuple;
5703 PyObject *resunicode;
5704
5705 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 }
5710
5711 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715
5716 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005721 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 Py_DECREF(restuple);
5723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005725 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 &resunicode, newpos)) {
5727 Py_DECREF(restuple);
5728 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005730 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5731 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5732 Py_DECREF(restuple);
5733 return NULL;
5734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005737 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5739 Py_DECREF(restuple);
5740 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 Py_INCREF(resunicode);
5743 Py_DECREF(restuple);
5744 return resunicode;
5745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747static PyObject *
5748unicode_encode_ucs1(const Py_UNICODE *p,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752{
5753 /* output object */
5754 PyObject *res;
5755 /* pointers to the beginning and end+1 of input */
5756 const Py_UNICODE *startp = p;
5757 const Py_UNICODE *endp = p + size;
5758 /* pointer to the beginning of the unencodable characters */
5759 /* const Py_UNICODE *badp = NULL; */
5760 /* pointer into the output */
5761 char *str;
5762 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005763 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005764 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5765 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 PyObject *errorHandler = NULL;
5767 PyObject *exc = NULL;
5768 /* the following variable is used for caching string comparisons
5769 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5770 int known_errorHandler = -1;
5771
5772 /* allocate enough for a simple encoding without
5773 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005774 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005775 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005776 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005778 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005779 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 ressize = size;
5781
5782 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 /* can we encode this? */
5786 if (c<limit) {
5787 /* no overflow check, because we know that the space is enough */
5788 *str++ = (char)c;
5789 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 else {
5792 Py_ssize_t unicodepos = p-startp;
5793 Py_ssize_t requiredsize;
5794 PyObject *repunicode;
5795 Py_ssize_t repsize;
5796 Py_ssize_t newpos;
5797 Py_ssize_t respos;
5798 Py_UNICODE *uni2;
5799 /* startpos for collecting unencodable chars */
5800 const Py_UNICODE *collstart = p;
5801 const Py_UNICODE *collend = p;
5802 /* find all unecodable characters */
5803 while ((collend < endp) && ((*collend)>=limit))
5804 ++collend;
5805 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5806 if (known_errorHandler==-1) {
5807 if ((errors==NULL) || (!strcmp(errors, "strict")))
5808 known_errorHandler = 1;
5809 else if (!strcmp(errors, "replace"))
5810 known_errorHandler = 2;
5811 else if (!strcmp(errors, "ignore"))
5812 known_errorHandler = 3;
5813 else if (!strcmp(errors, "xmlcharrefreplace"))
5814 known_errorHandler = 4;
5815 else
5816 known_errorHandler = 0;
5817 }
5818 switch (known_errorHandler) {
5819 case 1: /* strict */
5820 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5821 goto onError;
5822 case 2: /* replace */
5823 while (collstart++<collend)
5824 *str++ = '?'; /* fall through */
5825 case 3: /* ignore */
5826 p = collend;
5827 break;
5828 case 4: /* xmlcharrefreplace */
5829 respos = str - PyBytes_AS_STRING(res);
5830 /* determine replacement size (temporarily (mis)uses p) */
5831 for (p = collstart, repsize = 0; p < collend; ++p) {
5832 if (*p<10)
5833 repsize += 2+1+1;
5834 else if (*p<100)
5835 repsize += 2+2+1;
5836 else if (*p<1000)
5837 repsize += 2+3+1;
5838 else if (*p<10000)
5839 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005840#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 else
5842 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005843#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 else if (*p<100000)
5845 repsize += 2+5+1;
5846 else if (*p<1000000)
5847 repsize += 2+6+1;
5848 else
5849 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005850#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 }
5852 requiredsize = respos+repsize+(endp-collend);
5853 if (requiredsize > ressize) {
5854 if (requiredsize<2*ressize)
5855 requiredsize = 2*ressize;
5856 if (_PyBytes_Resize(&res, requiredsize))
5857 goto onError;
5858 str = PyBytes_AS_STRING(res) + respos;
5859 ressize = requiredsize;
5860 }
5861 /* generate replacement (temporarily (mis)uses p) */
5862 for (p = collstart; p < collend; ++p) {
5863 str += sprintf(str, "&#%d;", (int)*p);
5864 }
5865 p = collend;
5866 break;
5867 default:
5868 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5869 encoding, reason, startp, size, &exc,
5870 collstart-startp, collend-startp, &newpos);
5871 if (repunicode == NULL)
5872 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005873 if (PyBytes_Check(repunicode)) {
5874 /* Directly copy bytes result to output. */
5875 repsize = PyBytes_Size(repunicode);
5876 if (repsize > 1) {
5877 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005878 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005879 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5880 Py_DECREF(repunicode);
5881 goto onError;
5882 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005883 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005884 ressize += repsize-1;
5885 }
5886 memcpy(str, PyBytes_AsString(repunicode), repsize);
5887 str += repsize;
5888 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005889 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005890 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 /* need more space? (at least enough for what we
5893 have+the replacement+the rest of the string, so
5894 we won't have to check space for encodable characters) */
5895 respos = str - PyBytes_AS_STRING(res);
5896 repsize = PyUnicode_GET_SIZE(repunicode);
5897 requiredsize = respos+repsize+(endp-collend);
5898 if (requiredsize > ressize) {
5899 if (requiredsize<2*ressize)
5900 requiredsize = 2*ressize;
5901 if (_PyBytes_Resize(&res, requiredsize)) {
5902 Py_DECREF(repunicode);
5903 goto onError;
5904 }
5905 str = PyBytes_AS_STRING(res) + respos;
5906 ressize = requiredsize;
5907 }
5908 /* check if there is anything unencodable in the replacement
5909 and copy it to the output */
5910 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5911 c = *uni2;
5912 if (c >= limit) {
5913 raise_encode_exception(&exc, encoding, startp, size,
5914 unicodepos, unicodepos+1, reason);
5915 Py_DECREF(repunicode);
5916 goto onError;
5917 }
5918 *str = (char)c;
5919 }
5920 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005921 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005922 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005923 }
5924 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 /* Resize if we allocated to much */
5926 size = str - PyBytes_AS_STRING(res);
5927 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005928 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005929 if (_PyBytes_Resize(&res, size) < 0)
5930 goto onError;
5931 }
5932
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 Py_XDECREF(errorHandler);
5934 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005935 return res;
5936
5937 onError:
5938 Py_XDECREF(res);
5939 Py_XDECREF(errorHandler);
5940 Py_XDECREF(exc);
5941 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942}
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
5945PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5946 Py_ssize_t size,
5947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950}
5951
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954{
5955 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 PyErr_BadArgument();
5957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959 if (PyUnicode_READY(unicode) == -1)
5960 return NULL;
5961 /* Fast path: if it is a one-byte string, construct
5962 bytes object directly. */
5963 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5964 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5965 PyUnicode_GET_LENGTH(unicode));
5966 /* Non-Latin-1 characters present. Defer to above function to
5967 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 errors);
5971}
5972
5973PyObject*
5974PyUnicode_AsLatin1String(PyObject *unicode)
5975{
5976 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977}
5978
5979/* --- 7-bit ASCII Codec -------------------------------------------------- */
5980
Alexander Belopolsky40018472011-02-26 01:02:56 +00005981PyObject *
5982PyUnicode_DecodeASCII(const char *s,
5983 Py_ssize_t size,
5984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005986 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 PyUnicodeObject *v;
5988 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005989 Py_ssize_t startinpos;
5990 Py_ssize_t endinpos;
5991 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005992 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005993 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 PyObject *errorHandler = NULL;
5995 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005999 if (size == 1 && *(unsigned char*)s < 128)
6000 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6001
6002 /* Fast path. Assume the input actually *is* ASCII, and allocate
6003 a single-block Unicode object with that assumption. If there is
6004 an error, drop the object and start over. */
6005 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6006 if (v == NULL)
6007 goto onError;
6008 d = PyUnicode_1BYTE_DATA(v);
6009 for (i = 0; i < size; i++) {
6010 unsigned char ch = ((unsigned char*)s)[i];
6011 if (ch < 128)
6012 d[i] = ch;
6013 else
6014 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006016 if (i == size)
6017 return (PyObject*)v;
6018 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 v = _PyUnicode_New(size);
6021 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 e = s + size;
6027 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 register unsigned char c = (unsigned char)*s;
6029 if (c < 128) {
6030 *p++ = c;
6031 ++s;
6032 }
6033 else {
6034 startinpos = s-starts;
6035 endinpos = startinpos + 1;
6036 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6037 if (unicode_decode_call_errorhandler(
6038 errors, &errorHandler,
6039 "ascii", "ordinal not in range(128)",
6040 &starts, &e, &startinpos, &endinpos, &exc, &s,
6041 &v, &outpos, &p))
6042 goto onError;
6043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006045 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6047 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 Py_XDECREF(errorHandler);
6049 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 if (PyUnicode_READY(v) == -1) {
6051 Py_DECREF(v);
6052 return NULL;
6053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 Py_XDECREF(errorHandler);
6059 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 return NULL;
6061}
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
6064PyUnicode_EncodeASCII(const Py_UNICODE *p,
6065 Py_ssize_t size,
6066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069}
6070
Alexander Belopolsky40018472011-02-26 01:02:56 +00006071PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006072_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
6074 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 PyErr_BadArgument();
6076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 if (PyUnicode_READY(unicode) == -1)
6079 return NULL;
6080 /* Fast path: if it is an ASCII-only string, construct bytes object
6081 directly. Else defer to above function to raise the exception. */
6082 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6083 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6084 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006087 errors);
6088}
6089
6090PyObject *
6091PyUnicode_AsASCIIString(PyObject *unicode)
6092{
6093 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094}
6095
Victor Stinner99b95382011-07-04 14:23:54 +02006096#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006097
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006098/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006099
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006100#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006101#define NEED_RETRY
6102#endif
6103
6104/* XXX This code is limited to "true" double-byte encodings, as
6105 a) it assumes an incomplete character consists of a single byte, and
6106 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006108
/* Return 1 if the byte at s[offset] is a DBCS lead byte that really starts
   a character, 0 otherwise.

   IsDBCSLeadByte() alone only inspects the byte value, so the byte is
   additionally checked against the character that CharPrev() reports as
   preceding it. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
6120
/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 *
 * v      - in/out: if *v is NULL a new unicode object is created, otherwise
 *          the existing object is extended and the decoded text appended.
 * s      - MBCS input bytes.
 * size   - number of input bytes; must be >= 0 (asserted).
 * final  - when zero, a trailing DBCS lead byte is left unconsumed.
 * errors - NULL or "strict" (invalid input fails) or "ignore"; any other
 *          value raises ValueError.
 *
 * On failure an exception has been set; *v is not released here.
 */
static int
decode_mbcs(PyUnicodeObject **v,
            const char *s, /* MBCS string */
            int size, /* sizeof MBCS string */
            int final,
            const char *errors)
{
    Py_UNICODE *p;
    Py_ssize_t n;
    DWORD usize;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0)
        flags = MB_ERR_INVALID_CHARS;
    else if (strcmp(errors, "ignore")==0)
        flags = 0;
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result.  The call is made with a NULL
       output buffer and a zero output length; a zero return is treated
       as failure. */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
        if (usize==0)
            goto mbcs_decode_error;
    } else
        usize = 0;

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend unicode object; n is the old length, i.e. the offset at
           which the newly decoded characters are written. */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (usize > 0) {
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
            goto mbcs_decode_error;
        }
    }
    /* size may have been decremented above, so this is the number of
       input bytes actually consumed. */
    return size;

mbcs_decode_error:
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
       we raise a UnicodeDecodeError - else it is a 'generic'
       windows error
    */
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally, we should get reason from FormatMessage - this
           is the Windows 2000 English version of the message
        */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        /* The exact failing position is not known here, so the exception
           is built with start and end both 0. */
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            /* presumably raises 'exc' as the current error -- confirm
               against PyCodec_StrictErrors */
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    } else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
}
6208
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209PyObject *
6210PyUnicode_DecodeMBCSStateful(const char *s,
6211 Py_ssize_t size,
6212 const char *errors,
6213 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006214{
6215 PyUnicodeObject *v = NULL;
6216 int done;
6217
6218 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006220
6221#ifdef NEED_RETRY
6222 retry:
6223 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006224 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006225 else
6226#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006227 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006228
6229 if (done < 0) {
6230 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006232 }
6233
6234 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237#ifdef NEED_RETRY
6238 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 s += done;
6240 size -= done;
6241 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006242 }
6243#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006244 if (PyUnicode_READY(v) == -1) {
6245 Py_DECREF(v);
6246 return NULL;
6247 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006248 return (PyObject *)v;
6249}
6250
Alexander Belopolsky40018472011-02-26 01:02:56 +00006251PyObject *
6252PyUnicode_DecodeMBCS(const char *s,
6253 Py_ssize_t size,
6254 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006255{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006256 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6257}
6258
/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 *
 * repr   - in/out: if *repr is NULL a new bytes object is created,
 *          otherwise the existing one is extended and the encoded bytes
 *          are appended.
 * p      - Py_UNICODE input buffer.
 * size   - number of code units in p; must be >= 0 (asserted).
 * errors - NULL or "strict" (use of a default character is an error) or
 *          "replace"; any other value raises ValueError.
 *
 * On failure an exception has been set; *repr is not released here.
 */
static int
encode_mbcs(PyObject **repr,
            const Py_UNICODE *p, /* unicode */
            int size, /* size of unicode */
            const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        flags = WC_NO_BEST_FIT_CHARS;
        /* strict mode: ask the API to report default-character use */
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        /* replace mode: default-character use is not tracked */
        pusedDefaultChar = NULL;
    } else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend string object; n is the old length, i.e. the offset at
           which the newly encoded bytes are written. */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    }
    return 0;

mbcs_encode_error:
    /* The offending position is not known here, so the exception is
       raised with start and end both 0. */
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
}
6339
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340PyObject *
6341PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6342 Py_ssize_t size,
6343 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006344{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006345 PyObject *repr = NULL;
6346 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006347
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006351 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006352 else
6353#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006354 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006355
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 Py_XDECREF(repr);
6358 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006359 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006360
6361#ifdef NEED_RETRY
6362 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 p += INT_MAX;
6364 size -= INT_MAX;
6365 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006366 }
6367#endif
6368
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006369 return repr;
6370}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006371
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372PyObject *
6373PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006374{
6375 if (!PyUnicode_Check(unicode)) {
6376 PyErr_BadArgument();
6377 return NULL;
6378 }
6379 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 PyUnicode_GET_SIZE(unicode),
6381 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006382}
6383
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384#undef NEED_RETRY
6385
Victor Stinner99b95382011-07-04 14:23:54 +02006386#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388/* --- Character Mapping Codec -------------------------------------------- */
6389
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390PyObject *
6391PyUnicode_DecodeCharmap(const char *s,
6392 Py_ssize_t size,
6393 PyObject *mapping,
6394 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006397 Py_ssize_t startinpos;
6398 Py_ssize_t endinpos;
6399 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 PyUnicodeObject *v;
6402 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 PyObject *errorHandler = NULL;
6405 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006406 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 /* Default to Latin-1 */
6410 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
6413 v = _PyUnicode_New(size);
6414 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006420 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 mapstring = PyUnicode_AS_UNICODE(mapping);
6422 maplen = PyUnicode_GET_SIZE(mapping);
6423 while (s < e) {
6424 unsigned char ch = *s;
6425 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 if (ch < maplen)
6428 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (x == 0xfffe) {
6431 /* undefined mapping */
6432 outpos = p-PyUnicode_AS_UNICODE(v);
6433 startinpos = s-starts;
6434 endinpos = startinpos+1;
6435 if (unicode_decode_call_errorhandler(
6436 errors, &errorHandler,
6437 "charmap", "character maps to <undefined>",
6438 &starts, &e, &startinpos, &endinpos, &exc, &s,
6439 &v, &outpos, &p)) {
6440 goto onError;
6441 }
6442 continue;
6443 }
6444 *p++ = x;
6445 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006446 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006447 }
6448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 while (s < e) {
6450 unsigned char ch = *s;
6451 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6454 w = PyLong_FromLong((long)ch);
6455 if (w == NULL)
6456 goto onError;
6457 x = PyObject_GetItem(mapping, w);
6458 Py_DECREF(w);
6459 if (x == NULL) {
6460 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6461 /* No mapping found means: mapping is undefined. */
6462 PyErr_Clear();
6463 x = Py_None;
6464 Py_INCREF(x);
6465 } else
6466 goto onError;
6467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* Apply mapping */
6470 if (PyLong_Check(x)) {
6471 long value = PyLong_AS_LONG(x);
6472 if (value < 0 || value > 65535) {
6473 PyErr_SetString(PyExc_TypeError,
6474 "character mapping must be in range(65536)");
6475 Py_DECREF(x);
6476 goto onError;
6477 }
6478 *p++ = (Py_UNICODE)value;
6479 }
6480 else if (x == Py_None) {
6481 /* undefined mapping */
6482 outpos = p-PyUnicode_AS_UNICODE(v);
6483 startinpos = s-starts;
6484 endinpos = startinpos+1;
6485 if (unicode_decode_call_errorhandler(
6486 errors, &errorHandler,
6487 "charmap", "character maps to <undefined>",
6488 &starts, &e, &startinpos, &endinpos, &exc, &s,
6489 &v, &outpos, &p)) {
6490 Py_DECREF(x);
6491 goto onError;
6492 }
6493 Py_DECREF(x);
6494 continue;
6495 }
6496 else if (PyUnicode_Check(x)) {
6497 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 if (targetsize == 1)
6500 /* 1-1 mapping */
6501 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 else if (targetsize > 1) {
6504 /* 1-n mapping */
6505 if (targetsize > extrachars) {
6506 /* resize first */
6507 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6508 Py_ssize_t needed = (targetsize - extrachars) + \
6509 (targetsize << 2);
6510 extrachars += needed;
6511 /* XXX overflow detection missing */
6512 if (_PyUnicode_Resize(&v,
6513 PyUnicode_GET_SIZE(v) + needed) < 0) {
6514 Py_DECREF(x);
6515 goto onError;
6516 }
6517 p = PyUnicode_AS_UNICODE(v) + oldpos;
6518 }
6519 Py_UNICODE_COPY(p,
6520 PyUnicode_AS_UNICODE(x),
6521 targetsize);
6522 p += targetsize;
6523 extrachars -= targetsize;
6524 }
6525 /* 1-0 mapping: skip the character */
6526 }
6527 else {
6528 /* wrong return value */
6529 PyErr_SetString(PyExc_TypeError,
6530 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006531 Py_DECREF(x);
6532 goto onError;
6533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 Py_DECREF(x);
6535 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
6538 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6540 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 Py_XDECREF(errorHandler);
6542 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006543 if (PyUnicode_READY(v) == -1) {
6544 Py_DECREF(v);
6545 return NULL;
6546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006548
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 Py_XDECREF(v);
6553 return NULL;
6554}
6555
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006556/* Charmap encoding: the lookup table */
6557
/* Three-level lookup trie mapping a BMP code point to a byte value.
   Instances are allocated with extra trailing space so that level23
   extends past its declared single element. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11); holds a level-2 block number, or 0xFF when
       no character in that range is mapped. */
    unsigned char level1[32];
    /* Number of 16-entry level-2 blocks and 128-entry level-3 blocks
       stored in level23. */
    int count2, count3;
    /* 16*count2 bytes of level-2 data followed by 128*count3 bytes of
       level-3 data.  Declared with one element; the real length is
       fixed at allocation time. */
    unsigned char level23[1];
};
6564
6565static PyObject*
6566encoding_map_size(PyObject *obj, PyObject* args)
6567{
6568 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006569 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006571}
6572
6573static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006574 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 PyDoc_STR("Return the size (in bytes) of this object") },
6576 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006577};
6578
6579static void
6580encoding_map_dealloc(PyObject* o)
6581{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006582 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006583}
6584
/* Type object for the encoding trie built by PyUnicode_BuildEncodingMap().
   Only tp_name, tp_basicsize, tp_dealloc, tp_flags and tp_methods are
   filled in; every other slot is left zero.  tp_new is zero as well;
   instances are created in C via PyObject_MALLOC + PyObject_Init. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
6628
6629PyObject*
6630PyUnicode_BuildEncodingMap(PyObject* string)
6631{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006632 PyObject *result;
6633 struct encoding_map *mresult;
6634 int i;
6635 int need_dict = 0;
6636 unsigned char level1[32];
6637 unsigned char level2[512];
6638 unsigned char *mlevel1, *mlevel2, *mlevel3;
6639 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640 int kind;
6641 void *data;
6642 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645 PyErr_BadArgument();
6646 return NULL;
6647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006648 kind = PyUnicode_KIND(string);
6649 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006650 memset(level1, 0xFF, sizeof level1);
6651 memset(level2, 0xFF, sizeof level2);
6652
6653 /* If there isn't a one-to-one mapping of NULL to \0,
6654 or if there are non-BMP characters, we need to use
6655 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006657 need_dict = 1;
6658 for (i = 1; i < 256; i++) {
6659 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 ch = PyUnicode_READ(kind, data, i);
6661 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006662 need_dict = 1;
6663 break;
6664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006666 /* unmapped character */
6667 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 l1 = ch >> 11;
6669 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006670 if (level1[l1] == 0xFF)
6671 level1[l1] = count2++;
6672 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006673 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006674 }
6675
6676 if (count2 >= 0xFF || count3 >= 0xFF)
6677 need_dict = 1;
6678
6679 if (need_dict) {
6680 PyObject *result = PyDict_New();
6681 PyObject *key, *value;
6682 if (!result)
6683 return NULL;
6684 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006685 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006686 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006687 if (!key || !value)
6688 goto failed1;
6689 if (PyDict_SetItem(result, key, value) == -1)
6690 goto failed1;
6691 Py_DECREF(key);
6692 Py_DECREF(value);
6693 }
6694 return result;
6695 failed1:
6696 Py_XDECREF(key);
6697 Py_XDECREF(value);
6698 Py_DECREF(result);
6699 return NULL;
6700 }
6701
6702 /* Create a three-level trie */
6703 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6704 16*count2 + 128*count3 - 1);
6705 if (!result)
6706 return PyErr_NoMemory();
6707 PyObject_Init(result, &EncodingMapType);
6708 mresult = (struct encoding_map*)result;
6709 mresult->count2 = count2;
6710 mresult->count3 = count3;
6711 mlevel1 = mresult->level1;
6712 mlevel2 = mresult->level23;
6713 mlevel3 = mresult->level23 + 16*count2;
6714 memcpy(mlevel1, level1, 32);
6715 memset(mlevel2, 0xFF, 16*count2);
6716 memset(mlevel3, 0, 128*count3);
6717 count3 = 0;
6718 for (i = 1; i < 256; i++) {
6719 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006721 /* unmapped character */
6722 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723 o1 = PyUnicode_READ(kind, data, i)>>11;
6724 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725 i2 = 16*mlevel1[o1] + o2;
6726 if (mlevel2[i2] == 0xFF)
6727 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006728 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729 i3 = 128*mlevel2[i2] + o3;
6730 mlevel3[i3] = i;
6731 }
6732 return result;
6733}
6734
6735static int
6736encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6737{
6738 struct encoding_map *map = (struct encoding_map*)mapping;
6739 int l1 = c>>11;
6740 int l2 = (c>>7) & 0xF;
6741 int l3 = c & 0x7F;
6742 int i;
6743
6744#ifdef Py_UNICODE_WIDE
6745 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006747 }
6748#endif
6749 if (c == 0)
6750 return 0;
6751 /* level 1*/
6752 i = map->level1[l1];
6753 if (i == 0xFF) {
6754 return -1;
6755 }
6756 /* level 2*/
6757 i = map->level23[16*i+l2];
6758 if (i == 0xFF) {
6759 return -1;
6760 }
6761 /* level 3 */
6762 i = map->level23[16*map->count2 + 128*i + l3];
6763 if (i == 0) {
6764 return -1;
6765 }
6766 return i;
6767}
6768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769/* Lookup the character ch in the mapping. If the character
6770 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006771 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772static PyObject *
6773charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774{
Christian Heimes217cfd12007-12-02 14:31:20 +00006775 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 PyObject *x;
6777
6778 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 x = PyObject_GetItem(mapping, w);
6781 Py_DECREF(w);
6782 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6784 /* No mapping found means: mapping is undefined. */
6785 PyErr_Clear();
6786 x = Py_None;
6787 Py_INCREF(x);
6788 return x;
6789 } else
6790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006792 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006794 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 long value = PyLong_AS_LONG(x);
6796 if (value < 0 || value > 255) {
6797 PyErr_SetString(PyExc_TypeError,
6798 "character mapping must be in range(256)");
6799 Py_DECREF(x);
6800 return NULL;
6801 }
6802 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006804 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* wrong return value */
6808 PyErr_Format(PyExc_TypeError,
6809 "character mapping must return integer, bytes or None, not %.400s",
6810 x->ob_type->tp_name);
6811 Py_DECREF(x);
6812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
6814}
6815
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006816static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006817charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006818{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6820 /* exponentially overallocate to minimize reallocations */
6821 if (requiredsize < 2*outsize)
6822 requiredsize = 2*outsize;
6823 if (_PyBytes_Resize(outobj, requiredsize))
6824 return -1;
6825 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006826}
6827
/* Outcome of encoding a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* a lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006832 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833 space is available. Return a new reference to the object that
6834 was put in the output buffer, or Py_None, if the mapping was undefined
6835 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006836 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006837static charmapencode_result
6838charmapencode_output(Py_UNICODE c, PyObject *mapping,
6839 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006841 PyObject *rep;
6842 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006843 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844
Christian Heimes90aa7642007-12-19 02:45:37 +00006845 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006846 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006848 if (res == -1)
6849 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 if (outsize<requiredsize)
6851 if (charmapencode_resize(outobj, outpos, requiredsize))
6852 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006853 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 outstart[(*outpos)++] = (char)res;
6855 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006856 }
6857
6858 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006861 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 Py_DECREF(rep);
6863 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 if (PyLong_Check(rep)) {
6866 Py_ssize_t requiredsize = *outpos+1;
6867 if (outsize<requiredsize)
6868 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6869 Py_DECREF(rep);
6870 return enc_EXCEPTION;
6871 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006872 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 else {
6876 const char *repchars = PyBytes_AS_STRING(rep);
6877 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6878 Py_ssize_t requiredsize = *outpos+repsize;
6879 if (outsize<requiredsize)
6880 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6881 Py_DECREF(rep);
6882 return enc_EXCEPTION;
6883 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006884 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 memcpy(outstart + *outpos, repchars, repsize);
6886 *outpos += repsize;
6887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006889 Py_DECREF(rep);
6890 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891}
6892
6893/* handle an error in PyUnicode_EncodeCharmap
6894 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006895static int
6896charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006897 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006899 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006900 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901{
6902 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006903 Py_ssize_t repsize;
6904 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 Py_UNICODE *uni2;
6906 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006907 Py_ssize_t collstartpos = *inpos;
6908 Py_ssize_t collendpos = *inpos+1;
6909 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 char *encoding = "charmap";
6911 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006912 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 /* find all unencodable characters */
6915 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006917 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 int res = encoding_map_lookup(p[collendpos], mapping);
6919 if (res != -1)
6920 break;
6921 ++collendpos;
6922 continue;
6923 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006924
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 rep = charmapencode_lookup(p[collendpos], mapping);
6926 if (rep==NULL)
6927 return -1;
6928 else if (rep!=Py_None) {
6929 Py_DECREF(rep);
6930 break;
6931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 }
6935 /* cache callback name lookup
6936 * (if not done yet, i.e. it's the first error) */
6937 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 if ((errors==NULL) || (!strcmp(errors, "strict")))
6939 *known_errorHandler = 1;
6940 else if (!strcmp(errors, "replace"))
6941 *known_errorHandler = 2;
6942 else if (!strcmp(errors, "ignore"))
6943 *known_errorHandler = 3;
6944 else if (!strcmp(errors, "xmlcharrefreplace"))
6945 *known_errorHandler = 4;
6946 else
6947 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006948 }
6949 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006950 case 1: /* strict */
6951 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6952 return -1;
6953 case 2: /* replace */
6954 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 x = charmapencode_output('?', mapping, res, respos);
6956 if (x==enc_EXCEPTION) {
6957 return -1;
6958 }
6959 else if (x==enc_FAILED) {
6960 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6961 return -1;
6962 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006963 }
6964 /* fall through */
6965 case 3: /* ignore */
6966 *inpos = collendpos;
6967 break;
6968 case 4: /* xmlcharrefreplace */
6969 /* generate replacement (temporarily (mis)uses p) */
6970 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 char buffer[2+29+1+1];
6972 char *cp;
6973 sprintf(buffer, "&#%d;", (int)p[collpos]);
6974 for (cp = buffer; *cp; ++cp) {
6975 x = charmapencode_output(*cp, mapping, res, respos);
6976 if (x==enc_EXCEPTION)
6977 return -1;
6978 else if (x==enc_FAILED) {
6979 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6980 return -1;
6981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006982 }
6983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 *inpos = collendpos;
6985 break;
6986 default:
6987 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 encoding, reason, p, size, exceptionObject,
6989 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006992 if (PyBytes_Check(repunicode)) {
6993 /* Directly copy bytes result to output. */
6994 Py_ssize_t outsize = PyBytes_Size(*res);
6995 Py_ssize_t requiredsize;
6996 repsize = PyBytes_Size(repunicode);
6997 requiredsize = *respos + repsize;
6998 if (requiredsize > outsize)
6999 /* Make room for all additional bytes. */
7000 if (charmapencode_resize(res, respos, requiredsize)) {
7001 Py_DECREF(repunicode);
7002 return -1;
7003 }
7004 memcpy(PyBytes_AsString(*res) + *respos,
7005 PyBytes_AsString(repunicode), repsize);
7006 *respos += repsize;
7007 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007008 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007009 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007011 /* generate replacement */
7012 repsize = PyUnicode_GET_SIZE(repunicode);
7013 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 x = charmapencode_output(*uni2, mapping, res, respos);
7015 if (x==enc_EXCEPTION) {
7016 return -1;
7017 }
7018 else if (x==enc_FAILED) {
7019 Py_DECREF(repunicode);
7020 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7021 return -1;
7022 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007023 }
7024 *inpos = newpos;
7025 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026 }
7027 return 0;
7028}
7029
Alexander Belopolsky40018472011-02-26 01:02:56 +00007030PyObject *
7031PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7032 Py_ssize_t size,
7033 PyObject *mapping,
7034 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007036 /* output object */
7037 PyObject *res = NULL;
7038 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007040 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007041 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 PyObject *errorHandler = NULL;
7043 PyObject *exc = NULL;
7044 /* the following variable is used for caching string comparisons
7045 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7046 * 3=ignore, 4=xmlcharrefreplace */
7047 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049 /* Default to Latin-1 */
7050 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 /* allocate enough for a simple encoding without
7054 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007055 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 if (res == NULL)
7057 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007058 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 /* try to encode it */
7063 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7064 if (x==enc_EXCEPTION) /* error */
7065 goto onError;
7066 if (x==enc_FAILED) { /* unencodable character */
7067 if (charmap_encoding_error(p, size, &inpos, mapping,
7068 &exc,
7069 &known_errorHandler, &errorHandler, errors,
7070 &res, &respos)) {
7071 goto onError;
7072 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007073 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 else
7075 /* done with this character => adjust input position */
7076 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007080 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007081 if (_PyBytes_Resize(&res, respos) < 0)
7082 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 Py_XDECREF(exc);
7085 Py_XDECREF(errorHandler);
7086 return res;
7087
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007089 Py_XDECREF(res);
7090 Py_XDECREF(exc);
7091 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 return NULL;
7093}
7094
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095PyObject *
7096PyUnicode_AsCharmapString(PyObject *unicode,
7097 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
7099 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 PyErr_BadArgument();
7101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 }
7103 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 PyUnicode_GET_SIZE(unicode),
7105 mapping,
7106 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107}
7108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110static void
7111make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113 Py_ssize_t startpos, Py_ssize_t endpos,
7114 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007117 *exceptionObject = _PyUnicodeTranslateError_Create(
7118 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 }
7120 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7122 goto onError;
7123 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7124 goto onError;
7125 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7126 goto onError;
7127 return;
7128 onError:
7129 Py_DECREF(*exceptionObject);
7130 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
7132}
7133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135static void
7136raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007137 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138 Py_ssize_t startpos, Py_ssize_t endpos,
7139 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140{
7141 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145}
7146
7147/* error handling callback helper:
7148 build arguments, call the callback and check the arguments,
7149 put the result into newpos and return the replacement string, which
7150 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151static PyObject *
7152unicode_translate_call_errorhandler(const char *errors,
7153 PyObject **errorHandler,
7154 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007155 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156 Py_ssize_t startpos, Py_ssize_t endpos,
7157 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007158{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007159 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007160
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007161 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162 PyObject *restuple;
7163 PyObject *resunicode;
7164
7165 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007167 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 }
7170
7171 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175
7176 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007181 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 Py_DECREF(restuple);
7183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 }
7185 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 &resunicode, &i_newpos)) {
7187 Py_DECREF(restuple);
7188 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007191 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 else
7193 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7196 Py_DECREF(restuple);
7197 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007198 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 Py_INCREF(resunicode);
7200 Py_DECREF(restuple);
7201 return resunicode;
7202}
7203
7204/* Lookup the character ch in the mapping and put the result in result,
7205 which must be decrefed by the caller.
7206 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007207static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007208charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209{
Christian Heimes217cfd12007-12-02 14:31:20 +00007210 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211 PyObject *x;
7212
7213 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 x = PyObject_GetItem(mapping, w);
7216 Py_DECREF(w);
7217 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7219 /* No mapping found means: use 1:1 mapping. */
7220 PyErr_Clear();
7221 *result = NULL;
7222 return 0;
7223 } else
7224 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225 }
7226 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 *result = x;
7228 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007230 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 long value = PyLong_AS_LONG(x);
7232 long max = PyUnicode_GetMax();
7233 if (value < 0 || value > max) {
7234 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007235 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 Py_DECREF(x);
7237 return -1;
7238 }
7239 *result = x;
7240 return 0;
7241 }
7242 else if (PyUnicode_Check(x)) {
7243 *result = x;
7244 return 0;
7245 }
7246 else {
7247 /* wrong return value */
7248 PyErr_SetString(PyExc_TypeError,
7249 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007250 Py_DECREF(x);
7251 return -1;
7252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253}
7254/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 if not reallocate and adjust various state variables.
7256 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007257static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007258charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007262 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 /* exponentially overallocate to minimize reallocations */
7264 if (requiredsize < 2 * oldsize)
7265 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007266 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7267 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007270 }
7271 return 0;
7272}
7273/* lookup the character, put the result in the output string and adjust
7274 various state variables. Return a new reference to the object that
7275 was put in the output buffer in *result, or Py_None, if the mapping was
7276 undefined (in which case no character was written).
7277 The called must decref result.
7278 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007279static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7281 PyObject *mapping, Py_UCS4 **output,
7282 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007283 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7286 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291 }
7292 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007294 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 }
7298 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007299 Py_ssize_t repsize;
7300 if (PyUnicode_READY(*res) == -1)
7301 return -1;
7302 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 if (repsize==1) {
7304 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007305 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 }
7307 else if (repsize!=0) {
7308 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007309 Py_ssize_t requiredsize = *opos +
7310 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312 Py_ssize_t i;
7313 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 for(i = 0; i < repsize; i++)
7316 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 }
7319 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 return 0;
7322}
7323
Alexander Belopolsky40018472011-02-26 01:02:56 +00007324PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325_PyUnicode_TranslateCharmap(PyObject *input,
7326 PyObject *mapping,
7327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329 /* input object */
7330 char *idata;
7331 Py_ssize_t size, i;
7332 int kind;
7333 /* output buffer */
7334 Py_UCS4 *output = NULL;
7335 Py_ssize_t osize;
7336 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007337 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007338 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 char *reason = "character maps to <undefined>";
7340 PyObject *errorHandler = NULL;
7341 PyObject *exc = NULL;
7342 /* the following variable is used for caching string comparisons
7343 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7344 * 3=ignore, 4=xmlcharrefreplace */
7345 int known_errorHandler = -1;
7346
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 PyErr_BadArgument();
7349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007352 if (PyUnicode_READY(input) == -1)
7353 return NULL;
7354 idata = (char*)PyUnicode_DATA(input);
7355 kind = PyUnicode_KIND(input);
7356 size = PyUnicode_GET_LENGTH(input);
7357 i = 0;
7358
7359 if (size == 0) {
7360 Py_INCREF(input);
7361 return input;
7362 }
7363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 /* allocate enough for a simple 1:1 translation without
7365 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007366 osize = size;
7367 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7368 opos = 0;
7369 if (output == NULL) {
7370 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 /* try to encode it */
7376 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 if (charmaptranslate_output(input, i, mapping,
7378 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 Py_XDECREF(x);
7380 goto onError;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 else { /* untranslatable character */
7386 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7387 Py_ssize_t repsize;
7388 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007389 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 Py_ssize_t collstart = i;
7392 Py_ssize_t collend = i+1;
7393 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 while (collend < size) {
7397 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 goto onError;
7399 Py_XDECREF(x);
7400 if (x!=Py_None)
7401 break;
7402 ++collend;
7403 }
7404 /* cache callback name lookup
7405 * (if not done yet, i.e. it's the first error) */
7406 if (known_errorHandler==-1) {
7407 if ((errors==NULL) || (!strcmp(errors, "strict")))
7408 known_errorHandler = 1;
7409 else if (!strcmp(errors, "replace"))
7410 known_errorHandler = 2;
7411 else if (!strcmp(errors, "ignore"))
7412 known_errorHandler = 3;
7413 else if (!strcmp(errors, "xmlcharrefreplace"))
7414 known_errorHandler = 4;
7415 else
7416 known_errorHandler = 0;
7417 }
7418 switch (known_errorHandler) {
7419 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 raise_translate_exception(&exc, input, collstart,
7421 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007422 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 case 2: /* replace */
7424 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 for (coll = collstart; coll<collend; coll++)
7426 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 /* fall through */
7428 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 break;
7431 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 /* generate replacement (temporarily (mis)uses i) */
7433 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 char buffer[2+29+1+1];
7435 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7437 if (charmaptranslate_makespace(&output, &osize,
7438 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 goto onError;
7440 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 break;
7445 default:
7446 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 reason, input, &exc,
7448 collstart, collend, &newpos);
7449 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 goto onError;
7451 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 repsize = PyUnicode_GET_LENGTH(repunicode);
7453 if (charmaptranslate_makespace(&output, &osize,
7454 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_DECREF(repunicode);
7456 goto onError;
7457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 for (uni2 = 0; repsize-->0; ++uni2)
7459 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7460 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007463 }
7464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7466 if (!res)
7467 goto onError;
7468 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007469 Py_XDECREF(exc);
7470 Py_XDECREF(errorHandler);
7471 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007475 Py_XDECREF(exc);
7476 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 return NULL;
7478}
7479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480/* Deprecated. Use PyUnicode_Translate instead. */
7481PyObject *
7482PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 PyObject *mapping,
7485 const char *errors)
7486{
7487 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7488 if (!unicode)
7489 return NULL;
7490 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7491}
7492
Alexander Belopolsky40018472011-02-26 01:02:56 +00007493PyObject *
7494PyUnicode_Translate(PyObject *str,
7495 PyObject *mapping,
7496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497{
7498 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 str = PyUnicode_FromObject(str);
7501 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007503 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 Py_DECREF(str);
7505 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007506
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 Py_XDECREF(str);
7509 return NULL;
7510}
Tim Petersced69f82003-09-16 20:30:58 +00007511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512static Py_UCS4
7513fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7514{
7515 /* No need to call PyUnicode_READY(self) because this function is only
7516 called as a callback from fixup() which does it already. */
7517 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7518 const int kind = PyUnicode_KIND(self);
7519 void *data = PyUnicode_DATA(self);
7520 Py_UCS4 maxchar = 0, ch, fixed;
7521 Py_ssize_t i;
7522
7523 for (i = 0; i < len; ++i) {
7524 ch = PyUnicode_READ(kind, data, i);
7525 fixed = 0;
7526 if (ch > 127) {
7527 if (Py_UNICODE_ISSPACE(ch))
7528 fixed = ' ';
7529 else {
7530 const int decimal = Py_UNICODE_TODECIMAL(ch);
7531 if (decimal >= 0)
7532 fixed = '0' + decimal;
7533 }
7534 if (fixed != 0) {
7535 if (fixed > maxchar)
7536 maxchar = fixed;
7537 PyUnicode_WRITE(kind, data, i, fixed);
7538 }
7539 else if (ch > maxchar)
7540 maxchar = ch;
7541 }
7542 else if (ch > maxchar)
7543 maxchar = ch;
7544 }
7545
7546 return maxchar;
7547}
7548
7549PyObject *
7550_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7551{
7552 if (!PyUnicode_Check(unicode)) {
7553 PyErr_BadInternalCall();
7554 return NULL;
7555 }
7556 if (PyUnicode_READY(unicode) == -1)
7557 return NULL;
7558 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7559 /* If the string is already ASCII, just return the same string */
7560 Py_INCREF(unicode);
7561 return unicode;
7562 }
7563 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7564}
7565
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007566PyObject *
7567PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7568 Py_ssize_t length)
7569{
7570 PyObject *result;
7571 Py_UNICODE *p; /* write pointer into result */
7572 Py_ssize_t i;
7573 /* Copy to a new string */
7574 result = (PyObject *)_PyUnicode_New(length);
7575 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7576 if (result == NULL)
7577 return result;
7578 p = PyUnicode_AS_UNICODE(result);
7579 /* Iterate over code points */
7580 for (i = 0; i < length; i++) {
7581 Py_UNICODE ch =s[i];
7582 if (ch > 127) {
7583 int decimal = Py_UNICODE_TODECIMAL(ch);
7584 if (decimal >= 0)
7585 p[i] = '0' + decimal;
7586 }
7587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007588 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7589 Py_DECREF(result);
7590 return NULL;
7591 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007592 return result;
7593}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007594/* --- Decimal Encoder ---------------------------------------------------- */
7595
Alexander Belopolsky40018472011-02-26 01:02:56 +00007596int
7597PyUnicode_EncodeDecimal(Py_UNICODE *s,
7598 Py_ssize_t length,
7599 char *output,
7600 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007601{
7602 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 PyObject *errorHandler = NULL;
7604 PyObject *exc = NULL;
7605 const char *encoding = "decimal";
7606 const char *reason = "invalid decimal Unicode string";
7607 /* the following variable is used for caching string comparisons
7608 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7609 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007610
7611 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 PyErr_BadArgument();
7613 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007614 }
7615
7616 p = s;
7617 end = s + length;
7618 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 register Py_UNICODE ch = *p;
7620 int decimal;
7621 PyObject *repunicode;
7622 Py_ssize_t repsize;
7623 Py_ssize_t newpos;
7624 Py_UNICODE *uni2;
7625 Py_UNICODE *collstart;
7626 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007627
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 ++p;
7631 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 decimal = Py_UNICODE_TODECIMAL(ch);
7634 if (decimal >= 0) {
7635 *output++ = '0' + decimal;
7636 ++p;
7637 continue;
7638 }
7639 if (0 < ch && ch < 256) {
7640 *output++ = (char)ch;
7641 ++p;
7642 continue;
7643 }
7644 /* All other characters are considered unencodable */
7645 collstart = p;
7646 collend = p+1;
7647 while (collend < end) {
7648 if ((0 < *collend && *collend < 256) ||
7649 !Py_UNICODE_ISSPACE(*collend) ||
7650 Py_UNICODE_TODECIMAL(*collend))
7651 break;
7652 }
7653 /* cache callback name lookup
7654 * (if not done yet, i.e. it's the first error) */
7655 if (known_errorHandler==-1) {
7656 if ((errors==NULL) || (!strcmp(errors, "strict")))
7657 known_errorHandler = 1;
7658 else if (!strcmp(errors, "replace"))
7659 known_errorHandler = 2;
7660 else if (!strcmp(errors, "ignore"))
7661 known_errorHandler = 3;
7662 else if (!strcmp(errors, "xmlcharrefreplace"))
7663 known_errorHandler = 4;
7664 else
7665 known_errorHandler = 0;
7666 }
7667 switch (known_errorHandler) {
7668 case 1: /* strict */
7669 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7670 goto onError;
7671 case 2: /* replace */
7672 for (p = collstart; p < collend; ++p)
7673 *output++ = '?';
7674 /* fall through */
7675 case 3: /* ignore */
7676 p = collend;
7677 break;
7678 case 4: /* xmlcharrefreplace */
7679 /* generate replacement (temporarily (mis)uses p) */
7680 for (p = collstart; p < collend; ++p)
7681 output += sprintf(output, "&#%d;", (int)*p);
7682 p = collend;
7683 break;
7684 default:
7685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7686 encoding, reason, s, length, &exc,
7687 collstart-s, collend-s, &newpos);
7688 if (repunicode == NULL)
7689 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007690 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007691 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007692 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7693 Py_DECREF(repunicode);
7694 goto onError;
7695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 /* generate replacement */
7697 repsize = PyUnicode_GET_SIZE(repunicode);
7698 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7699 Py_UNICODE ch = *uni2;
7700 if (Py_UNICODE_ISSPACE(ch))
7701 *output++ = ' ';
7702 else {
7703 decimal = Py_UNICODE_TODECIMAL(ch);
7704 if (decimal >= 0)
7705 *output++ = '0' + decimal;
7706 else if (0 < ch && ch < 256)
7707 *output++ = (char)ch;
7708 else {
7709 Py_DECREF(repunicode);
7710 raise_encode_exception(&exc, encoding,
7711 s, length, collstart-s, collend-s, reason);
7712 goto onError;
7713 }
7714 }
7715 }
7716 p = s + newpos;
7717 Py_DECREF(repunicode);
7718 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007719 }
7720 /* 0-terminate the output string */
7721 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 Py_XDECREF(exc);
7723 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007724 return 0;
7725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007727 Py_XDECREF(exc);
7728 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007729 return -1;
7730}
7731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732/* --- Helpers ------------------------------------------------------------ */
7733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734#include "stringlib/ucs1lib.h"
7735#include "stringlib/fastsearch.h"
7736#include "stringlib/partition.h"
7737#include "stringlib/split.h"
7738#include "stringlib/count.h"
7739#include "stringlib/find.h"
7740#include "stringlib/localeutil.h"
7741#include "stringlib/undef.h"
7742
7743#include "stringlib/ucs2lib.h"
7744#include "stringlib/fastsearch.h"
7745#include "stringlib/partition.h"
7746#include "stringlib/split.h"
7747#include "stringlib/count.h"
7748#include "stringlib/find.h"
7749#include "stringlib/localeutil.h"
7750#include "stringlib/undef.h"
7751
7752#include "stringlib/ucs4lib.h"
7753#include "stringlib/fastsearch.h"
7754#include "stringlib/partition.h"
7755#include "stringlib/split.h"
7756#include "stringlib/count.h"
7757#include "stringlib/find.h"
7758#include "stringlib/localeutil.h"
7759#include "stringlib/undef.h"
7760
7761static Py_ssize_t
7762any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7763 const Py_UCS1*, Py_ssize_t,
7764 Py_ssize_t, Py_ssize_t),
7765 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7766 const Py_UCS2*, Py_ssize_t,
7767 Py_ssize_t, Py_ssize_t),
7768 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7769 const Py_UCS4*, Py_ssize_t,
7770 Py_ssize_t, Py_ssize_t),
7771 PyObject* s1, PyObject* s2,
7772 Py_ssize_t start,
7773 Py_ssize_t end)
7774{
7775 int kind1, kind2, kind;
7776 void *buf1, *buf2;
7777 Py_ssize_t len1, len2, result;
7778
7779 kind1 = PyUnicode_KIND(s1);
7780 kind2 = PyUnicode_KIND(s2);
7781 kind = kind1 > kind2 ? kind1 : kind2;
7782 buf1 = PyUnicode_DATA(s1);
7783 buf2 = PyUnicode_DATA(s2);
7784 if (kind1 != kind)
7785 buf1 = _PyUnicode_AsKind(s1, kind);
7786 if (!buf1)
7787 return -2;
7788 if (kind2 != kind)
7789 buf2 = _PyUnicode_AsKind(s2, kind);
7790 if (!buf2) {
7791 if (kind1 != kind) PyMem_Free(buf1);
7792 return -2;
7793 }
7794 len1 = PyUnicode_GET_LENGTH(s1);
7795 len2 = PyUnicode_GET_LENGTH(s2);
7796
7797 switch(kind) {
7798 case PyUnicode_1BYTE_KIND:
7799 result = ucs1(buf1, len1, buf2, len2, start, end);
7800 break;
7801 case PyUnicode_2BYTE_KIND:
7802 result = ucs2(buf1, len1, buf2, len2, start, end);
7803 break;
7804 case PyUnicode_4BYTE_KIND:
7805 result = ucs4(buf1, len1, buf2, len2, start, end);
7806 break;
7807 default:
7808 assert(0); result = -2;
7809 }
7810
7811 if (kind1 != kind)
7812 PyMem_Free(buf1);
7813 if (kind2 != kind)
7814 PyMem_Free(buf2);
7815
7816 return result;
7817}
7818
7819Py_ssize_t
7820_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7821 Py_ssize_t n_buffer,
7822 void *digits, Py_ssize_t n_digits,
7823 Py_ssize_t min_width,
7824 const char *grouping,
7825 const char *thousands_sep)
7826{
7827 switch(kind) {
7828 case PyUnicode_1BYTE_KIND:
7829 return _PyUnicode_ucs1_InsertThousandsGrouping(
7830 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7831 min_width, grouping, thousands_sep);
7832 case PyUnicode_2BYTE_KIND:
7833 return _PyUnicode_ucs2_InsertThousandsGrouping(
7834 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7835 min_width, grouping, thousands_sep);
7836 case PyUnicode_4BYTE_KIND:
7837 return _PyUnicode_ucs4_InsertThousandsGrouping(
7838 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7839 min_width, grouping, thousands_sep);
7840 }
7841 assert(0);
7842 return -1;
7843}
7844
7845
Eric Smith8c663262007-08-25 02:26:07 +00007846#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007847#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007848
Thomas Wouters477c8d52006-05-27 19:21:47 +00007849#include "stringlib/count.h"
7850#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007851
/* helper macro to fixup start/end slice values

   end is clamped to len; a negative end or start is taken relative to
   len and floored at 0.  start is not clamped against len.

   start and end must be modifiable lvalues and every argument may be
   evaluated more than once.  The expansion is a single statement: invoke
   it followed by a semicolon.  The do/while(0) wrapper keeps it safe in
   an unbraced if/else, and the parenthesized arguments keep it correct
   for compound expressions such as a conditional len. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007866
Alexander Belopolsky40018472011-02-26 01:02:56 +00007867Py_ssize_t
7868PyUnicode_Count(PyObject *str,
7869 PyObject *substr,
7870 Py_ssize_t start,
7871 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007873 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007874 PyUnicodeObject* str_obj;
7875 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 int kind1, kind2, kind;
7877 void *buf1 = NULL, *buf2 = NULL;
7878 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007879
Thomas Wouters477c8d52006-05-27 19:21:47 +00007880 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007883 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 Py_DECREF(str_obj);
7886 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 }
Tim Petersced69f82003-09-16 20:30:58 +00007888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 kind1 = PyUnicode_KIND(str_obj);
7890 kind2 = PyUnicode_KIND(sub_obj);
7891 kind = kind1 > kind2 ? kind1 : kind2;
7892 buf1 = PyUnicode_DATA(str_obj);
7893 if (kind1 != kind)
7894 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7895 if (!buf1)
7896 goto onError;
7897 buf2 = PyUnicode_DATA(sub_obj);
7898 if (kind2 != kind)
7899 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7900 if (!buf2)
7901 goto onError;
7902 len1 = PyUnicode_GET_LENGTH(str_obj);
7903 len2 = PyUnicode_GET_LENGTH(sub_obj);
7904
7905 ADJUST_INDICES(start, end, len1);
7906 switch(kind) {
7907 case PyUnicode_1BYTE_KIND:
7908 result = ucs1lib_count(
7909 ((Py_UCS1*)buf1) + start, end - start,
7910 buf2, len2, PY_SSIZE_T_MAX
7911 );
7912 break;
7913 case PyUnicode_2BYTE_KIND:
7914 result = ucs2lib_count(
7915 ((Py_UCS2*)buf1) + start, end - start,
7916 buf2, len2, PY_SSIZE_T_MAX
7917 );
7918 break;
7919 case PyUnicode_4BYTE_KIND:
7920 result = ucs4lib_count(
7921 ((Py_UCS4*)buf1) + start, end - start,
7922 buf2, len2, PY_SSIZE_T_MAX
7923 );
7924 break;
7925 default:
7926 assert(0); result = 0;
7927 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007928
7929 Py_DECREF(sub_obj);
7930 Py_DECREF(str_obj);
7931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 if (kind1 != kind)
7933 PyMem_Free(buf1);
7934 if (kind2 != kind)
7935 PyMem_Free(buf2);
7936
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007938 onError:
7939 Py_DECREF(sub_obj);
7940 Py_DECREF(str_obj);
7941 if (kind1 != kind && buf1)
7942 PyMem_Free(buf1);
7943 if (kind2 != kind && buf2)
7944 PyMem_Free(buf2);
7945 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946}
7947
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948Py_ssize_t
7949PyUnicode_Find(PyObject *str,
7950 PyObject *sub,
7951 Py_ssize_t start,
7952 Py_ssize_t end,
7953 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007956
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007961 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 Py_DECREF(str);
7963 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 }
Tim Petersced69f82003-09-16 20:30:58 +00007965
Thomas Wouters477c8d52006-05-27 19:21:47 +00007966 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 result = any_find_slice(
7968 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7969 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007970 );
7971 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 result = any_find_slice(
7973 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7974 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007975 );
7976
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007978 Py_DECREF(sub);
7979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 return result;
7981}
7982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983Py_ssize_t
7984PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7985 Py_ssize_t start, Py_ssize_t end,
7986 int direction)
7987{
7988 char *result;
7989 int kind;
7990 if (PyUnicode_READY(str) == -1)
7991 return -2;
7992 if (end > PyUnicode_GET_LENGTH(str))
7993 end = PyUnicode_GET_LENGTH(str);
7994 kind = PyUnicode_KIND(str);
7995 result = findchar(PyUnicode_1BYTE_DATA(str)
7996 + PyUnicode_KIND_SIZE(kind, start),
7997 kind,
7998 end-start, ch, direction);
7999 if (!result)
8000 return -1;
8001 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8002}
8003
Alexander Belopolsky40018472011-02-26 01:02:56 +00008004static int
8005tailmatch(PyUnicodeObject *self,
8006 PyUnicodeObject *substring,
8007 Py_ssize_t start,
8008 Py_ssize_t end,
8009 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008011 int kind_self;
8012 int kind_sub;
8013 void *data_self;
8014 void *data_sub;
8015 Py_ssize_t offset;
8016 Py_ssize_t i;
8017 Py_ssize_t end_sub;
8018
8019 if (PyUnicode_READY(self) == -1 ||
8020 PyUnicode_READY(substring) == -1)
8021 return 0;
8022
8023 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 return 1;
8025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8027 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 kind_self = PyUnicode_KIND(self);
8032 data_self = PyUnicode_DATA(self);
8033 kind_sub = PyUnicode_KIND(substring);
8034 data_sub = PyUnicode_DATA(substring);
8035 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8036
8037 if (direction > 0)
8038 offset = end;
8039 else
8040 offset = start;
8041
8042 if (PyUnicode_READ(kind_self, data_self, offset) ==
8043 PyUnicode_READ(kind_sub, data_sub, 0) &&
8044 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8045 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8046 /* If both are of the same kind, memcmp is sufficient */
8047 if (kind_self == kind_sub) {
8048 return ! memcmp((char *)data_self +
8049 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8050 data_sub,
8051 PyUnicode_GET_LENGTH(substring) *
8052 PyUnicode_CHARACTER_SIZE(substring));
8053 }
8054 /* otherwise we have to compare each character by first accesing it */
8055 else {
8056 /* We do not need to compare 0 and len(substring)-1 because
8057 the if statement above ensured already that they are equal
8058 when we end up here. */
8059 // TODO: honor direction and do a forward or backwards search
8060 for (i = 1; i < end_sub; ++i) {
8061 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8062 PyUnicode_READ(kind_sub, data_sub, i))
8063 return 0;
8064 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 }
8068
8069 return 0;
8070}
8071
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072Py_ssize_t
8073PyUnicode_Tailmatch(PyObject *str,
8074 PyObject *substr,
8075 Py_ssize_t start,
8076 Py_ssize_t end,
8077 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008079 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 str = PyUnicode_FromObject(str);
8082 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 substr = PyUnicode_FromObject(substr);
8085 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 Py_DECREF(str);
8087 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 }
Tim Petersced69f82003-09-16 20:30:58 +00008089
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 (PyUnicodeObject *)substr,
8092 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 Py_DECREF(str);
8094 Py_DECREF(substr);
8095 return result;
8096}
8097
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098/* Apply fixfct filter to the Unicode object self and return a
8099 reference to the modified object */
8100
Alexander Belopolsky40018472011-02-26 01:02:56 +00008101static PyObject *
8102fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 PyObject *u;
8106 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 if (PyUnicode_READY(self) == -1)
8109 return NULL;
8110 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8111 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8112 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8117 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 /* fix functions return the new maximum character in a string,
8120 if the kind of the resulting unicode object does not change,
8121 everything is fine. Otherwise we need to change the string kind
8122 and re-run the fix function. */
8123 maxchar_new = fixfct((PyUnicodeObject*)u);
8124 if (maxchar_new == 0)
8125 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8126 else if (maxchar_new <= 127)
8127 maxchar_new = 127;
8128 else if (maxchar_new <= 255)
8129 maxchar_new = 255;
8130 else if (maxchar_new <= 65535)
8131 maxchar_new = 65535;
8132 else
8133 maxchar_new = 1114111; /* 0x10ffff */
8134
8135 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 /* fixfct should return TRUE if it modified the buffer. If
8137 FALSE, return a reference to the original buffer instead
8138 (to save space, not time) */
8139 Py_INCREF(self);
8140 Py_DECREF(u);
8141 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 else if (maxchar_new == maxchar_old) {
8144 return u;
8145 }
8146 else {
8147 /* In case the maximum character changed, we need to
8148 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008149 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008150 if (v == NULL) {
8151 Py_DECREF(u);
8152 return NULL;
8153 }
8154 if (maxchar_new > maxchar_old) {
8155 /* If the maxchar increased so that the kind changed, not all
8156 characters are representable anymore and we need to fix the
8157 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008158 if (PyUnicode_CopyCharacters(v, 0,
8159 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008160 PyUnicode_GET_LENGTH(self)) < 0)
8161 {
8162 Py_DECREF(u);
8163 return NULL;
8164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 maxchar_old = fixfct((PyUnicodeObject*)v);
8166 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8167 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008168 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008169 if (PyUnicode_CopyCharacters(v, 0,
8170 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008171 PyUnicode_GET_LENGTH(self)) < 0)
8172 {
8173 Py_DECREF(u);
8174 return NULL;
8175 }
8176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177
8178 Py_DECREF(u);
8179 return v;
8180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181}
8182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008184fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186 /* No need to call PyUnicode_READY(self) because this function is only
8187 called as a callback from fixup() which does it already. */
8188 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8189 const int kind = PyUnicode_KIND(self);
8190 void *data = PyUnicode_DATA(self);
8191 int touched = 0;
8192 Py_UCS4 maxchar = 0;
8193 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 for (i = 0; i < len; ++i) {
8196 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8197 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8198 if (up != ch) {
8199 if (up > maxchar)
8200 maxchar = up;
8201 PyUnicode_WRITE(kind, data, i, up);
8202 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 else if (ch > maxchar)
8205 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 }
8207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 if (touched)
8209 return maxchar;
8210 else
8211 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212}
8213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008214static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8218 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8219 const int kind = PyUnicode_KIND(self);
8220 void *data = PyUnicode_DATA(self);
8221 int touched = 0;
8222 Py_UCS4 maxchar = 0;
8223 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 for(i = 0; i < len; ++i) {
8226 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8227 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8228 if (lo != ch) {
8229 if (lo > maxchar)
8230 maxchar = lo;
8231 PyUnicode_WRITE(kind, data, i, lo);
8232 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 else if (ch > maxchar)
8235 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
8237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 if (touched)
8239 return maxchar;
8240 else
8241 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242}
8243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8248 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8249 const int kind = PyUnicode_KIND(self);
8250 void *data = PyUnicode_DATA(self);
8251 int touched = 0;
8252 Py_UCS4 maxchar = 0;
8253 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 for(i = 0; i < len; ++i) {
8256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8257 Py_UCS4 nu = 0;
8258
8259 if (Py_UNICODE_ISUPPER(ch))
8260 nu = Py_UNICODE_TOLOWER(ch);
8261 else if (Py_UNICODE_ISLOWER(ch))
8262 nu = Py_UNICODE_TOUPPER(ch);
8263
8264 if (nu != 0) {
8265 if (nu > maxchar)
8266 maxchar = nu;
8267 PyUnicode_WRITE(kind, data, i, nu);
8268 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 else if (ch > maxchar)
8271 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
8273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (touched)
8275 return maxchar;
8276 else
8277 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278}
8279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008281fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8284 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8285 const int kind = PyUnicode_KIND(self);
8286 void *data = PyUnicode_DATA(self);
8287 int touched = 0;
8288 Py_UCS4 maxchar = 0;
8289 Py_ssize_t i = 0;
8290 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008291
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008292 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294
8295 ch = PyUnicode_READ(kind, data, i);
8296 if (!Py_UNICODE_ISUPPER(ch)) {
8297 maxchar = Py_UNICODE_TOUPPER(ch);
8298 PyUnicode_WRITE(kind, data, i, maxchar);
8299 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 ++i;
8302 for(; i < len; ++i) {
8303 ch = PyUnicode_READ(kind, data, i);
8304 if (!Py_UNICODE_ISLOWER(ch)) {
8305 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8306 if (lo > maxchar)
8307 maxchar = lo;
8308 PyUnicode_WRITE(kind, data, i, lo);
8309 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 else if (ch > maxchar)
8312 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314
8315 if (touched)
8316 return maxchar;
8317 else
8318 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008322fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8325 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8326 const int kind = PyUnicode_KIND(self);
8327 void *data = PyUnicode_DATA(self);
8328 Py_UCS4 maxchar = 0;
8329 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 int previous_is_cased;
8331
8332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 if (len == 1) {
8334 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8335 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8336 if (ti != ch) {
8337 PyUnicode_WRITE(kind, data, i, ti);
8338 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 }
8340 else
8341 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 for(; i < len; ++i) {
8345 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8346 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008347
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 nu = Py_UNICODE_TOTITLE(ch);
8352
8353 if (nu > maxchar)
8354 maxchar = nu;
8355 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008356
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 if (Py_UNICODE_ISLOWER(ch) ||
8358 Py_UNICODE_ISUPPER(ch) ||
8359 Py_UNICODE_ISTITLE(ch))
8360 previous_is_cased = 1;
8361 else
8362 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Tim Peters8ce9f162004-08-27 01:49:32 +00008367PyObject *
8368PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008371 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008373 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008374 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8375 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008376 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 Py_ssize_t sz, i, res_offset;
8378 Py_UCS4 maxchar = 0;
8379 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Tim Peters05eba1f2004-08-27 21:32:02 +00008381 fseq = PySequence_Fast(seq, "");
8382 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008384 }
8385
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008386 /* NOTE: the following code can't call back into Python code,
8387 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008388 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008389
Tim Peters05eba1f2004-08-27 21:32:02 +00008390 seqlen = PySequence_Fast_GET_SIZE(fseq);
8391 /* If empty sequence, return u"". */
8392 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008395 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008396 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008397 /* If singleton sequence with an exact Unicode, return that. */
8398 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 item = items[0];
8400 if (PyUnicode_CheckExact(item)) {
8401 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 goto Done;
8404 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008405 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008406 else {
8407 /* Set up sep and seplen */
8408 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 /* fall back to a blank space separator */
8410 sep = PyUnicode_FromOrdinal(' ');
8411 if (!sep || PyUnicode_READY(sep) == -1)
8412 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008413 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008414 else {
8415 if (!PyUnicode_Check(separator)) {
8416 PyErr_Format(PyExc_TypeError,
8417 "separator: expected str instance,"
8418 " %.80s found",
8419 Py_TYPE(separator)->tp_name);
8420 goto onError;
8421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 if (PyUnicode_READY(separator) == -1)
8423 goto onError;
8424 sep = separator;
8425 seplen = PyUnicode_GET_LENGTH(separator);
8426 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8427 /* inc refcount to keep this code path symetric with the
8428 above case of a blank separator */
8429 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008430 }
8431 }
8432
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008433 /* There are at least two things to join, or else we have a subclass
8434 * of str in the sequence.
8435 * Do a pre-pass to figure out the total amount of space we'll
8436 * need (sz), and see whether all argument are strings.
8437 */
8438 sz = 0;
8439 for (i = 0; i < seqlen; i++) {
8440 const Py_ssize_t old_sz = sz;
8441 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 if (!PyUnicode_Check(item)) {
8443 PyErr_Format(PyExc_TypeError,
8444 "sequence item %zd: expected str instance,"
8445 " %.80s found",
8446 i, Py_TYPE(item)->tp_name);
8447 goto onError;
8448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 if (PyUnicode_READY(item) == -1)
8450 goto onError;
8451 sz += PyUnicode_GET_LENGTH(item);
8452 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8453 if (item_maxchar > maxchar)
8454 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008455 if (i != 0)
8456 sz += seplen;
8457 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8458 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008460 goto onError;
8461 }
8462 }
Tim Petersced69f82003-09-16 20:30:58 +00008463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008465 if (res == NULL)
8466 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008467
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008468 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008470 Py_ssize_t itemlen;
8471 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 /* Copy item, and maybe the separator. */
8474 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008475 if (PyUnicode_CopyCharacters(res, res_offset,
8476 sep, 0, seplen) < 0)
8477 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008480 if (PyUnicode_CopyCharacters(res, res_offset,
8481 item, 0, itemlen) < 0)
8482 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008486
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008488 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 Py_XDECREF(sep);
8490 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008493 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008495 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 return NULL;
8497}
8498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499#define FILL(kind, data, value, start, length) \
8500 do { \
8501 Py_ssize_t i_ = 0; \
8502 assert(kind != PyUnicode_WCHAR_KIND); \
8503 switch ((kind)) { \
8504 case PyUnicode_1BYTE_KIND: { \
8505 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8506 memset(to_, (unsigned char)value, length); \
8507 break; \
8508 } \
8509 case PyUnicode_2BYTE_KIND: { \
8510 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8511 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8512 break; \
8513 } \
8514 default: { \
8515 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8516 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8517 break; \
8518 } \
8519 } \
8520 } while (0)
8521
Alexander Belopolsky40018472011-02-26 01:02:56 +00008522static PyUnicodeObject *
8523pad(PyUnicodeObject *self,
8524 Py_ssize_t left,
8525 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 PyObject *u;
8529 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008530 int kind;
8531 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
8533 if (left < 0)
8534 left = 0;
8535 if (right < 0)
8536 right = 0;
8537
Tim Peters7a29bd52001-09-12 03:03:31 +00008538 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 Py_INCREF(self);
8540 return self;
8541 }
8542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8544 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008545 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8546 return NULL;
8547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8549 if (fill > maxchar)
8550 maxchar = fill;
8551 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008552 if (!u)
8553 return NULL;
8554
8555 kind = PyUnicode_KIND(u);
8556 data = PyUnicode_DATA(u);
8557 if (left)
8558 FILL(kind, data, fill, 0, left);
8559 if (right)
8560 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008561 if (PyUnicode_CopyCharacters(u, left,
8562 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008563 _PyUnicode_LENGTH(self)) < 0)
8564 {
8565 Py_DECREF(u);
8566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 }
8568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573PyObject *
8574PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577
8578 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 switch(PyUnicode_KIND(string)) {
8583 case PyUnicode_1BYTE_KIND:
8584 list = ucs1lib_splitlines(
8585 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8586 PyUnicode_GET_LENGTH(string), keepends);
8587 break;
8588 case PyUnicode_2BYTE_KIND:
8589 list = ucs2lib_splitlines(
8590 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8591 PyUnicode_GET_LENGTH(string), keepends);
8592 break;
8593 case PyUnicode_4BYTE_KIND:
8594 list = ucs4lib_splitlines(
8595 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8596 PyUnicode_GET_LENGTH(string), keepends);
8597 break;
8598 default:
8599 assert(0);
8600 list = 0;
8601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 Py_DECREF(string);
8603 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606static PyObject *
8607split(PyUnicodeObject *self,
8608 PyUnicodeObject *substring,
8609 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 int kind1, kind2, kind;
8612 void *buf1, *buf2;
8613 Py_ssize_t len1, len2;
8614 PyObject* out;
8615
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008617 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 if (PyUnicode_READY(self) == -1)
8620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 if (substring == NULL)
8623 switch(PyUnicode_KIND(self)) {
8624 case PyUnicode_1BYTE_KIND:
8625 return ucs1lib_split_whitespace(
8626 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8627 PyUnicode_GET_LENGTH(self), maxcount
8628 );
8629 case PyUnicode_2BYTE_KIND:
8630 return ucs2lib_split_whitespace(
8631 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8632 PyUnicode_GET_LENGTH(self), maxcount
8633 );
8634 case PyUnicode_4BYTE_KIND:
8635 return ucs4lib_split_whitespace(
8636 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8637 PyUnicode_GET_LENGTH(self), maxcount
8638 );
8639 default:
8640 assert(0);
8641 return NULL;
8642 }
8643
8644 if (PyUnicode_READY(substring) == -1)
8645 return NULL;
8646
8647 kind1 = PyUnicode_KIND(self);
8648 kind2 = PyUnicode_KIND(substring);
8649 kind = kind1 > kind2 ? kind1 : kind2;
8650 buf1 = PyUnicode_DATA(self);
8651 buf2 = PyUnicode_DATA(substring);
8652 if (kind1 != kind)
8653 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8654 if (!buf1)
8655 return NULL;
8656 if (kind2 != kind)
8657 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8658 if (!buf2) {
8659 if (kind1 != kind) PyMem_Free(buf1);
8660 return NULL;
8661 }
8662 len1 = PyUnicode_GET_LENGTH(self);
8663 len2 = PyUnicode_GET_LENGTH(substring);
8664
8665 switch(kind) {
8666 case PyUnicode_1BYTE_KIND:
8667 out = ucs1lib_split(
8668 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8669 break;
8670 case PyUnicode_2BYTE_KIND:
8671 out = ucs2lib_split(
8672 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8673 break;
8674 case PyUnicode_4BYTE_KIND:
8675 out = ucs4lib_split(
8676 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8677 break;
8678 default:
8679 out = NULL;
8680 }
8681 if (kind1 != kind)
8682 PyMem_Free(buf1);
8683 if (kind2 != kind)
8684 PyMem_Free(buf2);
8685 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686}
8687
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688static PyObject *
8689rsplit(PyUnicodeObject *self,
8690 PyUnicodeObject *substring,
8691 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 int kind1, kind2, kind;
8694 void *buf1, *buf2;
8695 Py_ssize_t len1, len2;
8696 PyObject* out;
8697
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008698 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008699 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (PyUnicode_READY(self) == -1)
8702 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 if (substring == NULL)
8705 switch(PyUnicode_KIND(self)) {
8706 case PyUnicode_1BYTE_KIND:
8707 return ucs1lib_rsplit_whitespace(
8708 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8709 PyUnicode_GET_LENGTH(self), maxcount
8710 );
8711 case PyUnicode_2BYTE_KIND:
8712 return ucs2lib_rsplit_whitespace(
8713 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8714 PyUnicode_GET_LENGTH(self), maxcount
8715 );
8716 case PyUnicode_4BYTE_KIND:
8717 return ucs4lib_rsplit_whitespace(
8718 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8719 PyUnicode_GET_LENGTH(self), maxcount
8720 );
8721 default:
8722 assert(0);
8723 return NULL;
8724 }
8725
8726 if (PyUnicode_READY(substring) == -1)
8727 return NULL;
8728
8729 kind1 = PyUnicode_KIND(self);
8730 kind2 = PyUnicode_KIND(substring);
8731 kind = kind1 > kind2 ? kind1 : kind2;
8732 buf1 = PyUnicode_DATA(self);
8733 buf2 = PyUnicode_DATA(substring);
8734 if (kind1 != kind)
8735 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8736 if (!buf1)
8737 return NULL;
8738 if (kind2 != kind)
8739 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8740 if (!buf2) {
8741 if (kind1 != kind) PyMem_Free(buf1);
8742 return NULL;
8743 }
8744 len1 = PyUnicode_GET_LENGTH(self);
8745 len2 = PyUnicode_GET_LENGTH(substring);
8746
8747 switch(kind) {
8748 case PyUnicode_1BYTE_KIND:
8749 out = ucs1lib_rsplit(
8750 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8751 break;
8752 case PyUnicode_2BYTE_KIND:
8753 out = ucs2lib_rsplit(
8754 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8755 break;
8756 case PyUnicode_4BYTE_KIND:
8757 out = ucs4lib_rsplit(
8758 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8759 break;
8760 default:
8761 out = NULL;
8762 }
8763 if (kind1 != kind)
8764 PyMem_Free(buf1);
8765 if (kind2 != kind)
8766 PyMem_Free(buf2);
8767 return out;
8768}
8769
8770static Py_ssize_t
8771anylib_find(int kind, void *buf1, Py_ssize_t len1,
8772 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8773{
8774 switch(kind) {
8775 case PyUnicode_1BYTE_KIND:
8776 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8777 case PyUnicode_2BYTE_KIND:
8778 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8779 case PyUnicode_4BYTE_KIND:
8780 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8781 }
8782 assert(0);
8783 return -1;
8784}
8785
8786static Py_ssize_t
8787anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8788 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8789{
8790 switch(kind) {
8791 case PyUnicode_1BYTE_KIND:
8792 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8793 case PyUnicode_2BYTE_KIND:
8794 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8795 case PyUnicode_4BYTE_KIND:
8796 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8797 }
8798 assert(0);
8799 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008800}
8801
Alexander Belopolsky40018472011-02-26 01:02:56 +00008802static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803replace(PyObject *self, PyObject *str1,
8804 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 PyObject *u;
8807 char *sbuf = PyUnicode_DATA(self);
8808 char *buf1 = PyUnicode_DATA(str1);
8809 char *buf2 = PyUnicode_DATA(str2);
8810 int srelease = 0, release1 = 0, release2 = 0;
8811 int skind = PyUnicode_KIND(self);
8812 int kind1 = PyUnicode_KIND(str1);
8813 int kind2 = PyUnicode_KIND(str2);
8814 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8815 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8816 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817
8818 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008821 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 if (skind < kind1)
8824 /* substring too wide to be present */
8825 goto nothing;
8826
8827 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008828 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008829 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008831 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008833 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 Py_UCS4 u1, u2, maxchar;
8835 int mayshrink, rkind;
8836 u1 = PyUnicode_READ_CHAR(str1, 0);
8837 if (!findchar(sbuf, PyUnicode_KIND(self),
8838 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008839 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 u2 = PyUnicode_READ_CHAR(str2, 0);
8841 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8842 /* Replacing u1 with u2 may cause a maxchar reduction in the
8843 result string. */
8844 mayshrink = maxchar > 127;
8845 if (u2 > maxchar) {
8846 maxchar = u2;
8847 mayshrink = 0;
8848 }
8849 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008850 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008852 if (PyUnicode_CopyCharacters(u, 0,
8853 (PyObject*)self, 0, slen) < 0)
8854 {
8855 Py_DECREF(u);
8856 return NULL;
8857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 rkind = PyUnicode_KIND(u);
8859 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8860 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008861 if (--maxcount < 0)
8862 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865 if (mayshrink) {
8866 PyObject *tmp = u;
8867 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8868 PyUnicode_GET_LENGTH(tmp));
8869 Py_DECREF(tmp);
8870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 int rkind = skind;
8873 char *res;
8874 if (kind1 < rkind) {
8875 /* widen substring */
8876 buf1 = _PyUnicode_AsKind(str1, rkind);
8877 if (!buf1) goto error;
8878 release1 = 1;
8879 }
8880 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008881 if (i < 0)
8882 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 if (rkind > kind2) {
8884 /* widen replacement */
8885 buf2 = _PyUnicode_AsKind(str2, rkind);
8886 if (!buf2) goto error;
8887 release2 = 1;
8888 }
8889 else if (rkind < kind2) {
8890 /* widen self and buf1 */
8891 rkind = kind2;
8892 if (release1) PyMem_Free(buf1);
8893 sbuf = _PyUnicode_AsKind(self, rkind);
8894 if (!sbuf) goto error;
8895 srelease = 1;
8896 buf1 = _PyUnicode_AsKind(str1, rkind);
8897 if (!buf1) goto error;
8898 release1 = 1;
8899 }
8900 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8901 if (!res) {
8902 PyErr_NoMemory();
8903 goto error;
8904 }
8905 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008906 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8908 buf2,
8909 PyUnicode_KIND_SIZE(rkind, len2));
8910 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008911
8912 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8914 slen-i,
8915 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008916 if (i == -1)
8917 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8919 buf2,
8920 PyUnicode_KIND_SIZE(rkind, len2));
8921 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923
8924 u = PyUnicode_FromKindAndData(rkind, res, slen);
8925 PyMem_Free(res);
8926 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 Py_ssize_t n, i, j, ires;
8931 Py_ssize_t product, new_size;
8932 int rkind = skind;
8933 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 if (kind1 < rkind) {
8936 buf1 = _PyUnicode_AsKind(str1, rkind);
8937 if (!buf1) goto error;
8938 release1 = 1;
8939 }
8940 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008941 if (n == 0)
8942 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 if (kind2 < rkind) {
8944 buf2 = _PyUnicode_AsKind(str2, rkind);
8945 if (!buf2) goto error;
8946 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 else if (kind2 > rkind) {
8949 rkind = kind2;
8950 sbuf = _PyUnicode_AsKind(self, rkind);
8951 if (!sbuf) goto error;
8952 srelease = 1;
8953 if (release1) PyMem_Free(buf1);
8954 buf1 = _PyUnicode_AsKind(str1, rkind);
8955 if (!buf1) goto error;
8956 release1 = 1;
8957 }
8958 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8959 PyUnicode_GET_LENGTH(str1))); */
8960 product = n * (len2-len1);
8961 if ((product / (len2-len1)) != n) {
8962 PyErr_SetString(PyExc_OverflowError,
8963 "replace string is too long");
8964 goto error;
8965 }
8966 new_size = slen + product;
8967 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8968 PyErr_SetString(PyExc_OverflowError,
8969 "replace string is too long");
8970 goto error;
8971 }
8972 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8973 if (!res)
8974 goto error;
8975 ires = i = 0;
8976 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008977 while (n-- > 0) {
8978 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 j = anylib_find(rkind,
8980 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8981 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008982 if (j == -1)
8983 break;
8984 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008985 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8987 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8988 PyUnicode_KIND_SIZE(rkind, j-i));
8989 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008990 }
8991 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 if (len2 > 0) {
8993 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8994 buf2,
8995 PyUnicode_KIND_SIZE(rkind, len2));
8996 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9003 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9004 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005 } else {
9006 /* interleave */
9007 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9009 buf2,
9010 PyUnicode_KIND_SIZE(rkind, len2));
9011 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012 if (--n <= 0)
9013 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9015 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9016 PyUnicode_KIND_SIZE(rkind, 1));
9017 ires++;
9018 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9021 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9022 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 if (srelease)
9027 PyMem_FREE(sbuf);
9028 if (release1)
9029 PyMem_FREE(buf1);
9030 if (release2)
9031 PyMem_FREE(buf2);
9032 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009035 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 if (srelease)
9037 PyMem_FREE(sbuf);
9038 if (release1)
9039 PyMem_FREE(buf1);
9040 if (release2)
9041 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042 if (PyUnicode_CheckExact(self)) {
9043 Py_INCREF(self);
9044 return (PyObject *) self;
9045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9047 PyUnicode_DATA(self),
9048 PyUnicode_GET_LENGTH(self));
9049 error:
9050 if (srelease && sbuf)
9051 PyMem_FREE(sbuf);
9052 if (release1 && buf1)
9053 PyMem_FREE(buf1);
9054 if (release2 && buf2)
9055 PyMem_FREE(buf2);
9056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057}
9058
9059/* --- Unicode Object Methods --------------------------------------------- */
9060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009061PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063\n\
9064Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009065characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
9067static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009068unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070 return fixup(self, fixtitle);
9071}
9072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009073PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075\n\
9076Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009077have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078
9079static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009080unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 return fixup(self, fixcapitalize);
9083}
9084
9085#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009086PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088\n\
9089Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009090normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091
9092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009093unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094{
9095 PyObject *list;
9096 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009097 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 /* Split into words */
9100 list = split(self, NULL, -1);
9101 if (!list)
9102 return NULL;
9103
9104 /* Capitalize each word */
9105 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9106 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 if (item == NULL)
9109 goto onError;
9110 Py_DECREF(PyList_GET_ITEM(list, i));
9111 PyList_SET_ITEM(list, i, item);
9112 }
9113
9114 /* Join the words to form a new string */
9115 item = PyUnicode_Join(NULL, list);
9116
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 Py_DECREF(list);
9119 return (PyObject *)item;
9120}
9121#endif
9122
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009123/* Argument converter. Coerces to a single unicode character */
9124
9125static int
9126convert_uc(PyObject *obj, void *addr)
9127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009130
Benjamin Peterson14339b62009-01-31 16:36:08 +00009131 uniobj = PyUnicode_FromObject(obj);
9132 if (uniobj == NULL) {
9133 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009135 return 0;
9136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009138 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009140 Py_DECREF(uniobj);
9141 return 0;
9142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 if (PyUnicode_READY(uniobj)) {
9144 Py_DECREF(uniobj);
9145 return 0;
9146 }
9147 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009148 Py_DECREF(uniobj);
9149 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009150}
9151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009152PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009155Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009156done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157
9158static PyObject *
9159unicode_center(PyUnicodeObject *self, PyObject *args)
9160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009161 Py_ssize_t marg, left;
9162 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 Py_UCS4 fillchar = ' ';
9164
9165 if (PyUnicode_READY(self) == -1)
9166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167
Thomas Woutersde017742006-02-16 19:34:37 +00009168 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 return NULL;
9170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 Py_INCREF(self);
9173 return (PyObject*) self;
9174 }
9175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 left = marg / 2 + (marg & width & 1);
9178
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009179 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180}
9181
Marc-André Lemburge5034372000-08-08 08:04:29 +00009182#if 0
9183
9184/* This code should go into some future Unicode collation support
9185 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009186 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009187
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009188/* speedy UTF-16 code point order comparison */
9189/* gleaned from: */
9190/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9191
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009192static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009193{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009194 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009195 0, 0, 0, 0, 0, 0, 0, 0,
9196 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009197 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009198};
9199
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200static int
9201unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9202{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009203 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 Py_UNICODE *s1 = str1->str;
9206 Py_UNICODE *s2 = str2->str;
9207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 len1 = str1->_base._base.length;
9209 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009212 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009213
9214 c1 = *s1++;
9215 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009216
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 if (c1 > (1<<11) * 26)
9218 c1 += utf16Fixup[c1>>11];
9219 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009220 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009221 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009222
9223 if (c1 != c2)
9224 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009225
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009226 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 }
9228
9229 return (len1 < len2) ? -1 : (len1 != len2);
9230}
9231
Marc-André Lemburge5034372000-08-08 08:04:29 +00009232#else
9233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234/* This function assumes that str1 and str2 are readied by the caller. */
9235
Marc-André Lemburge5034372000-08-08 08:04:29 +00009236static int
9237unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 int kind1, kind2;
9240 void *data1, *data2;
9241 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 kind1 = PyUnicode_KIND(str1);
9244 kind2 = PyUnicode_KIND(str2);
9245 data1 = PyUnicode_DATA(str1);
9246 data2 = PyUnicode_DATA(str2);
9247 len1 = PyUnicode_GET_LENGTH(str1);
9248 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 for (i = 0; i < len1 && i < len2; ++i) {
9251 Py_UCS4 c1, c2;
9252 c1 = PyUnicode_READ(kind1, data1, i);
9253 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009254
9255 if (c1 != c2)
9256 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009257 }
9258
9259 return (len1 < len2) ? -1 : (len1 != len2);
9260}
9261
9262#endif
9263
Alexander Belopolsky40018472011-02-26 01:02:56 +00009264int
9265PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9268 if (PyUnicode_READY(left) == -1 ||
9269 PyUnicode_READY(right) == -1)
9270 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009271 return unicode_compare((PyUnicodeObject *)left,
9272 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009274 PyErr_Format(PyExc_TypeError,
9275 "Can't compare %.100s and %.100s",
9276 left->ob_type->tp_name,
9277 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278 return -1;
9279}
9280
Martin v. Löwis5b222132007-06-10 09:51:05 +00009281int
9282PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 Py_ssize_t i;
9285 int kind;
9286 void *data;
9287 Py_UCS4 chr;
9288
Martin v. Löwis5b222132007-06-10 09:51:05 +00009289 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 if (PyUnicode_READY(uni) == -1)
9291 return -1;
9292 kind = PyUnicode_KIND(uni);
9293 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009294 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9296 if (chr != str[i])
9297 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009298 /* This check keeps Python strings that end in '\0' from comparing equal
9299 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009302 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009304 return 0;
9305}
9306
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009307
Benjamin Peterson29060642009-01-31 22:14:21 +00009308#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009309 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009310
Alexander Belopolsky40018472011-02-26 01:02:56 +00009311PyObject *
9312PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009313{
9314 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009315
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009316 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9317 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (PyUnicode_READY(left) == -1 ||
9319 PyUnicode_READY(right) == -1)
9320 return NULL;
9321 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9322 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009323 if (op == Py_EQ) {
9324 Py_INCREF(Py_False);
9325 return Py_False;
9326 }
9327 if (op == Py_NE) {
9328 Py_INCREF(Py_True);
9329 return Py_True;
9330 }
9331 }
9332 if (left == right)
9333 result = 0;
9334 else
9335 result = unicode_compare((PyUnicodeObject *)left,
9336 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009337
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009338 /* Convert the return value to a Boolean */
9339 switch (op) {
9340 case Py_EQ:
9341 v = TEST_COND(result == 0);
9342 break;
9343 case Py_NE:
9344 v = TEST_COND(result != 0);
9345 break;
9346 case Py_LE:
9347 v = TEST_COND(result <= 0);
9348 break;
9349 case Py_GE:
9350 v = TEST_COND(result >= 0);
9351 break;
9352 case Py_LT:
9353 v = TEST_COND(result == -1);
9354 break;
9355 case Py_GT:
9356 v = TEST_COND(result == 1);
9357 break;
9358 default:
9359 PyErr_BadArgument();
9360 return NULL;
9361 }
9362 Py_INCREF(v);
9363 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009365
Brian Curtindfc80e32011-08-10 20:28:54 -05009366 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009367}
9368
Alexander Belopolsky40018472011-02-26 01:02:56 +00009369int
9370PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009371{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009372 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 int kind1, kind2, kind;
9374 void *buf1, *buf2;
9375 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009376 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009377
9378 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009379 sub = PyUnicode_FromObject(element);
9380 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 PyErr_Format(PyExc_TypeError,
9382 "'in <string>' requires string as left operand, not %s",
9383 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009384 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 if (PyUnicode_READY(sub) == -1)
9387 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009388
Thomas Wouters477c8d52006-05-27 19:21:47 +00009389 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009391 Py_DECREF(sub);
9392 return -1;
9393 }
9394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 kind1 = PyUnicode_KIND(str);
9396 kind2 = PyUnicode_KIND(sub);
9397 kind = kind1 > kind2 ? kind1 : kind2;
9398 buf1 = PyUnicode_DATA(str);
9399 buf2 = PyUnicode_DATA(sub);
9400 if (kind1 != kind)
9401 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9402 if (!buf1) {
9403 Py_DECREF(sub);
9404 return -1;
9405 }
9406 if (kind2 != kind)
9407 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9408 if (!buf2) {
9409 Py_DECREF(sub);
9410 if (kind1 != kind) PyMem_Free(buf1);
9411 return -1;
9412 }
9413 len1 = PyUnicode_GET_LENGTH(str);
9414 len2 = PyUnicode_GET_LENGTH(sub);
9415
9416 switch(kind) {
9417 case PyUnicode_1BYTE_KIND:
9418 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9419 break;
9420 case PyUnicode_2BYTE_KIND:
9421 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9422 break;
9423 case PyUnicode_4BYTE_KIND:
9424 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9425 break;
9426 default:
9427 result = -1;
9428 assert(0);
9429 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009430
9431 Py_DECREF(str);
9432 Py_DECREF(sub);
9433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 if (kind1 != kind)
9435 PyMem_Free(buf1);
9436 if (kind2 != kind)
9437 PyMem_Free(buf2);
9438
Guido van Rossum403d68b2000-03-13 15:55:09 +00009439 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009440}
9441
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442/* Concat to string or Unicode object giving a new Unicode object. */
9443
Alexander Belopolsky40018472011-02-26 01:02:56 +00009444PyObject *
9445PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 PyObject *u = NULL, *v = NULL, *w;
9448 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449
9450 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457
9458 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009464 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 }
9467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9469 goto onError;
9470
9471 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009472 maxchar = PY_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 w = PyUnicode_New(
9476 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9477 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009480 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9481 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009482 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009483 v, 0,
9484 PyUnicode_GET_LENGTH(v)) < 0)
9485 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486 Py_DECREF(u);
9487 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 Py_XDECREF(u);
9492 Py_XDECREF(v);
9493 return NULL;
9494}
9495
Walter Dörwald1ab83302007-05-18 17:15:44 +00009496void
9497PyUnicode_Append(PyObject **pleft, PyObject *right)
9498{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 PyObject *new;
9500 if (*pleft == NULL)
9501 return;
9502 if (right == NULL || !PyUnicode_Check(*pleft)) {
9503 Py_DECREF(*pleft);
9504 *pleft = NULL;
9505 return;
9506 }
9507 new = PyUnicode_Concat(*pleft, right);
9508 Py_DECREF(*pleft);
9509 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009510}
9511
9512void
9513PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9514{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009515 PyUnicode_Append(pleft, right);
9516 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009517}
9518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009519PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009522Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009523string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009524interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525
9526static PyObject *
9527unicode_count(PyUnicodeObject *self, PyObject *args)
9528{
9529 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009530 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009531 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 int kind1, kind2, kind;
9534 void *buf1, *buf2;
9535 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Jesus Ceaac451502011-04-20 17:09:23 +02009537 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9538 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 kind1 = PyUnicode_KIND(self);
9542 kind2 = PyUnicode_KIND(substring);
9543 kind = kind1 > kind2 ? kind1 : kind2;
9544 buf1 = PyUnicode_DATA(self);
9545 buf2 = PyUnicode_DATA(substring);
9546 if (kind1 != kind)
9547 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9548 if (!buf1) {
9549 Py_DECREF(substring);
9550 return NULL;
9551 }
9552 if (kind2 != kind)
9553 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9554 if (!buf2) {
9555 Py_DECREF(substring);
9556 if (kind1 != kind) PyMem_Free(buf1);
9557 return NULL;
9558 }
9559 len1 = PyUnicode_GET_LENGTH(self);
9560 len2 = PyUnicode_GET_LENGTH(substring);
9561
9562 ADJUST_INDICES(start, end, len1);
9563 switch(kind) {
9564 case PyUnicode_1BYTE_KIND:
9565 iresult = ucs1lib_count(
9566 ((Py_UCS1*)buf1) + start, end - start,
9567 buf2, len2, PY_SSIZE_T_MAX
9568 );
9569 break;
9570 case PyUnicode_2BYTE_KIND:
9571 iresult = ucs2lib_count(
9572 ((Py_UCS2*)buf1) + start, end - start,
9573 buf2, len2, PY_SSIZE_T_MAX
9574 );
9575 break;
9576 case PyUnicode_4BYTE_KIND:
9577 iresult = ucs4lib_count(
9578 ((Py_UCS4*)buf1) + start, end - start,
9579 buf2, len2, PY_SSIZE_T_MAX
9580 );
9581 break;
9582 default:
9583 assert(0); iresult = 0;
9584 }
9585
9586 result = PyLong_FromSsize_t(iresult);
9587
9588 if (kind1 != kind)
9589 PyMem_Free(buf1);
9590 if (kind2 != kind)
9591 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592
9593 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009594
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 return result;
9596}
9597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009598PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009599 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009601Encode S using the codec registered for encoding. Default encoding\n\
9602is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009603handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009604a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9605'xmlcharrefreplace' as well as any other name registered with\n\
9606codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607
9608static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009609unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009611 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612 char *encoding = NULL;
9613 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009614
Benjamin Peterson308d6372009-09-18 21:42:35 +00009615 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9616 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009618 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009619}
9620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009621PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623\n\
9624Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009625If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626
9627static PyObject*
9628unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9629{
9630 Py_UNICODE *e;
9631 Py_UNICODE *p;
9632 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009633 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 PyUnicodeObject *u;
9636 int tabsize = 8;
9637
9638 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9642 return NULL;
9643
Thomas Wouters7e474022000-07-16 12:04:32 +00009644 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009645 i = 0; /* chars up to and including most recent \n or \r */
9646 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9648 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 if (tabsize > 0) {
9651 incr = tabsize - (j % tabsize); /* cannot overflow */
9652 if (j > PY_SSIZE_T_MAX - incr)
9653 goto overflow1;
9654 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009655 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 if (j > PY_SSIZE_T_MAX - 1)
9659 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 j++;
9661 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 if (i > PY_SSIZE_T_MAX - j)
9663 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009665 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 }
9667 }
9668
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009669 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009671
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 /* Second pass: create output string and fill it */
9673 u = _PyUnicode_New(i + j);
9674 if (!u)
9675 return NULL;
9676
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009677 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 q = _PyUnicode_WSTR(u); /* next output char */
9679 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 if (tabsize > 0) {
9684 i = tabsize - (j % tabsize);
9685 j += i;
9686 while (i--) {
9687 if (q >= qe)
9688 goto overflow2;
9689 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009690 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009691 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 else {
9694 if (q >= qe)
9695 goto overflow2;
9696 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009697 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698 if (*p == '\n' || *p == '\r')
9699 j = 0;
9700 }
9701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (PyUnicode_READY(u) == -1) {
9703 Py_DECREF(u);
9704 return NULL;
9705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009707
9708 overflow2:
9709 Py_DECREF(u);
9710 overflow1:
9711 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713}
9714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009715PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717\n\
9718Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009719such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720arguments start and end are interpreted as in slice notation.\n\
9721\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009722Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723
9724static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726{
Jesus Ceaac451502011-04-20 17:09:23 +02009727 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009728 Py_ssize_t start;
9729 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009730 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731
Jesus Ceaac451502011-04-20 17:09:23 +02009732 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9733 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 if (PyUnicode_READY(self) == -1)
9737 return NULL;
9738 if (PyUnicode_READY(substring) == -1)
9739 return NULL;
9740
9741 result = any_find_slice(
9742 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9743 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009744 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745
9746 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 if (result == -2)
9749 return NULL;
9750
Christian Heimes217cfd12007-12-02 14:31:20 +00009751 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752}
9753
9754static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009755unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 Py_UCS4 ch;
9758
9759 if (PyUnicode_READY(self) == -1)
9760 return NULL;
9761 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 PyErr_SetString(PyExc_IndexError, "string index out of range");
9763 return NULL;
9764 }
9765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9767 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768}
9769
Guido van Rossumc2504932007-09-18 19:42:40 +00009770/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009771 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009772static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009773unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774{
Guido van Rossumc2504932007-09-18 19:42:40 +00009775 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009776 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 if (_PyUnicode_HASH(self) != -1)
9779 return _PyUnicode_HASH(self);
9780 if (PyUnicode_READY(self) == -1)
9781 return -1;
9782 len = PyUnicode_GET_LENGTH(self);
9783
9784 /* The hash function as a macro, gets expanded three times below. */
9785#define HASH(P) \
9786 x = (Py_uhash_t)*P << 7; \
9787 while (--len >= 0) \
9788 x = (1000003*x) ^ (Py_uhash_t)*P++;
9789
9790 switch (PyUnicode_KIND(self)) {
9791 case PyUnicode_1BYTE_KIND: {
9792 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9793 HASH(c);
9794 break;
9795 }
9796 case PyUnicode_2BYTE_KIND: {
9797 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9798 HASH(s);
9799 break;
9800 }
9801 default: {
9802 Py_UCS4 *l;
9803 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9804 "Impossible switch case in unicode_hash");
9805 l = PyUnicode_4BYTE_DATA(self);
9806 HASH(l);
9807 break;
9808 }
9809 }
9810 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9811
Guido van Rossumc2504932007-09-18 19:42:40 +00009812 if (x == -1)
9813 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009815 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009819PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009820 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009822Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823
9824static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009827 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009828 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009829 Py_ssize_t start;
9830 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Jesus Ceaac451502011-04-20 17:09:23 +02009832 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9833 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 if (PyUnicode_READY(self) == -1)
9837 return NULL;
9838 if (PyUnicode_READY(substring) == -1)
9839 return NULL;
9840
9841 result = any_find_slice(
9842 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9843 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009844 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845
9846 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 if (result == -2)
9849 return NULL;
9850
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851 if (result < 0) {
9852 PyErr_SetString(PyExc_ValueError, "substring not found");
9853 return NULL;
9854 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855
Christian Heimes217cfd12007-12-02 14:31:20 +00009856 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857}
9858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009859PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009862Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009863at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
9865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009866unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 Py_ssize_t i, length;
9869 int kind;
9870 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871 int cased;
9872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 if (PyUnicode_READY(self) == -1)
9874 return NULL;
9875 length = PyUnicode_GET_LENGTH(self);
9876 kind = PyUnicode_KIND(self);
9877 data = PyUnicode_DATA(self);
9878
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 if (length == 1)
9881 return PyBool_FromLong(
9882 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009884 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009887
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 for (i = 0; i < length; i++) {
9890 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009891
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9893 return PyBool_FromLong(0);
9894 else if (!cased && Py_UNICODE_ISLOWER(ch))
9895 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009897 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898}
9899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009900PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009903Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905
9906static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009907unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 Py_ssize_t i, length;
9910 int kind;
9911 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 int cased;
9913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (PyUnicode_READY(self) == -1)
9915 return NULL;
9916 length = PyUnicode_GET_LENGTH(self);
9917 kind = PyUnicode_KIND(self);
9918 data = PyUnicode_DATA(self);
9919
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (length == 1)
9922 return PyBool_FromLong(
9923 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009925 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009928
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 for (i = 0; i < length; i++) {
9931 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009932
Benjamin Peterson29060642009-01-31 22:14:21 +00009933 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9934 return PyBool_FromLong(0);
9935 else if (!cased && Py_UNICODE_ISUPPER(ch))
9936 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009938 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939}
9940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009941PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009944Return True if S is a titlecased string and there is at least one\n\
9945character in S, i.e. upper- and titlecase characters may only\n\
9946follow uncased characters and lowercase characters only cased ones.\n\
9947Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
9949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009950unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 Py_ssize_t i, length;
9953 int kind;
9954 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955 int cased, previous_is_cased;
9956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 if (PyUnicode_READY(self) == -1)
9958 return NULL;
9959 length = PyUnicode_GET_LENGTH(self);
9960 kind = PyUnicode_KIND(self);
9961 data = PyUnicode_DATA(self);
9962
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (length == 1) {
9965 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9966 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9967 (Py_UNICODE_ISUPPER(ch) != 0));
9968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009970 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009973
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974 cased = 0;
9975 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 for (i = 0; i < length; i++) {
9977 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009978
Benjamin Peterson29060642009-01-31 22:14:21 +00009979 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9980 if (previous_is_cased)
9981 return PyBool_FromLong(0);
9982 previous_is_cased = 1;
9983 cased = 1;
9984 }
9985 else if (Py_UNICODE_ISLOWER(ch)) {
9986 if (!previous_is_cased)
9987 return PyBool_FromLong(0);
9988 previous_is_cased = 1;
9989 cased = 1;
9990 }
9991 else
9992 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009994 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995}
9996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009997PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010000Return True if all characters in S are whitespace\n\
10001and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002
10003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010004unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 Py_ssize_t i, length;
10007 int kind;
10008 void *data;
10009
10010 if (PyUnicode_READY(self) == -1)
10011 return NULL;
10012 length = PyUnicode_GET_LENGTH(self);
10013 kind = PyUnicode_KIND(self);
10014 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (length == 1)
10018 return PyBool_FromLong(
10019 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010021 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010023 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 for (i = 0; i < length; i++) {
10026 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010027 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010030 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031}
10032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010033PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010035\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010036Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010037and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010038
10039static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010040unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 Py_ssize_t i, length;
10043 int kind;
10044 void *data;
10045
10046 if (PyUnicode_READY(self) == -1)
10047 return NULL;
10048 length = PyUnicode_GET_LENGTH(self);
10049 kind = PyUnicode_KIND(self);
10050 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010051
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010052 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (length == 1)
10054 return PyBool_FromLong(
10055 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010056
10057 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010059 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 for (i = 0; i < length; i++) {
10062 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010064 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010065 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010066}
10067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010068PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010070\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010071Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010072and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010073
10074static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010075unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 int kind;
10078 void *data;
10079 Py_ssize_t len, i;
10080
10081 if (PyUnicode_READY(self) == -1)
10082 return NULL;
10083
10084 kind = PyUnicode_KIND(self);
10085 data = PyUnicode_DATA(self);
10086 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010087
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 if (len == 1) {
10090 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10091 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10092 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010093
10094 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010096 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 for (i = 0; i < len; i++) {
10099 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010100 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010102 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010103 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104}
10105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010106PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010109Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010110False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
10112static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010113unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 Py_ssize_t i, length;
10116 int kind;
10117 void *data;
10118
10119 if (PyUnicode_READY(self) == -1)
10120 return NULL;
10121 length = PyUnicode_GET_LENGTH(self);
10122 kind = PyUnicode_KIND(self);
10123 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (length == 1)
10127 return PyBool_FromLong(
10128 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010130 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010132 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 for (i = 0; i < length; i++) {
10135 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010138 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139}
10140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010141PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010144Return True if all characters in S are digits\n\
10145and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
10147static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010148unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 Py_ssize_t i, length;
10151 int kind;
10152 void *data;
10153
10154 if (PyUnicode_READY(self) == -1)
10155 return NULL;
10156 length = PyUnicode_GET_LENGTH(self);
10157 kind = PyUnicode_KIND(self);
10158 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 if (length == 1) {
10162 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10163 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010166 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 for (i = 0; i < length; i++) {
10171 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010172 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010174 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175}
10176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010177PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010180Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010181False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
10183static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010184unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 Py_ssize_t i, length;
10187 int kind;
10188 void *data;
10189
10190 if (PyUnicode_READY(self) == -1)
10191 return NULL;
10192 length = PyUnicode_GET_LENGTH(self);
10193 kind = PyUnicode_KIND(self);
10194 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 if (length == 1)
10198 return PyBool_FromLong(
10199 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010201 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 for (i = 0; i < length; i++) {
10206 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010207 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010209 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210}
10211
Martin v. Löwis47383402007-08-15 07:32:56 +000010212int
10213PyUnicode_IsIdentifier(PyObject *self)
10214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 int kind;
10216 void *data;
10217 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010218 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (PyUnicode_READY(self) == -1) {
10221 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 }
10224
10225 /* Special case for empty strings */
10226 if (PyUnicode_GET_LENGTH(self) == 0)
10227 return 0;
10228 kind = PyUnicode_KIND(self);
10229 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010230
10231 /* PEP 3131 says that the first character must be in
10232 XID_Start and subsequent characters in XID_Continue,
10233 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010235 letters, digits, underscore). However, given the current
10236 definition of XID_Start and XID_Continue, it is sufficient
10237 to check just for these, except that _ must be allowed
10238 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010240 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010241 return 0;
10242
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010243 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010245 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010246 return 1;
10247}
10248
10249PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010251\n\
10252Return True if S is a valid identifier according\n\
10253to the language definition.");
10254
10255static PyObject*
10256unicode_isidentifier(PyObject *self)
10257{
10258 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10259}
10260
Georg Brandl559e5d72008-06-11 18:37:52 +000010261PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010263\n\
10264Return True if all characters in S are considered\n\
10265printable in repr() or S is empty, False otherwise.");
10266
10267static PyObject*
10268unicode_isprintable(PyObject *self)
10269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 Py_ssize_t i, length;
10271 int kind;
10272 void *data;
10273
10274 if (PyUnicode_READY(self) == -1)
10275 return NULL;
10276 length = PyUnicode_GET_LENGTH(self);
10277 kind = PyUnicode_KIND(self);
10278 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010279
10280 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (length == 1)
10282 return PyBool_FromLong(
10283 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 for (i = 0; i < length; i++) {
10286 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010287 Py_RETURN_FALSE;
10288 }
10289 }
10290 Py_RETURN_TRUE;
10291}
10292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010293PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010294 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295\n\
10296Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010297iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298
10299static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010300unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010302 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Martin v. Löwis18e16552006-02-15 17:27:45 +000010305static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306unicode_length(PyUnicodeObject *self)
10307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (PyUnicode_READY(self) == -1)
10309 return -1;
10310 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311}
10312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010313PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010314 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010316Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010317done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318
10319static PyObject *
10320unicode_ljust(PyUnicodeObject *self, PyObject *args)
10321{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010322 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 Py_UCS4 fillchar = ' ';
10324
10325 if (PyUnicode_READY(self) == -1)
10326 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010327
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010328 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329 return NULL;
10330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 Py_INCREF(self);
10333 return (PyObject*) self;
10334 }
10335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337}
10338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010339PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010340 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343
10344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010345unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 return fixup(self, fixlower);
10348}
10349
/* Selects which end(s) of the string a strip operation works on.  The
   values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, one per strip type; indexed by the constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Name of the method for strip type i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10358
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010359/* externally visible for str.strip(unicode) */
10360PyObject *
10361_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 void *data;
10364 int kind;
10365 Py_ssize_t i, j, len;
10366 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10369 return NULL;
10370
10371 kind = PyUnicode_KIND(self);
10372 data = PyUnicode_DATA(self);
10373 len = PyUnicode_GET_LENGTH(self);
10374 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10375 PyUnicode_DATA(sepobj),
10376 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010377
Benjamin Peterson14339b62009-01-31 16:36:08 +000010378 i = 0;
10379 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 while (i < len &&
10381 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010382 i++;
10383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010384 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010385
Benjamin Peterson14339b62009-01-31 16:36:08 +000010386 j = len;
10387 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 do {
10389 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 } while (j >= i &&
10391 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010393 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010394
Benjamin Peterson14339b62009-01-31 16:36:08 +000010395 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 Py_INCREF(self);
10397 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010398 }
10399 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010401}
10402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403/* Assumes an already ready self string. */
10404
10405static PyObject *
10406substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10407{
10408 const int kind = PyUnicode_KIND(self);
10409 void *data = PyUnicode_DATA(self);
10410 Py_UCS4 maxchar = 0;
10411 Py_ssize_t i;
10412 PyObject *unicode;
10413
10414 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10415 PyErr_BadInternalCall();
10416 return NULL;
10417 }
10418
10419 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10420 Py_INCREF(self);
10421 return (PyObject*)self;
10422 }
10423
10424 for (i = 0; i < len; ++i) {
10425 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10426 if (ch > maxchar)
10427 maxchar = ch;
10428 }
10429
10430 unicode = PyUnicode_New(len, maxchar);
10431 if (unicode == NULL)
10432 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010433 if (PyUnicode_CopyCharacters(unicode, 0,
10434 (PyObject*)self, start, len) < 0)
10435 {
10436 Py_DECREF(unicode);
10437 return NULL;
10438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 return unicode;
10440}
10441
10442PyObject*
10443PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10444{
10445 unsigned char *data;
10446 int kind;
10447
10448 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10449 && PyUnicode_CheckExact(self))
10450 {
10451 Py_INCREF(self);
10452 return (PyObject *)self;
10453 }
10454
10455 if ((end - start) == 1)
10456 return unicode_getitem((PyUnicodeObject*)self, start);
10457
10458 if (PyUnicode_READY(self) == -1)
10459 return NULL;
10460 kind = PyUnicode_KIND(self);
10461 data = PyUnicode_1BYTE_DATA(self);
10462 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10463 end-start);
10464}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465
10466static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010467do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 int kind;
10470 void *data;
10471 Py_ssize_t len, i, j;
10472
10473 if (PyUnicode_READY(self) == -1)
10474 return NULL;
10475
10476 kind = PyUnicode_KIND(self);
10477 data = PyUnicode_DATA(self);
10478 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010479
Benjamin Peterson14339b62009-01-31 16:36:08 +000010480 i = 0;
10481 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010483 i++;
10484 }
10485 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010486
Benjamin Peterson14339b62009-01-31 16:36:08 +000010487 j = len;
10488 if (striptype != LEFTSTRIP) {
10489 do {
10490 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010492 j++;
10493 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010494
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10496 Py_INCREF(self);
10497 return (PyObject*)self;
10498 }
10499 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501}
10502
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010503
10504static PyObject *
10505do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10506{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010507 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010508
Benjamin Peterson14339b62009-01-31 16:36:08 +000010509 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10510 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010511
Benjamin Peterson14339b62009-01-31 16:36:08 +000010512 if (sep != NULL && sep != Py_None) {
10513 if (PyUnicode_Check(sep))
10514 return _PyUnicode_XStrip(self, striptype, sep);
10515 else {
10516 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010517 "%s arg must be None or str",
10518 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010519 return NULL;
10520 }
10521 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010522
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524}
10525
10526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010527PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529\n\
10530Return a copy of the string S with leading and trailing\n\
10531whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010532If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010533
10534static PyObject *
10535unicode_strip(PyUnicodeObject *self, PyObject *args)
10536{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010537 if (PyTuple_GET_SIZE(args) == 0)
10538 return do_strip(self, BOTHSTRIP); /* Common case */
10539 else
10540 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010541}
10542
10543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010544PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010546\n\
10547Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010548If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010549
10550static PyObject *
10551unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10552{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010553 if (PyTuple_GET_SIZE(args) == 0)
10554 return do_strip(self, LEFTSTRIP); /* Common case */
10555 else
10556 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557}
10558
10559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562\n\
10563Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010564If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565
10566static PyObject *
10567unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10568{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 if (PyTuple_GET_SIZE(args) == 0)
10570 return do_strip(self, RIGHTSTRIP); /* Common case */
10571 else
10572 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010573}
10574
10575
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010577unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578{
10579 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_ssize_t nchars, n;
10581 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
Georg Brandl222de0f2009-04-12 12:01:50 +000010583 if (len < 1) {
10584 Py_INCREF(unicode_empty);
10585 return (PyObject *)unicode_empty;
10586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587
Tim Peters7a29bd52001-09-12 03:03:31 +000010588 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 /* no repeat, return original string */
10590 Py_INCREF(str);
10591 return (PyObject*) str;
10592 }
Tim Peters8f422462000-09-09 06:13:41 +000010593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (PyUnicode_READY(str) == -1)
10595 return NULL;
10596
Tim Peters8f422462000-09-09 06:13:41 +000010597 /* ensure # of chars needed doesn't overflow int and # of bytes
10598 * needed doesn't overflow size_t
10599 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 nchars = len * PyUnicode_GET_LENGTH(str);
10601 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010602 PyErr_SetString(PyExc_OverflowError,
10603 "repeated string is too long");
10604 return NULL;
10605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 char_size = PyUnicode_CHARACTER_SIZE(str);
10607 nbytes = (nchars + 1) * char_size;
10608 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010609 PyErr_SetString(PyExc_OverflowError,
10610 "repeated string is too long");
10611 return NULL;
10612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (!u)
10615 return NULL;
10616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (PyUnicode_GET_LENGTH(str) == 1) {
10618 const int kind = PyUnicode_KIND(str);
10619 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10620 void *to = PyUnicode_DATA(u);
10621 for (n = 0; n < len; ++n)
10622 PyUnicode_WRITE(kind, to, n, fill_char);
10623 }
10624 else {
10625 /* number of characters copied this far */
10626 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10627 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10628 char *to = (char *) PyUnicode_DATA(u);
10629 Py_MEMCPY(to, PyUnicode_DATA(str),
10630 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 n = (done <= nchars-done) ? done : nchars-done;
10633 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636 }
10637
10638 return (PyObject*) u;
10639}
10640
Alexander Belopolsky40018472011-02-26 01:02:56 +000010641PyObject *
10642PyUnicode_Replace(PyObject *obj,
10643 PyObject *subobj,
10644 PyObject *replobj,
10645 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646{
10647 PyObject *self;
10648 PyObject *str1;
10649 PyObject *str2;
10650 PyObject *result;
10651
10652 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010657 Py_DECREF(self);
10658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 }
10660 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 Py_DECREF(self);
10663 Py_DECREF(str1);
10664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667 Py_DECREF(self);
10668 Py_DECREF(str1);
10669 Py_DECREF(str2);
10670 return result;
10671}
10672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010673PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010674 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675\n\
10676Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010677old replaced by new. If the optional argument count is\n\
10678given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
10680static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 PyObject *str1;
10684 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010685 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 PyObject *result;
10687
Martin v. Löwis18e16552006-02-15 17:27:45 +000010688 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 str1 = PyUnicode_FromObject(str1);
10693 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10694 return NULL;
10695 str2 = PyUnicode_FromObject(str2);
10696 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010697 Py_DECREF(str1);
10698 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700
10701 result = replace(self, str1, str2, maxcount);
10702
10703 Py_DECREF(str1);
10704 Py_DECREF(str2);
10705 return result;
10706}
10707
Alexander Belopolsky40018472011-02-26 01:02:56 +000010708static PyObject *
10709unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010711 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 Py_ssize_t isize;
10713 Py_ssize_t osize, squote, dquote, i, o;
10714 Py_UCS4 max, quote;
10715 int ikind, okind;
10716 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010719 return NULL;
10720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 isize = PyUnicode_GET_LENGTH(unicode);
10722 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 /* Compute length of output, quote characters, and
10725 maximum character */
10726 osize = 2; /* quotes */
10727 max = 127;
10728 squote = dquote = 0;
10729 ikind = PyUnicode_KIND(unicode);
10730 for (i = 0; i < isize; i++) {
10731 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10732 switch (ch) {
10733 case '\'': squote++; osize++; break;
10734 case '"': dquote++; osize++; break;
10735 case '\\': case '\t': case '\r': case '\n':
10736 osize += 2; break;
10737 default:
10738 /* Fast-path ASCII */
10739 if (ch < ' ' || ch == 0x7f)
10740 osize += 4; /* \xHH */
10741 else if (ch < 0x7f)
10742 osize++;
10743 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10744 osize++;
10745 max = ch > max ? ch : max;
10746 }
10747 else if (ch < 0x100)
10748 osize += 4; /* \xHH */
10749 else if (ch < 0x10000)
10750 osize += 6; /* \uHHHH */
10751 else
10752 osize += 10; /* \uHHHHHHHH */
10753 }
10754 }
10755
10756 quote = '\'';
10757 if (squote) {
10758 if (dquote)
10759 /* Both squote and dquote present. Use squote,
10760 and escape them */
10761 osize += squote;
10762 else
10763 quote = '"';
10764 }
10765
10766 repr = PyUnicode_New(osize, max);
10767 if (repr == NULL)
10768 return NULL;
10769 okind = PyUnicode_KIND(repr);
10770 odata = PyUnicode_DATA(repr);
10771
10772 PyUnicode_WRITE(okind, odata, 0, quote);
10773 PyUnicode_WRITE(okind, odata, osize-1, quote);
10774
10775 for (i = 0, o = 1; i < isize; i++) {
10776 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010777
10778 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if ((ch == quote) || (ch == '\\')) {
10780 PyUnicode_WRITE(okind, odata, o++, '\\');
10781 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010782 continue;
10783 }
10784
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010786 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 PyUnicode_WRITE(okind, odata, o++, '\\');
10788 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010789 }
10790 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 PyUnicode_WRITE(okind, odata, o++, '\\');
10792 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010793 }
10794 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 PyUnicode_WRITE(okind, odata, o++, '\\');
10796 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010797 }
10798
10799 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010800 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 PyUnicode_WRITE(okind, odata, o++, '\\');
10802 PyUnicode_WRITE(okind, odata, o++, 'x');
10803 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10804 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010805 }
10806
Georg Brandl559e5d72008-06-11 18:37:52 +000010807 /* Copy ASCII characters as-is */
10808 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010810 }
10811
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010813 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010814 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 (categories Z* and C* except ASCII space)
10816 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010818 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (ch <= 0xff) {
10820 PyUnicode_WRITE(okind, odata, o++, '\\');
10821 PyUnicode_WRITE(okind, odata, o++, 'x');
10822 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10823 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010824 }
10825 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 else if (ch >= 0x10000) {
10827 PyUnicode_WRITE(okind, odata, o++, '\\');
10828 PyUnicode_WRITE(okind, odata, o++, 'U');
10829 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10830 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10831 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10832 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010837 }
10838 /* Map 16-bit characters to '\uxxxx' */
10839 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 PyUnicode_WRITE(okind, odata, o++, '\\');
10841 PyUnicode_WRITE(okind, odata, o++, 'u');
10842 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10843 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010846 }
10847 }
10848 /* Copy characters as-is */
10849 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010851 }
10852 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010855 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856}
10857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010858PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860\n\
10861Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010862such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863arguments start and end are interpreted as in slice notation.\n\
10864\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010865Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
10867static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869{
Jesus Ceaac451502011-04-20 17:09:23 +020010870 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010871 Py_ssize_t start;
10872 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010873 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
Jesus Ceaac451502011-04-20 17:09:23 +020010875 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10876 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 if (PyUnicode_READY(self) == -1)
10880 return NULL;
10881 if (PyUnicode_READY(substring) == -1)
10882 return NULL;
10883
10884 result = any_find_slice(
10885 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10886 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
10889 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (result == -2)
10892 return NULL;
10893
Christian Heimes217cfd12007-12-02 14:31:20 +000010894 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895}
10896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010897PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010900Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
10902static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904{
Jesus Ceaac451502011-04-20 17:09:23 +020010905 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010906 Py_ssize_t start;
10907 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
Jesus Ceaac451502011-04-20 17:09:23 +020010910 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10911 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (PyUnicode_READY(self) == -1)
10915 return NULL;
10916 if (PyUnicode_READY(substring) == -1)
10917 return NULL;
10918
10919 result = any_find_slice(
10920 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10921 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
10924 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (result == -2)
10927 return NULL;
10928
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929 if (result < 0) {
10930 PyErr_SetString(PyExc_ValueError, "substring not found");
10931 return NULL;
10932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933
Christian Heimes217cfd12007-12-02 14:31:20 +000010934 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935}
10936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010937PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010940Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010941done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
10943static PyObject *
10944unicode_rjust(PyUnicodeObject *self, PyObject *args)
10945{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010946 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 Py_UCS4 fillchar = ' ';
10948
10949 if (PyUnicode_READY(self) == -1)
10950 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010951
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010952 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 return NULL;
10954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 Py_INCREF(self);
10957 return (PyObject*) self;
10958 }
10959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961}
10962
Alexander Belopolsky40018472011-02-26 01:02:56 +000010963PyObject *
10964PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965{
10966 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010967
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 s = PyUnicode_FromObject(s);
10969 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 if (sep != NULL) {
10972 sep = PyUnicode_FromObject(sep);
10973 if (sep == NULL) {
10974 Py_DECREF(s);
10975 return NULL;
10976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 }
10978
10979 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10980
10981 Py_DECREF(s);
10982 Py_XDECREF(sep);
10983 return result;
10984}
10985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010986PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988\n\
10989Return a list of the words in S, using sep as the\n\
10990delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010991splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010992whitespace string is a separator and empty strings are\n\
10993removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994
10995static PyObject*
10996unicode_split(PyUnicodeObject *self, PyObject *args)
10997{
10998 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010999 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
Martin v. Löwis18e16552006-02-15 17:27:45 +000011001 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 return NULL;
11003
11004 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010}
11011
Thomas Wouters477c8d52006-05-27 19:21:47 +000011012PyObject *
11013PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11014{
11015 PyObject* str_obj;
11016 PyObject* sep_obj;
11017 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 int kind1, kind2, kind;
11019 void *buf1 = NULL, *buf2 = NULL;
11020 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011021
11022 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011025 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027 Py_DECREF(str_obj);
11028 return NULL;
11029 }
11030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 kind1 = PyUnicode_KIND(str_in);
11032 kind2 = PyUnicode_KIND(sep_obj);
11033 kind = kind1 > kind2 ? kind1 : kind2;
11034 buf1 = PyUnicode_DATA(str_in);
11035 if (kind1 != kind)
11036 buf1 = _PyUnicode_AsKind(str_in, kind);
11037 if (!buf1)
11038 goto onError;
11039 buf2 = PyUnicode_DATA(sep_obj);
11040 if (kind2 != kind)
11041 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11042 if (!buf2)
11043 goto onError;
11044 len1 = PyUnicode_GET_LENGTH(str_obj);
11045 len2 = PyUnicode_GET_LENGTH(sep_obj);
11046
11047 switch(PyUnicode_KIND(str_in)) {
11048 case PyUnicode_1BYTE_KIND:
11049 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11050 break;
11051 case PyUnicode_2BYTE_KIND:
11052 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11053 break;
11054 case PyUnicode_4BYTE_KIND:
11055 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11056 break;
11057 default:
11058 assert(0);
11059 out = 0;
11060 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011061
11062 Py_DECREF(sep_obj);
11063 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (kind1 != kind)
11065 PyMem_Free(buf1);
11066 if (kind2 != kind)
11067 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011068
11069 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 onError:
11071 Py_DECREF(sep_obj);
11072 Py_DECREF(str_obj);
11073 if (kind1 != kind && buf1)
11074 PyMem_Free(buf1);
11075 if (kind2 != kind && buf2)
11076 PyMem_Free(buf2);
11077 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011078}
11079
11080
11081PyObject *
11082PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11083{
11084 PyObject* str_obj;
11085 PyObject* sep_obj;
11086 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 int kind1, kind2, kind;
11088 void *buf1 = NULL, *buf2 = NULL;
11089 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
11091 str_obj = PyUnicode_FromObject(str_in);
11092 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094 sep_obj = PyUnicode_FromObject(sep_in);
11095 if (!sep_obj) {
11096 Py_DECREF(str_obj);
11097 return NULL;
11098 }
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 kind1 = PyUnicode_KIND(str_in);
11101 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011102 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 buf1 = PyUnicode_DATA(str_in);
11104 if (kind1 != kind)
11105 buf1 = _PyUnicode_AsKind(str_in, kind);
11106 if (!buf1)
11107 goto onError;
11108 buf2 = PyUnicode_DATA(sep_obj);
11109 if (kind2 != kind)
11110 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11111 if (!buf2)
11112 goto onError;
11113 len1 = PyUnicode_GET_LENGTH(str_obj);
11114 len2 = PyUnicode_GET_LENGTH(sep_obj);
11115
11116 switch(PyUnicode_KIND(str_in)) {
11117 case PyUnicode_1BYTE_KIND:
11118 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11119 break;
11120 case PyUnicode_2BYTE_KIND:
11121 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11122 break;
11123 case PyUnicode_4BYTE_KIND:
11124 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11125 break;
11126 default:
11127 assert(0);
11128 out = 0;
11129 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011130
11131 Py_DECREF(sep_obj);
11132 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 if (kind1 != kind)
11134 PyMem_Free(buf1);
11135 if (kind2 != kind)
11136 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011137
11138 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 onError:
11140 Py_DECREF(sep_obj);
11141 Py_DECREF(str_obj);
11142 if (kind1 != kind && buf1)
11143 PyMem_Free(buf1);
11144 if (kind2 != kind && buf2)
11145 PyMem_Free(buf2);
11146 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147}
11148
11149PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011151\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011152Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011154found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155
11156static PyObject*
11157unicode_partition(PyUnicodeObject *self, PyObject *separator)
11158{
11159 return PyUnicode_Partition((PyObject *)self, separator);
11160}
11161
11162PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011163 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011164\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011165Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011167separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168
11169static PyObject*
11170unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11171{
11172 return PyUnicode_RPartition((PyObject *)self, separator);
11173}
11174
Alexander Belopolsky40018472011-02-26 01:02:56 +000011175PyObject *
11176PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011177{
11178 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011179
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011180 s = PyUnicode_FromObject(s);
11181 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 if (sep != NULL) {
11184 sep = PyUnicode_FromObject(sep);
11185 if (sep == NULL) {
11186 Py_DECREF(s);
11187 return NULL;
11188 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011189 }
11190
11191 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11192
11193 Py_DECREF(s);
11194 Py_XDECREF(sep);
11195 return result;
11196}
11197
11198PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011200\n\
11201Return a list of the words in S, using sep as the\n\
11202delimiter string, starting at the end of the string and\n\
11203working to the front. If maxsplit is given, at most maxsplit\n\
11204splits are done. If sep is not specified, any whitespace string\n\
11205is a separator.");
11206
11207static PyObject*
11208unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11209{
11210 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011211 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011212
Martin v. Löwis18e16552006-02-15 17:27:45 +000011213 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011214 return NULL;
11215
11216 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011218 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011220 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011222}
11223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011224PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226\n\
11227Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011228Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011232unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011234 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011235 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011237 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11238 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 return NULL;
11240
Guido van Rossum86662912000-04-11 15:38:46 +000011241 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242}
11243
11244static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011245PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246{
Walter Dörwald346737f2007-05-31 10:44:43 +000011247 if (PyUnicode_CheckExact(self)) {
11248 Py_INCREF(self);
11249 return self;
11250 } else
11251 /* Subtype -- return genuine unicode string with the same value. */
11252 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11253 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254}
11255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011256PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258\n\
11259Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011263unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265 return fixup(self, fixswapcase);
11266}
11267
Georg Brandlceee0772007-11-27 23:48:05 +000011268PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011270\n\
11271Return a translation table usable for str.translate().\n\
11272If there is only one argument, it must be a dictionary mapping Unicode\n\
11273ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011274Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011275If there are two arguments, they must be strings of equal length, and\n\
11276in the resulting dictionary, each character in x will be mapped to the\n\
11277character at the same position in y. If there is a third argument, it\n\
11278must be a string, whose characters will be mapped to None in the result.");
11279
11280static PyObject*
11281unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11282{
11283 PyObject *x, *y = NULL, *z = NULL;
11284 PyObject *new = NULL, *key, *value;
11285 Py_ssize_t i = 0;
11286 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011287
Georg Brandlceee0772007-11-27 23:48:05 +000011288 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11289 return NULL;
11290 new = PyDict_New();
11291 if (!new)
11292 return NULL;
11293 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 int x_kind, y_kind, z_kind;
11295 void *x_data, *y_data, *z_data;
11296
Georg Brandlceee0772007-11-27 23:48:05 +000011297 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011298 if (!PyUnicode_Check(x)) {
11299 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11300 "be a string if there is a second argument");
11301 goto err;
11302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011304 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11305 "arguments must have equal length");
11306 goto err;
11307 }
11308 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 x_kind = PyUnicode_KIND(x);
11310 y_kind = PyUnicode_KIND(y);
11311 x_data = PyUnicode_DATA(x);
11312 y_data = PyUnicode_DATA(y);
11313 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11314 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11315 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011316 if (!key || !value)
11317 goto err;
11318 res = PyDict_SetItem(new, key, value);
11319 Py_DECREF(key);
11320 Py_DECREF(value);
11321 if (res < 0)
11322 goto err;
11323 }
11324 /* create entries for deleting chars in z */
11325 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 z_kind = PyUnicode_KIND(z);
11327 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011328 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011330 if (!key)
11331 goto err;
11332 res = PyDict_SetItem(new, key, Py_None);
11333 Py_DECREF(key);
11334 if (res < 0)
11335 goto err;
11336 }
11337 }
11338 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 int kind;
11340 void *data;
11341
Georg Brandlceee0772007-11-27 23:48:05 +000011342 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011343 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011344 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11345 "to maketrans it must be a dict");
11346 goto err;
11347 }
11348 /* copy entries into the new dict, converting string keys to int keys */
11349 while (PyDict_Next(x, &i, &key, &value)) {
11350 if (PyUnicode_Check(key)) {
11351 /* convert string keys to integer keys */
11352 PyObject *newkey;
11353 if (PyUnicode_GET_SIZE(key) != 1) {
11354 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11355 "table must be of length 1");
11356 goto err;
11357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 kind = PyUnicode_KIND(key);
11359 data = PyUnicode_DATA(key);
11360 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011361 if (!newkey)
11362 goto err;
11363 res = PyDict_SetItem(new, newkey, value);
11364 Py_DECREF(newkey);
11365 if (res < 0)
11366 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011367 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011368 /* just keep integer keys */
11369 if (PyDict_SetItem(new, key, value) < 0)
11370 goto err;
11371 } else {
11372 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11373 "be strings or integers");
11374 goto err;
11375 }
11376 }
11377 }
11378 return new;
11379 err:
11380 Py_DECREF(new);
11381 return NULL;
11382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
11387Return a copy of the string S, where all characters have been mapped\n\
11388through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011389Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011390Unmapped characters are left untouched. Characters mapped to None\n\
11391are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
11393static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
11404static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011405unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 return fixup(self, fixupper);
11408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011413Pad a numeric string S with zeros on the left, to fill a field\n\
11414of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416static PyObject *
11417unicode_zfill(PyUnicodeObject *self, PyObject *args)
11418{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011419 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011421 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 int kind;
11423 void *data;
11424 Py_UCS4 chr;
11425
11426 if (PyUnicode_READY(self) == -1)
11427 return NULL;
11428
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430 return NULL;
11431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011433 if (PyUnicode_CheckExact(self)) {
11434 Py_INCREF(self);
11435 return (PyObject*) self;
11436 }
11437 else
11438 return PyUnicode_FromUnicode(
11439 PyUnicode_AS_UNICODE(self),
11440 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 }
11443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
11446 u = pad(self, fill, 0, '0');
11447
Walter Dörwald068325e2002-04-15 13:36:47 +000011448 if (u == NULL)
11449 return NULL;
11450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 kind = PyUnicode_KIND(u);
11452 data = PyUnicode_DATA(u);
11453 chr = PyUnicode_READ(kind, data, fill);
11454
11455 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 PyUnicode_WRITE(kind, data, 0, chr);
11458 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 }
11460
11461 return (PyObject*) u;
11462}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011475Return True if S starts with the specified prefix, False otherwise.\n\
11476With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011477With optional end, stop comparing S at that position.\n\
11478prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
11480static PyObject *
11481unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011484 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011486 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011487 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011488 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
Jesus Ceaac451502011-04-20 17:09:23 +020011490 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011492 if (PyTuple_Check(subobj)) {
11493 Py_ssize_t i;
11494 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11495 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011497 if (substring == NULL)
11498 return NULL;
11499 result = tailmatch(self, substring, start, end, -1);
11500 Py_DECREF(substring);
11501 if (result) {
11502 Py_RETURN_TRUE;
11503 }
11504 }
11505 /* nothing matched */
11506 Py_RETURN_FALSE;
11507 }
11508 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011509 if (substring == NULL) {
11510 if (PyErr_ExceptionMatches(PyExc_TypeError))
11511 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11512 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011514 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011517 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518}
11519
11520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011524Return True if S ends with the specified suffix, False otherwise.\n\
11525With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011526With optional end, stop comparing S at that position.\n\
11527suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
11529static PyObject *
11530unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011533 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011535 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011536 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011537 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
Jesus Ceaac451502011-04-20 17:09:23 +020011539 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011541 if (PyTuple_Check(subobj)) {
11542 Py_ssize_t i;
11543 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11544 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011546 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011548 result = tailmatch(self, substring, start, end, +1);
11549 Py_DECREF(substring);
11550 if (result) {
11551 Py_RETURN_TRUE;
11552 }
11553 }
11554 Py_RETURN_FALSE;
11555 }
11556 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011557 if (substring == NULL) {
11558 if (PyErr_ExceptionMatches(PyExc_TypeError))
11559 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11560 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011562 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011563 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011565 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566}
11567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011569
11570PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011572\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011573Return a formatted version of S, using substitutions from args and kwargs.\n\
11574The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011575
Eric Smith27bbca62010-11-04 17:06:58 +000011576PyDoc_STRVAR(format_map__doc__,
11577 "S.format_map(mapping) -> str\n\
11578\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011579Return a formatted version of S, using substitutions from mapping.\n\
11580The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011581
Eric Smith4a7d76d2008-05-30 18:10:19 +000011582static PyObject *
11583unicode__format__(PyObject* self, PyObject* args)
11584{
11585 PyObject *format_spec;
11586
11587 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11588 return NULL;
11589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11591 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011592}
11593
Eric Smith8c663262007-08-25 02:26:07 +000011594PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011596\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011597Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011598
11599static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011600unicode__sizeof__(PyUnicodeObject *v)
11601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 Py_ssize_t size;
11603
11604 /* If it's a compact object, account for base structure +
11605 character data. */
11606 if (PyUnicode_IS_COMPACT_ASCII(v))
11607 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11608 else if (PyUnicode_IS_COMPACT(v))
11609 size = sizeof(PyCompactUnicodeObject) +
11610 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11611 else {
11612 /* If it is a two-block object, account for base object, and
11613 for character block if present. */
11614 size = sizeof(PyUnicodeObject);
11615 if (v->data.any)
11616 size += (PyUnicode_GET_LENGTH(v) + 1) *
11617 PyUnicode_CHARACTER_SIZE(v);
11618 }
11619 /* If the wstr pointer is present, account for it unless it is shared
11620 with the data pointer. Since PyUnicode_DATA will crash if the object
11621 is not ready, check whether it's either not ready (in which case the
11622 data is entirely in wstr) or if the data is not shared. */
11623 if (_PyUnicode_WSTR(v) &&
11624 (!PyUnicode_IS_READY(v) ||
11625 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11626 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11627 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11628 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11629
11630 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011631}
11632
11633PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011635
11636static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011637unicode_getnewargs(PyUnicodeObject *v)
11638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 PyObject *copy;
11640 unsigned char *data;
11641 int kind;
11642 if (PyUnicode_READY(v) == -1)
11643 return NULL;
11644 kind = PyUnicode_KIND(v);
11645 data = PyUnicode_1BYTE_DATA(v);
11646 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11647 if (!copy)
11648 return NULL;
11649 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011650}
11651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652static PyMethodDef unicode_methods[] = {
11653
11654 /* Order is according to common usage: often used methods should
11655 appear first, since lookup is done sequentially. */
11656
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011657 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011658 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11659 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011660 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011661 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11662 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11663 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11664 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11665 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11666 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11667 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011668 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011669 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11670 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11671 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011673 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11674 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11675 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011677 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011678 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011680 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11681 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11682 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11683 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11684 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11685 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11686 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11687 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11688 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11689 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11690 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11691 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11692 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11693 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011694 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011695 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011696 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011697 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011698 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011699 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011700 {"maketrans", (PyCFunction) unicode_maketrans,
11701 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011702 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011703#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011704 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705#endif
11706
11707#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011708 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011709 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710#endif
11711
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 {NULL, NULL}
11714};
11715
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011716static PyObject *
11717unicode_mod(PyObject *v, PyObject *w)
11718{
Brian Curtindfc80e32011-08-10 20:28:54 -050011719 if (!PyUnicode_Check(v))
11720 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011722}
11723
11724static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 0, /*nb_add*/
11726 0, /*nb_subtract*/
11727 0, /*nb_multiply*/
11728 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011729};
11730
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 (lenfunc) unicode_length, /* sq_length */
11733 PyUnicode_Concat, /* sq_concat */
11734 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11735 (ssizeargfunc) unicode_getitem, /* sq_item */
11736 0, /* sq_slice */
11737 0, /* sq_ass_item */
11738 0, /* sq_ass_slice */
11739 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740};
11741
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011742static PyObject*
11743unicode_subscript(PyUnicodeObject* self, PyObject* item)
11744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (PyUnicode_READY(self) == -1)
11746 return NULL;
11747
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011748 if (PyIndex_Check(item)) {
11749 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011750 if (i == -1 && PyErr_Occurred())
11751 return NULL;
11752 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011754 return unicode_getitem(self, i);
11755 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011756 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011758 Py_UNICODE* result_buf;
11759 PyObject* result;
11760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011763 return NULL;
11764 }
11765
11766 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 return PyUnicode_New(0, 0);
11768 } else if (start == 0 && step == 1 &&
11769 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011770 PyUnicode_CheckExact(self)) {
11771 Py_INCREF(self);
11772 return (PyObject *)self;
11773 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011775 } else {
11776 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011777 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11778 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011779
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 if (result_buf == NULL)
11781 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011782
11783 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11784 result_buf[i] = source_buf[cur];
11785 }
Tim Petersced69f82003-09-16 20:30:58 +000011786
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011787 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011788 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011789 return result;
11790 }
11791 } else {
11792 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11793 return NULL;
11794 }
11795}
11796
11797static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011798 (lenfunc)unicode_length, /* mp_length */
11799 (binaryfunc)unicode_subscript, /* mp_subscript */
11800 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011801};
11802
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804/* Helpers for PyUnicode_Format() */
11805
11806static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011807getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011809 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 (*p_argidx)++;
11812 if (arglen < 0)
11813 return args;
11814 else
11815 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
11817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 return NULL;
11820}
11821
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011822/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011824static PyObject *
11825formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011827 char *p;
11828 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011830
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 x = PyFloat_AsDouble(v);
11832 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011833 return NULL;
11834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011837
Eric Smith0923d1d2009-04-16 20:16:10 +000011838 p = PyOS_double_to_string(x, type, prec,
11839 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011840 if (p == NULL)
11841 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011843 PyMem_Free(p);
11844 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845}
11846
Tim Peters38fd5b62000-09-21 05:43:11 +000011847static PyObject*
11848formatlong(PyObject *val, int flags, int prec, int type)
11849{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011850 char *buf;
11851 int len;
11852 PyObject *str; /* temporary string object. */
11853 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011854
Benjamin Peterson14339b62009-01-31 16:36:08 +000011855 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11856 if (!str)
11857 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011859 Py_DECREF(str);
11860 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011861}
11862
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011865 size_t buflen,
11866 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011868 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011869 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 if (PyUnicode_GET_LENGTH(v) == 1) {
11871 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 buf[1] = '\0';
11873 return 1;
11874 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 goto onError;
11876 }
11877 else {
11878 /* Integer input truncated to a character */
11879 long x;
11880 x = PyLong_AsLong(v);
11881 if (x == -1 && PyErr_Occurred())
11882 goto onError;
11883
11884 if (x < 0 || x > 0x10ffff) {
11885 PyErr_SetString(PyExc_OverflowError,
11886 "%c arg not in range(0x110000)");
11887 return -1;
11888 }
11889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891 buf[1] = '\0';
11892 return 1;
11893 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011894
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011896 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011898 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011901/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011902 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011903*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011904#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011905
Alexander Belopolsky40018472011-02-26 01:02:56 +000011906PyObject *
11907PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 void *fmt;
11910 int fmtkind;
11911 PyObject *result;
11912 Py_UCS4 *res, *res0;
11913 Py_UCS4 max;
11914 int kind;
11915 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011919
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 PyErr_BadInternalCall();
11922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11925 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 fmt = PyUnicode_DATA(uformat);
11928 fmtkind = PyUnicode_KIND(uformat);
11929 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11930 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
11932 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11934 if (res0 == NULL) {
11935 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
11939 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 arglen = PyTuple_Size(args);
11941 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 }
11943 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 arglen = -1;
11945 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011947 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011948 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
11951 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 if (--rescnt < 0) {
11954 rescnt = fmtcnt + 100;
11955 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11957 if (res0 == NULL){
11958 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 }
11961 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 }
11966 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 /* Got a format specifier */
11968 int flags = 0;
11969 Py_ssize_t width = -1;
11970 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 Py_UCS4 c = '\0';
11972 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 int isnumok;
11974 PyObject *v = NULL;
11975 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 void *pbuf;
11977 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 Py_ssize_t len, len1;
11980 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 fmtpos++;
11983 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11984 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 Py_ssize_t keylen;
11986 PyObject *key;
11987 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011988
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 if (dict == NULL) {
11990 PyErr_SetString(PyExc_TypeError,
11991 "format requires a mapping");
11992 goto onError;
11993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 /* Skip over balanced parentheses */
11998 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 if (fmtcnt < 0 || pcount > 0) {
12007 PyErr_SetString(PyExc_ValueError,
12008 "incomplete format key");
12009 goto onError;
12010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 if (key == NULL)
12013 goto onError;
12014 if (args_owned) {
12015 Py_DECREF(args);
12016 args_owned = 0;
12017 }
12018 args = PyObject_GetItem(dict, key);
12019 Py_DECREF(key);
12020 if (args == NULL) {
12021 goto onError;
12022 }
12023 args_owned = 1;
12024 arglen = -1;
12025 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012026 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 case '-': flags |= F_LJUST; continue;
12030 case '+': flags |= F_SIGN; continue;
12031 case ' ': flags |= F_BLANK; continue;
12032 case '#': flags |= F_ALT; continue;
12033 case '0': flags |= F_ZERO; continue;
12034 }
12035 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012036 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012037 if (c == '*') {
12038 v = getnextarg(args, arglen, &argidx);
12039 if (v == NULL)
12040 goto onError;
12041 if (!PyLong_Check(v)) {
12042 PyErr_SetString(PyExc_TypeError,
12043 "* wants int");
12044 goto onError;
12045 }
12046 width = PyLong_AsLong(v);
12047 if (width == -1 && PyErr_Occurred())
12048 goto onError;
12049 if (width < 0) {
12050 flags |= F_LJUST;
12051 width = -width;
12052 }
12053 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 }
12056 else if (c >= '0' && c <= '9') {
12057 width = c - '0';
12058 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 if (c < '0' || c > '9')
12061 break;
12062 if ((width*10) / 10 != width) {
12063 PyErr_SetString(PyExc_ValueError,
12064 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012065 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 }
12067 width = width*10 + (c - '0');
12068 }
12069 }
12070 if (c == '.') {
12071 prec = 0;
12072 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 if (c == '*') {
12075 v = getnextarg(args, arglen, &argidx);
12076 if (v == NULL)
12077 goto onError;
12078 if (!PyLong_Check(v)) {
12079 PyErr_SetString(PyExc_TypeError,
12080 "* wants int");
12081 goto onError;
12082 }
12083 prec = PyLong_AsLong(v);
12084 if (prec == -1 && PyErr_Occurred())
12085 goto onError;
12086 if (prec < 0)
12087 prec = 0;
12088 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 }
12091 else if (c >= '0' && c <= '9') {
12092 prec = c - '0';
12093 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012095 if (c < '0' || c > '9')
12096 break;
12097 if ((prec*10) / 10 != prec) {
12098 PyErr_SetString(PyExc_ValueError,
12099 "prec too big");
12100 goto onError;
12101 }
12102 prec = prec*10 + (c - '0');
12103 }
12104 }
12105 } /* prec */
12106 if (fmtcnt >= 0) {
12107 if (c == 'h' || c == 'l' || c == 'L') {
12108 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 }
12111 }
12112 if (fmtcnt < 0) {
12113 PyErr_SetString(PyExc_ValueError,
12114 "incomplete format");
12115 goto onError;
12116 }
12117 if (c != '%') {
12118 v = getnextarg(args, arglen, &argidx);
12119 if (v == NULL)
12120 goto onError;
12121 }
12122 sign = 0;
12123 fill = ' ';
12124 switch (c) {
12125
12126 case '%':
12127 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 len = 1;
12132 break;
12133
12134 case 's':
12135 case 'r':
12136 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012137 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 temp = v;
12139 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 }
12141 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 if (c == 's')
12143 temp = PyObject_Str(v);
12144 else if (c == 'r')
12145 temp = PyObject_Repr(v);
12146 else
12147 temp = PyObject_ASCII(v);
12148 if (temp == NULL)
12149 goto onError;
12150 if (PyUnicode_Check(temp))
12151 /* nothing to do */;
12152 else {
12153 Py_DECREF(temp);
12154 PyErr_SetString(PyExc_TypeError,
12155 "%s argument has non-string str()");
12156 goto onError;
12157 }
12158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(temp) == -1) {
12160 Py_CLEAR(temp);
12161 goto onError;
12162 }
12163 pbuf = PyUnicode_DATA(temp);
12164 kind = PyUnicode_KIND(temp);
12165 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 if (prec >= 0 && len > prec)
12167 len = prec;
12168 break;
12169
12170 case 'i':
12171 case 'd':
12172 case 'u':
12173 case 'o':
12174 case 'x':
12175 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 isnumok = 0;
12177 if (PyNumber_Check(v)) {
12178 PyObject *iobj=NULL;
12179
12180 if (PyLong_Check(v)) {
12181 iobj = v;
12182 Py_INCREF(iobj);
12183 }
12184 else {
12185 iobj = PyNumber_Long(v);
12186 }
12187 if (iobj!=NULL) {
12188 if (PyLong_Check(iobj)) {
12189 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012190 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 Py_DECREF(iobj);
12192 if (!temp)
12193 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (PyUnicode_READY(temp) == -1) {
12195 Py_CLEAR(temp);
12196 goto onError;
12197 }
12198 pbuf = PyUnicode_DATA(temp);
12199 kind = PyUnicode_KIND(temp);
12200 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 sign = 1;
12202 }
12203 else {
12204 Py_DECREF(iobj);
12205 }
12206 }
12207 }
12208 if (!isnumok) {
12209 PyErr_Format(PyExc_TypeError,
12210 "%%%c format: a number is required, "
12211 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12212 goto onError;
12213 }
12214 if (flags & F_ZERO)
12215 fill = '0';
12216 break;
12217
12218 case 'e':
12219 case 'E':
12220 case 'f':
12221 case 'F':
12222 case 'g':
12223 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012224 temp = formatfloat(v, flags, prec, c);
12225 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 if (PyUnicode_READY(temp) == -1) {
12228 Py_CLEAR(temp);
12229 goto onError;
12230 }
12231 pbuf = PyUnicode_DATA(temp);
12232 kind = PyUnicode_KIND(temp);
12233 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 sign = 1;
12235 if (flags & F_ZERO)
12236 fill = '0';
12237 break;
12238
12239 case 'c':
12240 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12243 if (len < 0)
12244 goto onError;
12245 break;
12246
12247 default:
12248 PyErr_Format(PyExc_ValueError,
12249 "unsupported format character '%c' (0x%x) "
12250 "at index %zd",
12251 (31<=c && c<=126) ? (char)c : '?',
12252 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 goto onError;
12255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 /* pbuf is initialized here. */
12257 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12260 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12261 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 len--;
12263 }
12264 else if (flags & F_SIGN)
12265 sign = '+';
12266 else if (flags & F_BLANK)
12267 sign = ' ';
12268 else
12269 sign = 0;
12270 }
12271 if (width < len)
12272 width = len;
12273 if (rescnt - (sign != 0) < width) {
12274 reslen -= rescnt;
12275 rescnt = width + fmtcnt + 100;
12276 reslen += rescnt;
12277 if (reslen < 0) {
12278 Py_XDECREF(temp);
12279 PyErr_NoMemory();
12280 goto onError;
12281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12283 if (res0 == 0) {
12284 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 Py_XDECREF(temp);
12286 goto onError;
12287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 }
12290 if (sign) {
12291 if (fill != ' ')
12292 *res++ = sign;
12293 rescnt--;
12294 if (width > len)
12295 width--;
12296 }
12297 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12299 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12302 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 }
12304 rescnt -= 2;
12305 width -= 2;
12306 if (width < 0)
12307 width = 0;
12308 len -= 2;
12309 }
12310 if (width > len && !(flags & F_LJUST)) {
12311 do {
12312 --rescnt;
12313 *res++ = fill;
12314 } while (--width > len);
12315 }
12316 if (fill == ' ') {
12317 if (sign)
12318 *res++ = sign;
12319 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12321 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12322 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12323 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 }
12325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 /* Copy all characters, preserving len */
12327 len1 = len;
12328 while (len1--) {
12329 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12330 rescnt--;
12331 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 while (--width >= len) {
12333 --rescnt;
12334 *res++ = ' ';
12335 }
12336 if (dict && (argidx < arglen) && c != '%') {
12337 PyErr_SetString(PyExc_TypeError,
12338 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012339 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 goto onError;
12341 }
12342 Py_XDECREF(temp);
12343 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344 } /* until end */
12345 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 PyErr_SetString(PyExc_TypeError,
12347 "not all arguments converted during string formatting");
12348 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349 }
12350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351
12352 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12353 if (*res > max)
12354 max = *res;
12355 result = PyUnicode_New(reslen - rescnt, max);
12356 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 kind = PyUnicode_KIND(result);
12359 for (res = res0; res < res0+reslen-rescnt; res++)
12360 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12361 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 }
12365 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 return (PyObject *)result;
12367
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370 Py_DECREF(uformat);
12371 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 }
12374 return NULL;
12375}
12376
Jeremy Hylton938ace62002-07-17 16:30:39 +000012377static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012378unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12379
Tim Peters6d6c1a32001-08-02 04:15:00 +000012380static PyObject *
12381unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12382{
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384 static char *kwlist[] = {"object", "encoding", "errors", 0};
12385 char *encoding = NULL;
12386 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012387
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 if (type != &PyUnicode_Type)
12389 return unicode_subtype_new(type, args, kwds);
12390 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 return NULL;
12393 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 if (encoding == NULL && errors == NULL)
12396 return PyObject_Str(x);
12397 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012399}
12400
Guido van Rossume023fe02001-08-30 03:12:59 +000012401static PyObject *
12402unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12403{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 PyUnicodeObject *tmp, *pnew;
12405 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012407
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12409 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12410 if (tmp == NULL)
12411 return NULL;
12412 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12414 // it seems kind of strange that tp_alloc gets passed the size
12415 // of the unicode string because there will follow another
12416 // malloc.
12417 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12418 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 if (pnew == NULL) {
12420 Py_DECREF(tmp);
12421 return NULL;
12422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12424 if (_PyUnicode_WSTR(pnew) == NULL) {
12425 err = PyErr_NoMemory();
12426 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12429 _PyUnicode_WSTR_LENGTH(pnew) = n;
12430 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12431 _PyUnicode_STATE(pnew).interned = 0;
12432 _PyUnicode_STATE(pnew).kind = 0;
12433 _PyUnicode_STATE(pnew).compact = 0;
12434 _PyUnicode_STATE(pnew).ready = 0;
12435 _PyUnicode_STATE(pnew).ascii = 0;
12436 pnew->data.any = NULL;
12437 _PyUnicode_LENGTH(pnew) = 0;
12438 pnew->_base.utf8 = NULL;
12439 pnew->_base.utf8_length = 0;
12440
12441 if (PyUnicode_READY(pnew) == -1) {
12442 PyObject_FREE(_PyUnicode_WSTR(pnew));
12443 goto onError;
12444 }
12445
Benjamin Peterson14339b62009-01-31 16:36:08 +000012446 Py_DECREF(tmp);
12447 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448
12449 onError:
12450 _Py_ForgetReference((PyObject *)pnew);
12451 PyObject_Del(pnew);
12452 Py_DECREF(tmp);
12453 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012454}
12455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012456PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012457 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012458\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012459Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012460encoding defaults to the current default string encoding.\n\
12461errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012462
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012463static PyObject *unicode_iter(PyObject *seq);
12464
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012466 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 "str", /* tp_name */
12468 sizeof(PyUnicodeObject), /* tp_size */
12469 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 (destructor)unicode_dealloc, /* tp_dealloc */
12472 0, /* tp_print */
12473 0, /* tp_getattr */
12474 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012475 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012476 unicode_repr, /* tp_repr */
12477 &unicode_as_number, /* tp_as_number */
12478 &unicode_as_sequence, /* tp_as_sequence */
12479 &unicode_as_mapping, /* tp_as_mapping */
12480 (hashfunc) unicode_hash, /* tp_hash*/
12481 0, /* tp_call*/
12482 (reprfunc) unicode_str, /* tp_str */
12483 PyObject_GenericGetAttr, /* tp_getattro */
12484 0, /* tp_setattro */
12485 0, /* tp_as_buffer */
12486 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012488 unicode_doc, /* tp_doc */
12489 0, /* tp_traverse */
12490 0, /* tp_clear */
12491 PyUnicode_RichCompare, /* tp_richcompare */
12492 0, /* tp_weaklistoffset */
12493 unicode_iter, /* tp_iter */
12494 0, /* tp_iternext */
12495 unicode_methods, /* tp_methods */
12496 0, /* tp_members */
12497 0, /* tp_getset */
12498 &PyBaseObject_Type, /* tp_base */
12499 0, /* tp_dict */
12500 0, /* tp_descr_get */
12501 0, /* tp_descr_set */
12502 0, /* tp_dictoffset */
12503 0, /* tp_init */
12504 0, /* tp_alloc */
12505 unicode_new, /* tp_new */
12506 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507};
12508
12509/* Initialize the Unicode implementation */
12510
Thomas Wouters78890102000-07-22 19:25:51 +000012511void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012513 int i;
12514
Thomas Wouters477c8d52006-05-27 19:21:47 +000012515 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012517 0x000A, /* LINE FEED */
12518 0x000D, /* CARRIAGE RETURN */
12519 0x001C, /* FILE SEPARATOR */
12520 0x001D, /* GROUP SEPARATOR */
12521 0x001E, /* RECORD SEPARATOR */
12522 0x0085, /* NEXT LINE */
12523 0x2028, /* LINE SEPARATOR */
12524 0x2029, /* PARAGRAPH SEPARATOR */
12525 };
12526
Fred Drakee4315f52000-05-09 19:53:39 +000012527 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012529 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012531
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012532 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012534 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012536
12537 /* initialize the linebreak bloom filter */
12538 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 PyUnicode_2BYTE_KIND, linebreak,
12540 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012541
12542 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543}
12544
12545/* Finalize the Unicode implementation */
12546
/* This implementation releases nothing and unconditionally reports 0.
   NOTE(review): presumably kept so existing callers of the free-list API
   keep working -- confirm. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12552
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553void
Thomas Wouters78890102000-07-22 19:25:51 +000012554_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012556 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012558 Py_XDECREF(unicode_empty);
12559 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012560
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012561 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 if (unicode_latin1[i]) {
12563 Py_DECREF(unicode_latin1[i]);
12564 unicode_latin1[i] = NULL;
12565 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012566 }
Christian Heimesa156e092008-02-16 07:38:31 +000012567 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012569
Walter Dörwald16807132007-05-25 13:52:07 +000012570void
12571PyUnicode_InternInPlace(PyObject **p)
12572{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012573 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12574 PyObject *t;
12575 if (s == NULL || !PyUnicode_Check(s))
12576 Py_FatalError(
12577 "PyUnicode_InternInPlace: unicode strings only please!");
12578 /* If it's a subclass, we don't really know what putting
12579 it in the interned dict might do. */
12580 if (!PyUnicode_CheckExact(s))
12581 return;
12582 if (PyUnicode_CHECK_INTERNED(s))
12583 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 if (PyUnicode_READY(s) == -1) {
12585 assert(0 && "ready fail in intern...");
12586 return;
12587 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012588 if (interned == NULL) {
12589 interned = PyDict_New();
12590 if (interned == NULL) {
12591 PyErr_Clear(); /* Don't leave an exception */
12592 return;
12593 }
12594 }
12595 /* It might be that the GetItem call fails even
12596 though the key is present in the dictionary,
12597 namely when this happens during a stack overflow. */
12598 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012600 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012601
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 if (t) {
12603 Py_INCREF(t);
12604 Py_DECREF(*p);
12605 *p = t;
12606 return;
12607 }
Walter Dörwald16807132007-05-25 13:52:07 +000012608
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 PyThreadState_GET()->recursion_critical = 1;
12610 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12611 PyErr_Clear();
12612 PyThreadState_GET()->recursion_critical = 0;
12613 return;
12614 }
12615 PyThreadState_GET()->recursion_critical = 0;
12616 /* The two references in interned are not counted by refcnt.
12617 The deallocator will take care of this */
12618 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012620}
12621
12622void
12623PyUnicode_InternImmortal(PyObject **p)
12624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12626
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 PyUnicode_InternInPlace(p);
12628 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 Py_INCREF(*p);
12631 }
Walter Dörwald16807132007-05-25 13:52:07 +000012632}
12633
12634PyObject *
12635PyUnicode_InternFromString(const char *cp)
12636{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012637 PyObject *s = PyUnicode_FromString(cp);
12638 if (s == NULL)
12639 return NULL;
12640 PyUnicode_InternInPlace(&s);
12641 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012642}
12643
Alexander Belopolsky40018472011-02-26 01:02:56 +000012644void
12645_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012646{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012647 PyObject *keys;
12648 PyUnicodeObject *s;
12649 Py_ssize_t i, n;
12650 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012651
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 if (interned == NULL || !PyDict_Check(interned))
12653 return;
12654 keys = PyDict_Keys(interned);
12655 if (keys == NULL || !PyList_Check(keys)) {
12656 PyErr_Clear();
12657 return;
12658 }
Walter Dörwald16807132007-05-25 13:52:07 +000012659
Benjamin Peterson14339b62009-01-31 16:36:08 +000012660 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12661 detector, interned unicode strings are not forcibly deallocated;
12662 rather, we give them their stolen references back, and then clear
12663 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012664
Benjamin Peterson14339b62009-01-31 16:36:08 +000012665 n = PyList_GET_SIZE(keys);
12666 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 for (i = 0; i < n; i++) {
12669 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 if (PyUnicode_READY(s) == -1)
12671 fprintf(stderr, "could not ready string\n");
12672 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012673 case SSTATE_NOT_INTERNED:
12674 /* XXX Shouldn't happen */
12675 break;
12676 case SSTATE_INTERNED_IMMORTAL:
12677 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 break;
12680 case SSTATE_INTERNED_MORTAL:
12681 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 break;
12684 default:
12685 Py_FatalError("Inconsistent interned string state.");
12686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 }
12689 fprintf(stderr, "total size of all interned strings: "
12690 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12691 "mortal/immortal\n", mortal_size, immortal_size);
12692 Py_DECREF(keys);
12693 PyDict_Clear(interned);
12694 Py_DECREF(interned);
12695 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012696}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012697
12698
12699/********************* Unicode Iterator **************************/
12700
/* State of a str iterator: the string being walked and the position of the
   character that unicodeiter_next() returns next. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
12706
12707static void
12708unicodeiter_dealloc(unicodeiterobject *it)
12709{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012710 _PyObject_GC_UNTRACK(it);
12711 Py_XDECREF(it->it_seq);
12712 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012713}
12714
12715static int
12716unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12717{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012718 Py_VISIT(it->it_seq);
12719 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012720}
12721
12722static PyObject *
12723unicodeiter_next(unicodeiterobject *it)
12724{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012725 PyUnicodeObject *seq;
12726 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012727
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 assert(it != NULL);
12729 seq = it->it_seq;
12730 if (seq == NULL)
12731 return NULL;
12732 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12735 int kind = PyUnicode_KIND(seq);
12736 void *data = PyUnicode_DATA(seq);
12737 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12738 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012739 if (item != NULL)
12740 ++it->it_index;
12741 return item;
12742 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012743
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 Py_DECREF(seq);
12745 it->it_seq = NULL;
12746 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012747}
12748
12749static PyObject *
12750unicodeiter_len(unicodeiterobject *it)
12751{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 Py_ssize_t len = 0;
12753 if (it->it_seq)
12754 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12755 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012756}
12757
/* Docstring attached to __length_hint__ in unicodeiter_methods. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12759
/* Method table of the str iterator; installed through the tp_methods slot
   of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,              NULL}           /* sentinel */
};
12765
12766PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12768 "str_iterator", /* tp_name */
12769 sizeof(unicodeiterobject), /* tp_basicsize */
12770 0, /* tp_itemsize */
12771 /* methods */
12772 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12773 0, /* tp_print */
12774 0, /* tp_getattr */
12775 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012776 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012777 0, /* tp_repr */
12778 0, /* tp_as_number */
12779 0, /* tp_as_sequence */
12780 0, /* tp_as_mapping */
12781 0, /* tp_hash */
12782 0, /* tp_call */
12783 0, /* tp_str */
12784 PyObject_GenericGetAttr, /* tp_getattro */
12785 0, /* tp_setattro */
12786 0, /* tp_as_buffer */
12787 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12788 0, /* tp_doc */
12789 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12790 0, /* tp_clear */
12791 0, /* tp_richcompare */
12792 0, /* tp_weaklistoffset */
12793 PyObject_SelfIter, /* tp_iter */
12794 (iternextfunc)unicodeiter_next, /* tp_iternext */
12795 unicodeiter_methods, /* tp_methods */
12796 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012797};
12798
12799static PyObject *
12800unicode_iter(PyObject *seq)
12801{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012802 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012803
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 if (!PyUnicode_Check(seq)) {
12805 PyErr_BadInternalCall();
12806 return NULL;
12807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 if (PyUnicode_READY(seq) == -1)
12809 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12811 if (it == NULL)
12812 return NULL;
12813 it->it_index = 0;
12814 Py_INCREF(seq);
12815 it->it_seq = (PyUnicodeObject *)seq;
12816 _PyObject_GC_TRACK(it);
12817 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012818}
12819
/* Include uniops.h twice: first with UNIOP(x) expanding to Py_UNICODE_x and
   UNIOP_t to Py_UNICODE, then with UNIOP(x) expanding to Py_UCS4_x and
   UNIOP_t to Py_UCS4.  The macros are undefined again after each pass.
   NOTE(review): presumably the header defines the same set of helpers for
   each character type -- its contents are not visible here. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012830
Victor Stinner71133ff2010-09-01 23:43:53 +000012831Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012832PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012833{
12834 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12835 Py_UNICODE *copy;
12836 Py_ssize_t size;
12837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838 if (!PyUnicode_Check(unicode)) {
12839 PyErr_BadArgument();
12840 return NULL;
12841 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012842 /* Ensure we won't overflow the size. */
12843 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12844 PyErr_NoMemory();
12845 return NULL;
12846 }
12847 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12848 size *= sizeof(Py_UNICODE);
12849 copy = PyMem_Malloc(size);
12850 if (copy == NULL) {
12851 PyErr_NoMemory();
12852 return NULL;
12853 }
12854 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12855 return copy;
12856}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012857
Georg Brandl66c221e2010-10-14 07:04:07 +000012858/* A _string module, to export formatter_parser and formatter_field_name_split
12859 to the string.Formatter class implemented in Python. */
12860
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
12868
/* Module definition of _string, consumed by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,
    _string_methods,                        /* module-level functions */
    NULL,                                   /* remaining slots are unused */
    NULL,
    NULL,
    NULL
};
12880
12881PyMODINIT_FUNC
12882PyInit__string(void)
12883{
12884 return PyModule_Create(&_string_module);
12885}
12886
12887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012888#ifdef __cplusplus
12889}
12890#endif