blob: b53c210428892d16363ac7b8beb4968b05720a3d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro to convert characters of different types.
   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters (end is
   exclusive and is re-evaluated on every iteration).  to points to the
   destination buffer and is cast to "to_type *".  Each source character is
   converted with a plain cast to to_type and stored in order. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        const from_type *src_ = (begin);                            \
        to_type *dst_ = (to_type *)(to);                            \
        while (src_ < (end)) {                                      \
            *dst_++ = (to_type)*src_++;                             \
        }                                                           \
    } while (0)
107
/* Raw field accessors.  Each one casts op to the struct that declares the
   field and expands to the field itself, so the result can be assigned to.
   No type check is performed. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked read accessors: assert that op is a unicode object, then yield
   the kind / length field.  op is evaluated more than once. */
#define _PyUnicode_KIND(op)                     \
    (assert(PyUnicode_Check(op)),               \
     _PyUnicode_STATE(op).kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(PyUnicode_Check(op)),               \
     _PyUnicode_LENGTH(op))
119
120
Walter Dörwald16807132007-05-25 13:52:07 +0000121/* This dictionary holds all interned unicode strings. Note that references
122 to strings in this dictionary are *not* counted in the string's ob_refcnt.
123 When the interned string reaches a refcnt of 0 the string deallocation
124 function will delete the reference from this dictionary.
125
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000128*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters: entry i is 1
   when the 7-bit code point i is one of the characters named below, and 0
   otherwise.  The table has exactly 128 entries. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Fast detection of line break characters in the 7-bit range: entry i is 1
   when code point i is one of the characters named below, and 0 otherwise.
   The table is read-only lookup data, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each unicode character as the
   bit index. */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241#define BLOOM_MASK unsigned long
242
243static BLOOM_MASK bloom_linebreak;
244
Antoine Pitrouf068f942010-01-13 14:19:12 +0000245#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
246#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247
Benjamin Peterson29060642009-01-31 22:14:21 +0000248#define BLOOM_LINEBREAK(ch) \
249 ((ch) < 128U ? ascii_linebreak[(ch)] : \
250 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200267#define BLOOM_MEMBER(mask, chr, str) \
268 (BLOOM(mask, chr) \
269 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
/* We allocate one more character (not just one byte) to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372#ifdef Py_DEBUG
373int unicode_old_new_calls = 0;
374#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8 macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact / compact-ascii flags, the
   addresses just past the PyASCIIObject and PyCompactUnicodeObject headers
   and the compact data pointer to stdout, then return PyUnicode_DATA. */
void *
_PyUnicode_data(void *unicode)
{
    /* Plain pointer arithmetic: the first byte after each header struct. */
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Surrogate pairs are decoded into a single code point; every other unit
   (including a lone surrogate) is copied unchanged.  The other conversions
   are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* A pair is a high surrogate immediately followed, inside the
           input range, by a low surrogate. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 unsigned int from_kind;
615 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200625 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200626 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627 PyErr_Format(PyExc_ValueError,
628 "Cannot write %zi characters at %zi "
629 "in a string of %zi characters",
630 how_many, to_start, PyUnicode_GET_LENGTH(to));
631 return -1;
632 }
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 from_kind = PyUnicode_KIND(from);
635 to_kind = PyUnicode_KIND(to);
636
637 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 /* fast path */
639 Py_MEMCPY((char*)PyUnicode_DATA(to)
640 + PyUnicode_KIND_SIZE(to_kind, to_start),
641 (char*)PyUnicode_DATA(from)
642 + PyUnicode_KIND_SIZE(from_kind, from_start),
643 PyUnicode_KIND_SIZE(to_kind, how_many));
644 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200646
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647 if (from_kind > to_kind) {
648 /* slow path to check for character overflow */
649 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650 void *from_data = PyUnicode_DATA(from);
651 void *to_data = PyUnicode_DATA(to);
652 Py_UCS4 ch, maxchar;
653 Py_ssize_t i;
654 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 maxchar = 0;
657 for (i=0; i < how_many; i++) {
658 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
659 if (ch > maxchar) {
660 maxchar = ch;
661 if (maxchar > to_maxchar) {
662 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200666 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
667 }
668 if (!overflow)
669 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
679 return how_many;
680 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200681 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200682 && to_kind == PyUnicode_4BYTE_KIND)
683 {
684 _PyUnicode_CONVERT_BYTES(
685 Py_UCS1, Py_UCS4,
686 PyUnicode_1BYTE_DATA(from) + from_start,
687 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
688 PyUnicode_4BYTE_DATA(to) + to_start
689 );
690 return how_many;
691 }
692 else if (from_kind == PyUnicode_2BYTE_KIND
693 && to_kind == PyUnicode_4BYTE_KIND)
694 {
695 _PyUnicode_CONVERT_BYTES(
696 Py_UCS2, Py_UCS4,
697 PyUnicode_2BYTE_DATA(from) + from_start,
698 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
699 PyUnicode_4BYTE_DATA(to) + to_start
700 );
701 return how_many;
702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200704 "Cannot copy UCS%u characters "
705 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200706 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200707 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 return -1;
709}
710
Victor Stinner17222162011-09-28 22:15:37 +0200711/* Find the maximum code point and count the number of surrogate pairs so a
712 correct string length can be computed before converting a string to UCS4.
713 This function counts single surrogates as a character and not as a pair.
714
715 Return 0 on success, or -1 on error. */
716static int
717find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
718 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719{
720 const wchar_t *iter;
721
722 if (num_surrogates == NULL || maxchar == NULL) {
723 PyErr_SetString(PyExc_SystemError,
724 "unexpected NULL arguments to "
725 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
726 return -1;
727 }
728
729 *num_surrogates = 0;
730 *maxchar = 0;
731
732 for (iter = begin; iter < end; ) {
733 if (*iter > *maxchar)
734 *maxchar = *iter;
735#if SIZEOF_WCHAR_T == 2
736 if (*iter >= 0xD800 && *iter <= 0xDBFF
737 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
738 {
739 Py_UCS4 surrogate_val;
740 surrogate_val = (((iter[0] & 0x3FF)<<10)
741 | (iter[1] & 0x3FF)) + 0x10000;
742 ++(*num_surrogates);
743 if (surrogate_val > *maxchar)
744 *maxchar = surrogate_val;
745 iter += 2;
746 }
747 else
748 iter++;
749#else
750 iter++;
751#endif
752 }
753 return 0;
754}
755
/* Debug builds only: number of _PyUnicode_Ready() calls that got past the
   early "data already present" return and went on to convert the string. */
#ifdef Py_DEBUG
int unicode_ready_calls = 0;
#endif
759
/* Give a legacy string, which so far only owns a wchar_t buffer (wstr), its
   canonical 1-, 2- or 4-byte buffer, chosen from the largest code point in
   wstr, and mark it ready.

   Return 0 on success (also when a data buffer is already attached) and -1
   on failure; allocation failures call PyErr_NoMemory(). */
int
_PyUnicode_Ready(PyUnicodeObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(PyUnicode_Check(unicode));

    /* A data buffer is already attached: nothing to convert. */
    if (unicode->data.any != NULL) {
        assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
        return 0;
    }

    /* _PyUnicode_Ready() is only intended for old-style API usage where
     * strings were created using _PyObject_New() and where no canonical
     * representation (the str field) has been set yet aka strings
     * which are not yet ready.
     */
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(!PyUnicode_IS_READY(unicode));
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == 0);
    assert(unicode->_base.utf8 == NULL);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Find the widest character (and surrogate pairs) to pick the kind. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar,
                                &num_surrogates) == -1) {
        assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
        return -1;
    }

    /* Every character fits in one byte: narrow into a fresh buffer. */
    if (maxchar < 256) {
        unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Below 128 the utf8 pointer is aimed at the data buffer
               itself, so no separate UTF-8 copy is made. */
            unicode->_base.utf8 = unicode->data.any;
            unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
        }
        /* The wchar_t buffer has been copied out of; drop it. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode.  */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

        if (sizeof(wchar_t) == 2) {
            /* We can share representations and are done. */
            unicode->data.any = _PyUnicode_WSTR(unicode);
            PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
            _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
            _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
        }
        else {
            assert(sizeof(wchar_t) == 4);

            /* Narrow the 4-byte units into a new 2-byte buffer, then
               release the wchar_t buffer. */
            unicode->data.any = PyObject_MALLOC(
                2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
            if (!unicode->data.any) {
                PyErr_NoMemory();
                return -1;
            }
            _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                     _PyUnicode_WSTR(unicode), end,
                                     PyUnicode_2BYTE_DATA(unicode));
            PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
            _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
            _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
            unicode->_base.utf8 = NULL;
            unicode->_base.utf8_length = 0;
            PyObject_FREE(_PyUnicode_WSTR(unicode));
            _PyUnicode_WSTR(unicode) = NULL;
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
        }
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!unicode->data.any) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
                                          unicode) < 0) {
            assert(0 && "ConvertWideCharToUCS4 failed");
            return -1;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide here: the wstr buffer is reused as
           the 4-byte data buffer without copying. */
        unicode->data.any = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        unicode->_base.utf8 = NULL;
        unicode->_base.utf8_length = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
900
Alexander Belopolsky40018472011-02-26 01:02:56 +0000901static void
902unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903{
Walter Dörwald16807132007-05-25 13:52:07 +0000904 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000905 case SSTATE_NOT_INTERNED:
906 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000907
Benjamin Peterson29060642009-01-31 22:14:21 +0000908 case SSTATE_INTERNED_MORTAL:
909 /* revive dead object temporarily for DelItem */
910 Py_REFCNT(unicode) = 3;
911 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
912 Py_FatalError(
913 "deletion of interned string failed");
914 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000915
Benjamin Peterson29060642009-01-31 22:14:21 +0000916 case SSTATE_INTERNED_IMMORTAL:
917 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000918
Benjamin Peterson29060642009-01-31 22:14:21 +0000919 default:
920 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000921 }
922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923 if (_PyUnicode_WSTR(unicode) &&
924 (!PyUnicode_IS_READY(unicode) ||
925 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
926 PyObject_DEL(_PyUnicode_WSTR(unicode));
927 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
928 PyObject_DEL(unicode->_base.utf8);
929
930 if (PyUnicode_IS_COMPACT(unicode)) {
931 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000932 }
933 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934 if (unicode->data.any)
935 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000937 }
938}
939
Alexander Belopolsky40018472011-02-26 01:02:56 +0000940static int
941_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000942{
943 register PyUnicodeObject *v;
944
945 /* Argument checks */
946 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000947 PyErr_BadInternalCall();
948 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000949 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000950 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
952 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000953 PyErr_BadInternalCall();
954 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000955 }
956
957 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958 possible since these are being shared.
959 The same goes for new-representation unicode objects or objects which
960 have already been readied.
961 For these, we simply return a fresh copy with the same Unicode content.
962 */
963 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
964 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
965 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000966 PyUnicodeObject *w = _PyUnicode_New(length);
967 if (w == NULL)
968 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
970 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000971 Py_DECREF(*unicode);
972 *unicode = w;
973 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000974 }
975
976 /* Note that we don't have to modify *unicode for unshared Unicode
977 objects, since we can modify them in-place. */
978 return unicode_resize(v, length);
979}
980
Alexander Belopolsky40018472011-02-26 01:02:56 +0000981int
982PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000983{
984 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
985}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987static PyObject*
988get_latin1_char(unsigned char ch)
989{
990 PyUnicodeObject *unicode = unicode_latin1[ch];
991 if (!unicode) {
992 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
993 if (!unicode)
994 return NULL;
995 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
996 unicode_latin1[ch] = unicode;
997 }
998 Py_INCREF(unicode);
999 return (PyObject *)unicode;
1000}
1001
Alexander Belopolsky40018472011-02-26 01:02:56 +00001002PyObject *
1003PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004{
1005 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 Py_UCS4 maxchar = 0;
1007 Py_ssize_t num_surrogates;
1008
1009 if (u == NULL)
1010 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001011
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001012 /* If the Unicode data is known at construction time, we can apply
1013 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015 /* Optimization for empty strings */
1016 if (size == 0 && unicode_empty != NULL) {
1017 Py_INCREF(unicode_empty);
1018 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001019 }
Tim Petersced69f82003-09-16 20:30:58 +00001020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 /* Single character Unicode objects in the Latin-1 range are
1022 shared when using this constructor */
1023 if (size == 1 && *u < 256)
1024 return get_latin1_char((unsigned char)*u);
1025
1026 /* If not empty and not single character, copy the Unicode data
1027 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001028 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 &num_surrogates) == -1)
1030 return NULL;
1031
1032 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1033 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 if (!unicode)
1035 return NULL;
1036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 switch (PyUnicode_KIND(unicode)) {
1038 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001039 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1041 break;
1042 case PyUnicode_2BYTE_KIND:
1043#if Py_UNICODE_SIZE == 2
1044 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1045#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001046 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1048#endif
1049 break;
1050 case PyUnicode_4BYTE_KIND:
1051#if SIZEOF_WCHAR_T == 2
1052 /* This is the only case which has to process surrogates, thus
1053 a simple copy loop is not enough and we need a function. */
1054 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1055 Py_DECREF(unicode);
1056 return NULL;
1057 }
1058#else
1059 assert(num_surrogates == 0);
1060 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1061#endif
1062 break;
1063 default:
1064 assert(0 && "Impossible state");
1065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066
1067 return (PyObject *)unicode;
1068}
1069
Alexander Belopolsky40018472011-02-26 01:02:56 +00001070PyObject *
1071PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001072{
1073 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001074
Benjamin Peterson14339b62009-01-31 16:36:08 +00001075 if (size < 0) {
1076 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 return NULL;
1079 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001080
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001081 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001082 some optimizations which share commonly used objects.
1083 Also, this means the input must be UTF-8, so fall back to the
1084 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001085 if (u != NULL) {
1086
Benjamin Peterson29060642009-01-31 22:14:21 +00001087 /* Optimization for empty strings */
1088 if (size == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
1090 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001092
1093 /* Single characters are shared when using this constructor.
1094 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 if (size == 1 && Py_CHARMASK(*u) < 128)
1096 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001097
1098 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001099 }
1100
Walter Dörwald55507312007-05-18 13:12:10 +00001101 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 if (!unicode)
1103 return NULL;
1104
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001105 return (PyObject *)unicode;
1106}
1107
Alexander Belopolsky40018472011-02-26 01:02:56 +00001108PyObject *
1109PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001110{
1111 size_t size = strlen(u);
1112 if (size > PY_SSIZE_T_MAX) {
1113 PyErr_SetString(PyExc_OverflowError, "input too long");
1114 return NULL;
1115 }
1116
1117 return PyUnicode_FromStringAndSize(u, size);
1118}
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120PyObject*
1121PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 PyObject *res;
1124 unsigned char max = 127;
1125 Py_ssize_t i;
1126 for (i = 0; i < size; i++) {
1127 if (u[i] & 0x80) {
1128 max = 255;
1129 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001130 }
1131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 res = PyUnicode_New(size, max);
1133 if (!res)
1134 return NULL;
1135 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1136 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001137}
1138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139PyObject*
1140PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1141{
1142 PyObject *res;
1143 Py_UCS2 max = 0;
1144 Py_ssize_t i;
1145 for (i = 0; i < size; i++)
1146 if (u[i] > max)
1147 max = u[i];
1148 res = PyUnicode_New(size, max);
1149 if (!res)
1150 return NULL;
1151 if (max >= 256)
1152 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1153 else
1154 for (i = 0; i < size; i++)
1155 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1156 return res;
1157}
1158
1159PyObject*
1160PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1161{
1162 PyObject *res;
1163 Py_UCS4 max = 0;
1164 Py_ssize_t i;
1165 for (i = 0; i < size; i++)
1166 if (u[i] > max)
1167 max = u[i];
1168 res = PyUnicode_New(size, max);
1169 if (!res)
1170 return NULL;
1171 if (max >= 0x10000)
1172 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1173 else {
1174 int kind = PyUnicode_KIND(res);
1175 void *data = PyUnicode_DATA(res);
1176 for (i = 0; i < size; i++)
1177 PyUnicode_WRITE(kind, data, i, u[i]);
1178 }
1179 return res;
1180}
1181
1182PyObject*
1183PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1184{
1185 switch(kind) {
1186 case PyUnicode_1BYTE_KIND:
1187 return PyUnicode_FromUCS1(buffer, size);
1188 case PyUnicode_2BYTE_KIND:
1189 return PyUnicode_FromUCS2(buffer, size);
1190 case PyUnicode_4BYTE_KIND:
1191 return PyUnicode_FromUCS4(buffer, size);
1192 }
1193 assert(0);
1194 return NULL;
1195}
1196
1197
1198/* Widen Unicode objects to larger buffers.
1199 Return NULL if the string is too wide already. */
1200
1201void*
1202_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1203{
1204 Py_ssize_t i;
1205 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1206 void *d = PyUnicode_DATA(s);
1207 unsigned int skind = PyUnicode_KIND(s);
1208 if (PyUnicode_KIND(s) >= kind) {
1209 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1210 return NULL;
1211 }
1212 switch(kind) {
1213 case PyUnicode_2BYTE_KIND: {
1214 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1215 if (!result) {
1216 PyErr_NoMemory();
1217 return 0;
1218 }
1219 for (i = 0; i < len; i++)
1220 result[i] = ((Py_UCS1*)d)[i];
1221 return result;
1222 }
1223 case PyUnicode_4BYTE_KIND: {
1224 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1225 if (!result) {
1226 PyErr_NoMemory();
1227 return 0;
1228 }
1229 for (i = 0; i < len; i++)
1230 result[i] = PyUnicode_READ(skind, d, i);
1231 return result;
1232 }
1233 }
1234 Py_FatalError("invalid kind");
1235 return NULL;
1236}
1237
1238static Py_UCS4*
1239as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1240 int copy_null)
1241{
1242 int kind;
1243 void *data;
1244 Py_ssize_t len, targetlen;
1245 if (PyUnicode_READY(string) == -1)
1246 return NULL;
1247 kind = PyUnicode_KIND(string);
1248 data = PyUnicode_DATA(string);
1249 len = PyUnicode_GET_LENGTH(string);
1250 targetlen = len;
1251 if (copy_null)
1252 targetlen++;
1253 if (!target) {
1254 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1255 PyErr_NoMemory();
1256 return NULL;
1257 }
1258 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1259 if (!target) {
1260 PyErr_NoMemory();
1261 return NULL;
1262 }
1263 }
1264 else {
1265 if (targetsize < targetlen) {
1266 PyErr_Format(PyExc_SystemError,
1267 "string is longer than the buffer");
1268 if (copy_null && 0 < targetsize)
1269 target[0] = 0;
1270 return NULL;
1271 }
1272 }
1273 if (kind != PyUnicode_4BYTE_KIND) {
1274 Py_ssize_t i;
1275 for (i = 0; i < len; i++)
1276 target[i] = PyUnicode_READ(kind, data, i);
1277 }
1278 else
1279 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1280 if (copy_null)
1281 target[len] = 0;
1282 return target;
1283}
1284
1285Py_UCS4*
1286PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1287 int copy_null)
1288{
1289 if (target == NULL || targetsize < 1) {
1290 PyErr_BadInternalCall();
1291 return NULL;
1292 }
1293 return as_ucs4(string, target, targetsize, copy_null);
1294}
1295
1296Py_UCS4*
1297PyUnicode_AsUCS4Copy(PyObject *string)
1298{
1299 return as_ucs4(string, NULL, 0, 1);
1300}
1301
#ifdef HAVE_WCHAR_H

/* Build a string object from `size` wchar_t units at `w`; a size of -1
   means "measure with wcslen()".  A NULL `w` is accepted only together
   with size 0, which yields PyUnicode_New(0, 0); any other size raises
   PyErr_BadInternalCall().  Return a new reference, or NULL on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = size;

        if (nchars == -1)
            nchars = wcslen(w);
        return PyUnicode_FromUnicode(w, nchars);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001322
Walter Dörwald346737f2007-05-31 10:44:43 +00001323static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001324makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1325 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001326{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001327 *fmt++ = '%';
1328 if (width) {
1329 if (zeropad)
1330 *fmt++ = '0';
1331 fmt += sprintf(fmt, "%d", width);
1332 }
1333 if (precision)
1334 fmt += sprintf(fmt, ".%d", precision);
1335 if (longflag)
1336 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001337 else if (longlongflag) {
1338 /* longlongflag should only ever be nonzero on machines with
1339 HAVE_LONG_LONG defined */
1340#ifdef HAVE_LONG_LONG
1341 char *f = PY_FORMAT_LONG_LONG;
1342 while (*f)
1343 *fmt++ = *f++;
1344#else
1345 /* we shouldn't ever get here */
1346 assert(0);
1347 *fmt++ = 'l';
1348#endif
1349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001350 else if (size_tflag) {
1351 char *f = PY_FORMAT_SIZE_T;
1352 while (*f)
1353 *fmt++ = *f++;
1354 }
1355 *fmt++ = c;
1356 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001357}
1358
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a format specification.  Parse the
   optional width, ".precision" and length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z"; each only when followed by 'd', 'u'
   or 'i') and store the results through the non-NULL out-parameters.

   Return a pointer to the character at which parsing stopped. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        f += 1;
    }

    /* Hand the results back; every out-parameter is optional. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
1423
/* Minimum field sizes that PyUnicode_FromFormatV() reserves per formatted
   number when it sizes its number buffer. */

/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1431
Walter Dörwaldd2034312007-05-18 16:29:38 +00001432PyObject *
1433PyUnicode_FromFormatV(const char *format, va_list vargs)
1434{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001435 va_list count;
1436 Py_ssize_t callcount = 0;
1437 PyObject **callresults = NULL;
1438 PyObject **callresult = NULL;
1439 Py_ssize_t n = 0;
1440 int width = 0;
1441 int precision = 0;
1442 int zeropad;
1443 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001445 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001446 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1448 Py_UCS4 argmaxchar;
1449 Py_ssize_t numbersize = 0;
1450 char *numberresults = NULL;
1451 char *numberresult = NULL;
1452 Py_ssize_t i;
1453 int kind;
1454 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001455
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001456 Py_VA_COPY(count, vargs);
    /* step 1: count the number of %S/%R/%A/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array)
     * also estimate an upper bound for all the number formats in the string,
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001464 for (f = format; *f; f++) {
1465 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001466 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1468 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1469 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1470 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001473#ifdef HAVE_LONG_LONG
1474 if (longlongflag) {
1475 if (width < MAX_LONG_LONG_CHARS)
1476 width = MAX_LONG_LONG_CHARS;
1477 }
1478 else
1479#endif
1480 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1481 including sign. Decimal takes the most space. This
1482 isn't enough for octal. If a width is specified we
1483 need more (which we allocate later). */
1484 if (width < MAX_LONG_CHARS)
1485 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486
1487 /* account for the size + '\0' to separate numbers
1488 inside of the numberresults buffer */
1489 numbersize += (width + 1);
1490 }
1491 }
1492 else if ((unsigned char)*f > 127) {
1493 PyErr_Format(PyExc_ValueError,
1494 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1495 "string, got a non-ASCII byte: 0x%02x",
1496 (unsigned char)*f);
1497 return NULL;
1498 }
1499 }
1500 /* step 2: allocate memory for the results of
1501 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1502 if (callcount) {
1503 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1504 if (!callresults) {
1505 PyErr_NoMemory();
1506 return NULL;
1507 }
1508 callresult = callresults;
1509 }
1510 /* step 2.5: allocate memory for the results of formating numbers */
1511 if (numbersize) {
1512 numberresults = PyObject_Malloc(numbersize);
1513 if (!numberresults) {
1514 PyErr_NoMemory();
1515 goto fail;
1516 }
1517 numberresult = numberresults;
1518 }
1519
1520 /* step 3: format numbers and figure out how large a buffer we need */
1521 for (f = format; *f; f++) {
1522 if (*f == '%') {
1523 const char* p;
1524 int longflag;
1525 int longlongflag;
1526 int size_tflag;
1527 int numprinted;
1528
1529 p = f;
1530 zeropad = (f[1] == '0');
1531 f = parse_format_flags(f, &width, &precision,
1532 &longflag, &longlongflag, &size_tflag);
1533 switch (*f) {
1534 case 'c':
1535 {
1536 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001537 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 n++;
1539 break;
1540 }
1541 case '%':
1542 n++;
1543 break;
1544 case 'i':
1545 case 'd':
1546 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1547 width, precision, *f);
1548 if (longflag)
1549 numprinted = sprintf(numberresult, fmt,
1550 va_arg(count, long));
1551#ifdef HAVE_LONG_LONG
1552 else if (longlongflag)
1553 numprinted = sprintf(numberresult, fmt,
1554 va_arg(count, PY_LONG_LONG));
1555#endif
1556 else if (size_tflag)
1557 numprinted = sprintf(numberresult, fmt,
1558 va_arg(count, Py_ssize_t));
1559 else
1560 numprinted = sprintf(numberresult, fmt,
1561 va_arg(count, int));
1562 n += numprinted;
1563 /* advance by +1 to skip over the '\0' */
1564 numberresult += (numprinted + 1);
1565 assert(*(numberresult - 1) == '\0');
1566 assert(*(numberresult - 2) != '\0');
1567 assert(numprinted >= 0);
1568 assert(numberresult <= numberresults + numbersize);
1569 break;
1570 case 'u':
1571 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1572 width, precision, 'u');
1573 if (longflag)
1574 numprinted = sprintf(numberresult, fmt,
1575 va_arg(count, unsigned long));
1576#ifdef HAVE_LONG_LONG
1577 else if (longlongflag)
1578 numprinted = sprintf(numberresult, fmt,
1579 va_arg(count, unsigned PY_LONG_LONG));
1580#endif
1581 else if (size_tflag)
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, size_t));
1584 else
1585 numprinted = sprintf(numberresult, fmt,
1586 va_arg(count, unsigned int));
1587 n += numprinted;
1588 numberresult += (numprinted + 1);
1589 assert(*(numberresult - 1) == '\0');
1590 assert(*(numberresult - 2) != '\0');
1591 assert(numprinted >= 0);
1592 assert(numberresult <= numberresults + numbersize);
1593 break;
1594 case 'x':
1595 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1596 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1597 n += numprinted;
1598 numberresult += (numprinted + 1);
1599 assert(*(numberresult - 1) == '\0');
1600 assert(*(numberresult - 2) != '\0');
1601 assert(numprinted >= 0);
1602 assert(numberresult <= numberresults + numbersize);
1603 break;
1604 case 'p':
1605 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1606 /* %p is ill-defined: ensure leading 0x. */
1607 if (numberresult[1] == 'X')
1608 numberresult[1] = 'x';
1609 else if (numberresult[1] != 'x') {
1610 memmove(numberresult + 2, numberresult,
1611 strlen(numberresult) + 1);
1612 numberresult[0] = '0';
1613 numberresult[1] = 'x';
1614 numprinted += 2;
1615 }
1616 n += numprinted;
1617 numberresult += (numprinted + 1);
1618 assert(*(numberresult - 1) == '\0');
1619 assert(*(numberresult - 2) != '\0');
1620 assert(numprinted >= 0);
1621 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001622 break;
1623 case 's':
1624 {
1625 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001626 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001627 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1628 if (!str)
1629 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 /* since PyUnicode_DecodeUTF8 returns already flexible
1631 unicode objects, there is no need to call ready on them */
1632 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001633 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001635 /* Remember the str and switch to the next slot */
1636 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001637 break;
1638 }
1639 case 'U':
1640 {
1641 PyObject *obj = va_arg(count, PyObject *);
1642 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 if (PyUnicode_READY(obj) == -1)
1644 goto fail;
1645 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001646 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001648 break;
1649 }
1650 case 'V':
1651 {
1652 PyObject *obj = va_arg(count, PyObject *);
1653 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001654 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001655 assert(obj || str);
1656 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001657 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 if (PyUnicode_READY(obj) == -1)
1659 goto fail;
1660 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001661 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001663 *callresult++ = NULL;
1664 }
1665 else {
1666 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1667 if (!str_obj)
1668 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001670 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001672 *callresult++ = str_obj;
1673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001674 break;
1675 }
1676 case 'S':
1677 {
1678 PyObject *obj = va_arg(count, PyObject *);
1679 PyObject *str;
1680 assert(obj);
1681 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001683 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001685 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001687 /* Remember the str and switch to the next slot */
1688 *callresult++ = str;
1689 break;
1690 }
1691 case 'R':
1692 {
1693 PyObject *obj = va_arg(count, PyObject *);
1694 PyObject *repr;
1695 assert(obj);
1696 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001698 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001700 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001702 /* Remember the repr and switch to the next slot */
1703 *callresult++ = repr;
1704 break;
1705 }
1706 case 'A':
1707 {
1708 PyObject *obj = va_arg(count, PyObject *);
1709 PyObject *ascii;
1710 assert(obj);
1711 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001713 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001715 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001717 /* Remember the repr and switch to the next slot */
1718 *callresult++ = ascii;
1719 break;
1720 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 default:
1722 /* if we stumble upon an unknown
1723 formatting code, copy the rest of
1724 the format string to the output
1725 string. (we cannot just skip the
1726 code, since there's no way to know
1727 what's in the argument list) */
1728 n += strlen(p);
1729 goto expand;
1730 }
1731 } else
1732 n++;
1733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001735 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 we don't have to resize the string.
1738 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001740 if (!string)
1741 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 kind = PyUnicode_KIND(string);
1743 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001744 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001748 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001749 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001750
1751 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1753 /* checking for == because the last argument could be a empty
1754 string, which causes i to point to end, the assert at the end of
1755 the loop */
1756 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001757
Benjamin Peterson14339b62009-01-31 16:36:08 +00001758 switch (*f) {
1759 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001760 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 const int ordinal = va_arg(vargs, int);
1762 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001763 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001764 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001765 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001766 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001768 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 case 'p':
1770 /* unused, since we already have the result */
1771 if (*f == 'p')
1772 (void) va_arg(vargs, void *);
1773 else
1774 (void) va_arg(vargs, int);
1775 /* extract the result from numberresults and append. */
1776 for (; *numberresult; ++i, ++numberresult)
1777 PyUnicode_WRITE(kind, data, i, *numberresult);
1778 /* skip over the separating '\0' */
1779 assert(*numberresult == '\0');
1780 numberresult++;
1781 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 break;
1783 case 's':
1784 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001785 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001787 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 size = PyUnicode_GET_LENGTH(*callresult);
1789 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001790 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1791 *callresult, 0,
1792 size) < 0)
1793 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001795 /* We're done with the unicode()/repr() => forget it */
1796 Py_DECREF(*callresult);
1797 /* switch to next unicode()/repr() result */
1798 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001799 break;
1800 }
1801 case 'U':
1802 {
1803 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 Py_ssize_t size;
1805 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1806 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001807 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1808 obj, 0,
1809 size) < 0)
1810 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001812 break;
1813 }
1814 case 'V':
1815 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001818 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001819 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 size = PyUnicode_GET_LENGTH(obj);
1821 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001822 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1823 obj, 0,
1824 size) < 0)
1825 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 size = PyUnicode_GET_LENGTH(*callresult);
1829 assert(PyUnicode_KIND(*callresult) <=
1830 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001831 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1832 *callresult,
1833 0, size) < 0)
1834 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001836 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001837 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001838 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001839 break;
1840 }
1841 case 'S':
1842 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001843 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001844 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 /* unused, since we already have the result */
1846 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001848 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1849 *callresult, 0,
1850 PyUnicode_GET_LENGTH(*callresult)) < 0)
1851 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 /* We're done with the unicode()/repr() => forget it */
1854 Py_DECREF(*callresult);
1855 /* switch to next unicode()/repr() result */
1856 ++callresult;
1857 break;
1858 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001859 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 break;
1862 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 for (; *p; ++p, ++i)
1864 PyUnicode_WRITE(kind, data, i, *p);
1865 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 goto end;
1867 }
Victor Stinner1205f272010-09-11 00:54:47 +00001868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869 else {
1870 assert(i < PyUnicode_GET_LENGTH(string));
1871 PyUnicode_WRITE(kind, data, i++, *f);
1872 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001875
Benjamin Peterson29060642009-01-31 22:14:21 +00001876 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 if (callresults)
1878 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 if (numberresults)
1880 PyObject_Free(numberresults);
1881 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001882 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001883 if (callresults) {
1884 PyObject **callresult2 = callresults;
1885 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001886 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001887 ++callresult2;
1888 }
1889 PyObject_Free(callresults);
1890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 if (numberresults)
1892 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001893 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001894}
1895
Walter Dörwaldd2034312007-05-18 16:29:38 +00001896PyObject *
1897PyUnicode_FromFormat(const char *format, ...)
1898{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 PyObject* ret;
1900 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001901
1902#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001903 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001904#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001905 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001906#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001907 ret = PyUnicode_FromFormatV(format, vargs);
1908 va_end(vargs);
1909 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001910}
1911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912#ifdef HAVE_WCHAR_H
1913
Victor Stinner5593d8a2010-10-02 11:11:27 +00001914/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1915 convert a Unicode object to a wide character string.
1916
Victor Stinnerd88d9832011-09-06 02:00:05 +02001917 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001918 character) required to convert the unicode object. Ignore size argument.
1919
Victor Stinnerd88d9832011-09-06 02:00:05 +02001920 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001921 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001922 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001923static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001924unicode_aswidechar(PyUnicodeObject *unicode,
1925 wchar_t *w,
1926 Py_ssize_t size)
1927{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001928 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 const wchar_t *wstr;
1930
1931 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1932 if (wstr == NULL)
1933 return -1;
1934
Victor Stinner5593d8a2010-10-02 11:11:27 +00001935 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001936 if (size > res)
1937 size = res + 1;
1938 else
1939 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001941 return res;
1942 }
1943 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001945}
1946
1947Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001948PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001949 wchar_t *w,
1950 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951{
1952 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 PyErr_BadInternalCall();
1954 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001956 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957}
1958
Victor Stinner137c34c2010-09-29 10:25:54 +00001959wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001960PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001961 Py_ssize_t *size)
1962{
1963 wchar_t* buffer;
1964 Py_ssize_t buflen;
1965
1966 if (unicode == NULL) {
1967 PyErr_BadInternalCall();
1968 return NULL;
1969 }
1970
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001971 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (buflen == -1)
1973 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001974 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001975 PyErr_NoMemory();
1976 return NULL;
1977 }
1978
Victor Stinner137c34c2010-09-29 10:25:54 +00001979 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1980 if (buffer == NULL) {
1981 PyErr_NoMemory();
1982 return NULL;
1983 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001984 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (buflen == -1)
1986 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001987 if (size != NULL)
1988 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001989 return buffer;
1990}
1991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993
Alexander Belopolsky40018472011-02-26 01:02:56 +00001994PyObject *
1995PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001998 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001999 PyErr_SetString(PyExc_ValueError,
2000 "chr() arg not in range(0x110000)");
2001 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002002 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 if (ordinal < 256)
2005 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 v = PyUnicode_New(1, ordinal);
2008 if (v == NULL)
2009 return NULL;
2010 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2011 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002012}
2013
Alexander Belopolsky40018472011-02-26 01:02:56 +00002014PyObject *
2015PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002017 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002018 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002019 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002020 Py_INCREF(obj);
2021 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002022 }
2023 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002024 /* For a Unicode subtype that's not a Unicode object,
2025 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (PyUnicode_READY(obj) == -1)
2027 return NULL;
2028 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002029 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002030 PyErr_Format(PyExc_TypeError,
2031 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002032 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002033 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002034}
2035
Alexander Belopolsky40018472011-02-26 01:02:56 +00002036PyObject *
2037PyUnicode_FromEncodedObject(register PyObject *obj,
2038 const char *encoding,
2039 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002040{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002041 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002042 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002043
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 PyErr_BadInternalCall();
2046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002048
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002049 /* Decoding bytes objects is the most common case and should be fast */
2050 if (PyBytes_Check(obj)) {
2051 if (PyBytes_GET_SIZE(obj) == 0) {
2052 Py_INCREF(unicode_empty);
2053 v = (PyObject *) unicode_empty;
2054 }
2055 else {
2056 v = PyUnicode_Decode(
2057 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2058 encoding, errors);
2059 }
2060 return v;
2061 }
2062
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002063 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002064 PyErr_SetString(PyExc_TypeError,
2065 "decoding str is not supported");
2066 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002067 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002069 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2070 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2071 PyErr_Format(PyExc_TypeError,
2072 "coercing to str: need bytes, bytearray "
2073 "or buffer-like object, %.80s found",
2074 Py_TYPE(obj)->tp_name);
2075 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002076 }
Tim Petersced69f82003-09-16 20:30:58 +00002077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002078 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002079 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002080 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 }
Tim Petersced69f82003-09-16 20:30:58 +00002082 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002083 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002085 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002086 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087}
2088
/* Write a normalized copy of an encoding name into lower: upper case
   letters become lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognized as utf-8.  lower_len is the capacity of lower in bytes,
   terminating null included.

   Returns 1 on success, with lower null-terminated.  Returns 0 if the
   encoding name is longer than lower_len-1 characters; in that case lower
   holds a truncated copy that is not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        char ch = encoding[pos];

        /* The last slot is reserved for the terminating null. */
        if (pos == lower_len - 1)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
2121
Alexander Belopolsky40018472011-02-26 01:02:56 +00002122PyObject *
2123PyUnicode_Decode(const char *s,
2124 Py_ssize_t size,
2125 const char *encoding,
2126 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002127{
2128 PyObject *buffer = NULL, *unicode;
2129 Py_buffer info;
2130 char lower[11]; /* Enough for any encoding shortcut */
2131
2132 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002133 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002134
2135 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002136 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002137 if ((strcmp(lower, "utf-8") == 0) ||
2138 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002139 return PyUnicode_DecodeUTF8(s, size, errors);
2140 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002141 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002142 (strcmp(lower, "iso-8859-1") == 0))
2143 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002144#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002145 else if (strcmp(lower, "mbcs") == 0)
2146 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002147#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002148 else if (strcmp(lower, "ascii") == 0)
2149 return PyUnicode_DecodeASCII(s, size, errors);
2150 else if (strcmp(lower, "utf-16") == 0)
2151 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2152 else if (strcmp(lower, "utf-32") == 0)
2153 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
2156 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002157 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002158 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002159 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002160 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 if (buffer == NULL)
2162 goto onError;
2163 unicode = PyCodec_Decode(buffer, encoding, errors);
2164 if (unicode == NULL)
2165 goto onError;
2166 if (!PyUnicode_Check(unicode)) {
2167 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002168 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002169 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 Py_DECREF(unicode);
2171 goto onError;
2172 }
2173 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 if (PyUnicode_READY(unicode)) {
2175 Py_DECREF(unicode);
2176 return NULL;
2177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002179
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 Py_XDECREF(buffer);
2182 return NULL;
2183}
2184
Alexander Belopolsky40018472011-02-26 01:02:56 +00002185PyObject *
2186PyUnicode_AsDecodedObject(PyObject *unicode,
2187 const char *encoding,
2188 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002189{
2190 PyObject *v;
2191
2192 if (!PyUnicode_Check(unicode)) {
2193 PyErr_BadArgument();
2194 goto onError;
2195 }
2196
2197 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002199
2200 /* Decode via the codec registry */
2201 v = PyCodec_Decode(unicode, encoding, errors);
2202 if (v == NULL)
2203 goto onError;
2204 return v;
2205
Benjamin Peterson29060642009-01-31 22:14:21 +00002206 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002207 return NULL;
2208}
2209
Alexander Belopolsky40018472011-02-26 01:02:56 +00002210PyObject *
2211PyUnicode_AsDecodedUnicode(PyObject *unicode,
2212 const char *encoding,
2213 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002214{
2215 PyObject *v;
2216
2217 if (!PyUnicode_Check(unicode)) {
2218 PyErr_BadArgument();
2219 goto onError;
2220 }
2221
2222 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002223 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002224
2225 /* Decode via the codec registry */
2226 v = PyCodec_Decode(unicode, encoding, errors);
2227 if (v == NULL)
2228 goto onError;
2229 if (!PyUnicode_Check(v)) {
2230 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002231 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002232 Py_TYPE(v)->tp_name);
2233 Py_DECREF(v);
2234 goto onError;
2235 }
2236 return v;
2237
Benjamin Peterson29060642009-01-31 22:14:21 +00002238 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002239 return NULL;
2240}
2241
Alexander Belopolsky40018472011-02-26 01:02:56 +00002242PyObject *
2243PyUnicode_Encode(const Py_UNICODE *s,
2244 Py_ssize_t size,
2245 const char *encoding,
2246 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247{
2248 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002249
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 unicode = PyUnicode_FromUnicode(s, size);
2251 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2254 Py_DECREF(unicode);
2255 return v;
2256}
2257
Alexander Belopolsky40018472011-02-26 01:02:56 +00002258PyObject *
2259PyUnicode_AsEncodedObject(PyObject *unicode,
2260 const char *encoding,
2261 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002262{
2263 PyObject *v;
2264
2265 if (!PyUnicode_Check(unicode)) {
2266 PyErr_BadArgument();
2267 goto onError;
2268 }
2269
2270 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002271 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002272
2273 /* Encode via the codec registry */
2274 v = PyCodec_Encode(unicode, encoding, errors);
2275 if (v == NULL)
2276 goto onError;
2277 return v;
2278
Benjamin Peterson29060642009-01-31 22:14:21 +00002279 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002280 return NULL;
2281}
2282
Victor Stinnerad158722010-10-27 00:25:46 +00002283PyObject *
2284PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002285{
Victor Stinner99b95382011-07-04 14:23:54 +02002286#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002287 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2288 PyUnicode_GET_SIZE(unicode),
2289 NULL);
2290#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002292#else
Victor Stinner793b5312011-04-27 00:24:21 +02002293 PyInterpreterState *interp = PyThreadState_GET()->interp;
2294 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2295 cannot use it to encode and decode filenames before it is loaded. Load
2296 the Python codec requires to encode at least its own filename. Use the C
2297 version of the locale codec until the codec registry is initialized and
2298 the Python codec is loaded.
2299
2300 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2301 cannot only rely on it: check also interp->fscodec_initialized for
2302 subinterpreters. */
2303 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002304 return PyUnicode_AsEncodedString(unicode,
2305 Py_FileSystemDefaultEncoding,
2306 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002307 }
2308 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002309 /* locale encoding with surrogateescape */
2310 wchar_t *wchar;
2311 char *bytes;
2312 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002313 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002314
2315 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2316 if (wchar == NULL)
2317 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002318 bytes = _Py_wchar2char(wchar, &error_pos);
2319 if (bytes == NULL) {
2320 if (error_pos != (size_t)-1) {
2321 char *errmsg = strerror(errno);
2322 PyObject *exc = NULL;
2323 if (errmsg == NULL)
2324 errmsg = "Py_wchar2char() failed";
2325 raise_encode_exception(&exc,
2326 "filesystemencoding",
2327 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2328 error_pos, error_pos+1,
2329 errmsg);
2330 Py_XDECREF(exc);
2331 }
2332 else
2333 PyErr_NoMemory();
2334 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002335 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002336 }
2337 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002338
2339 bytes_obj = PyBytes_FromString(bytes);
2340 PyMem_Free(bytes);
2341 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002342 }
Victor Stinnerad158722010-10-27 00:25:46 +00002343#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002344}
2345
Alexander Belopolsky40018472011-02-26 01:02:56 +00002346PyObject *
2347PyUnicode_AsEncodedString(PyObject *unicode,
2348 const char *encoding,
2349 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350{
2351 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002352 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002353
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354 if (!PyUnicode_Check(unicode)) {
2355 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 }
Fred Drakee4315f52000-05-09 19:53:39 +00002358
Victor Stinner2f283c22011-03-02 01:21:46 +00002359 if (encoding == NULL) {
2360 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002362 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002364 }
Fred Drakee4315f52000-05-09 19:53:39 +00002365
2366 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002367 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002368 if ((strcmp(lower, "utf-8") == 0) ||
2369 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002370 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002371 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002373 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002375 }
Victor Stinner37296e82010-06-10 13:36:23 +00002376 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002377 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002378 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002380#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002381 else if (strcmp(lower, "mbcs") == 0)
2382 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2383 PyUnicode_GET_SIZE(unicode),
2384 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002385#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002386 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389
2390 /* Encode via the codec registry */
2391 v = PyCodec_Encode(unicode, encoding, errors);
2392 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002393 return NULL;
2394
2395 /* The normal path */
2396 if (PyBytes_Check(v))
2397 return v;
2398
2399 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002400 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002401 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002402 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002403
2404 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2405 "encoder %s returned bytearray instead of bytes",
2406 encoding);
2407 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002408 Py_DECREF(v);
2409 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002410 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002411
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002412 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2413 Py_DECREF(v);
2414 return b;
2415 }
2416
2417 PyErr_Format(PyExc_TypeError,
2418 "encoder did not return a bytes object (type=%.400s)",
2419 Py_TYPE(v)->tp_name);
2420 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002421 return NULL;
2422}
2423
Alexander Belopolsky40018472011-02-26 01:02:56 +00002424PyObject *
2425PyUnicode_AsEncodedUnicode(PyObject *unicode,
2426 const char *encoding,
2427 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002428{
2429 PyObject *v;
2430
2431 if (!PyUnicode_Check(unicode)) {
2432 PyErr_BadArgument();
2433 goto onError;
2434 }
2435
2436 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002437 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002438
2439 /* Encode via the codec registry */
2440 v = PyCodec_Encode(unicode, encoding, errors);
2441 if (v == NULL)
2442 goto onError;
2443 if (!PyUnicode_Check(v)) {
2444 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002445 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002446 Py_TYPE(v)->tp_name);
2447 Py_DECREF(v);
2448 goto onError;
2449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002451
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 return NULL;
2454}
2455
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002456PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002457PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002458 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002459 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2460}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002461
Christian Heimes5894ba72007-11-04 11:43:14 +00002462PyObject*
2463PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2464{
Victor Stinner99b95382011-07-04 14:23:54 +02002465#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002466 return PyUnicode_DecodeMBCS(s, size, NULL);
2467#elif defined(__APPLE__)
2468 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2469#else
Victor Stinner793b5312011-04-27 00:24:21 +02002470 PyInterpreterState *interp = PyThreadState_GET()->interp;
2471 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2472 cannot use it to encode and decode filenames before it is loaded. Load
2473 the Python codec requires to encode at least its own filename. Use the C
2474 version of the locale codec until the codec registry is initialized and
2475 the Python codec is loaded.
2476
2477 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2478 cannot only rely on it: check also interp->fscodec_initialized for
2479 subinterpreters. */
2480 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002481 return PyUnicode_Decode(s, size,
2482 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002483 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002484 }
2485 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002486 /* locale encoding with surrogateescape */
2487 wchar_t *wchar;
2488 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002489 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002490
2491 if (s[size] != '\0' || size != strlen(s)) {
2492 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2493 return NULL;
2494 }
2495
Victor Stinner168e1172010-10-16 23:16:16 +00002496 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002497 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002498 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002499
Victor Stinner168e1172010-10-16 23:16:16 +00002500 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002501 PyMem_Free(wchar);
2502 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002503 }
Victor Stinnerad158722010-10-27 00:25:46 +00002504#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002505}
2506
Martin v. Löwis011e8422009-05-05 04:43:17 +00002507
2508int
2509PyUnicode_FSConverter(PyObject* arg, void* addr)
2510{
2511 PyObject *output = NULL;
2512 Py_ssize_t size;
2513 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002514 if (arg == NULL) {
2515 Py_DECREF(*(PyObject**)addr);
2516 return 1;
2517 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002518 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002519 output = arg;
2520 Py_INCREF(output);
2521 }
2522 else {
2523 arg = PyUnicode_FromObject(arg);
2524 if (!arg)
2525 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002526 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002527 Py_DECREF(arg);
2528 if (!output)
2529 return 0;
2530 if (!PyBytes_Check(output)) {
2531 Py_DECREF(output);
2532 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2533 return 0;
2534 }
2535 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002536 size = PyBytes_GET_SIZE(output);
2537 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002538 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002539 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002540 Py_DECREF(output);
2541 return 0;
2542 }
2543 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002544 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002545}
2546
2547
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002548int
2549PyUnicode_FSDecoder(PyObject* arg, void* addr)
2550{
2551 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002552 if (arg == NULL) {
2553 Py_DECREF(*(PyObject**)addr);
2554 return 1;
2555 }
2556 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 if (PyUnicode_READY(arg))
2558 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002559 output = arg;
2560 Py_INCREF(output);
2561 }
2562 else {
2563 arg = PyBytes_FromObject(arg);
2564 if (!arg)
2565 return 0;
2566 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2567 PyBytes_GET_SIZE(arg));
2568 Py_DECREF(arg);
2569 if (!output)
2570 return 0;
2571 if (!PyUnicode_Check(output)) {
2572 Py_DECREF(output);
2573 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2574 return 0;
2575 }
2576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2578 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002579 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2580 Py_DECREF(output);
2581 return 0;
2582 }
2583 *(PyObject**)addr = output;
2584 return Py_CLEANUP_SUPPORTED;
2585}
2586
2587
Martin v. Löwis5b222132007-06-10 09:51:05 +00002588char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002590{
Christian Heimesf3863112007-11-22 07:46:41 +00002591 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2593
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002594 if (!PyUnicode_Check(unicode)) {
2595 PyErr_BadArgument();
2596 return NULL;
2597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002599 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600
2601 if (_PyUnicode_UTF8(unicode) == NULL) {
2602 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2603 if (bytes == NULL)
2604 return NULL;
2605 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2606 if (u->_base.utf8 == NULL) {
2607 Py_DECREF(bytes);
2608 return NULL;
2609 }
2610 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2611 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2612 Py_DECREF(bytes);
2613 }
2614
2615 if (psize)
2616 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2617 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002618}
2619
2620char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2624}
2625
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of an object. */
#if defined(Py_DEBUG)
int unicode_as_unicode_calls = 0;
#endif
2629
2630
2631Py_UNICODE *
2632PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2633{
2634 PyUnicodeObject *u;
2635 const unsigned char *one_byte;
2636#if SIZEOF_WCHAR_T == 4
2637 const Py_UCS2 *two_bytes;
2638#else
2639 const Py_UCS4 *four_bytes;
2640 const Py_UCS4 *ucs4_end;
2641 Py_ssize_t num_surrogates;
2642#endif
2643 wchar_t *w;
2644 wchar_t *wchar_end;
2645
2646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 return NULL;
2649 }
2650 u = (PyUnicodeObject*)unicode;
2651 if (_PyUnicode_WSTR(u) == NULL) {
2652 /* Non-ASCII compact unicode object */
2653 assert(_PyUnicode_KIND(u) != 0);
2654 assert(PyUnicode_IS_READY(u));
2655
2656#ifdef Py_DEBUG
2657 ++unicode_as_unicode_calls;
2658#endif
2659
2660 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2661#if SIZEOF_WCHAR_T == 2
2662 four_bytes = PyUnicode_4BYTE_DATA(u);
2663 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2664 num_surrogates = 0;
2665
2666 for (; four_bytes < ucs4_end; ++four_bytes) {
2667 if (*four_bytes > 0xFFFF)
2668 ++num_surrogates;
2669 }
2670
2671 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2672 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2673 if (!_PyUnicode_WSTR(u)) {
2674 PyErr_NoMemory();
2675 return NULL;
2676 }
2677 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2678
2679 w = _PyUnicode_WSTR(u);
2680 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2681 four_bytes = PyUnicode_4BYTE_DATA(u);
2682 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2683 if (*four_bytes > 0xFFFF) {
2684 /* encode surrogate pair in this case */
2685 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2686 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2687 }
2688 else
2689 *w = *four_bytes;
2690
2691 if (w > wchar_end) {
2692 assert(0 && "Miscalculated string end");
2693 }
2694 }
2695 *w = 0;
2696#else
2697 /* sizeof(wchar_t) == 4 */
2698 Py_FatalError("Impossible unicode object state, wstr and str "
2699 "should share memory already.");
2700 return NULL;
2701#endif
2702 }
2703 else {
2704 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2705 (_PyUnicode_LENGTH(u) + 1));
2706 if (!_PyUnicode_WSTR(u)) {
2707 PyErr_NoMemory();
2708 return NULL;
2709 }
2710 if (!PyUnicode_IS_COMPACT_ASCII(u))
2711 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2712 w = _PyUnicode_WSTR(u);
2713 wchar_end = w + _PyUnicode_LENGTH(u);
2714
2715 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2716 one_byte = PyUnicode_1BYTE_DATA(u);
2717 for (; w < wchar_end; ++one_byte, ++w)
2718 *w = *one_byte;
2719 /* null-terminate the wstr */
2720 *w = 0;
2721 }
2722 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2723#if SIZEOF_WCHAR_T == 4
2724 two_bytes = PyUnicode_2BYTE_DATA(u);
2725 for (; w < wchar_end; ++two_bytes, ++w)
2726 *w = *two_bytes;
2727 /* null-terminate the wstr */
2728 *w = 0;
2729#else
2730 /* sizeof(wchar_t) == 2 */
2731 PyObject_FREE(_PyUnicode_WSTR(u));
2732 _PyUnicode_WSTR(u) = NULL;
2733 Py_FatalError("Impossible unicode object state, wstr "
2734 "and str should share memory already.");
2735 return NULL;
2736#endif
2737 }
2738 else {
2739 assert(0 && "This should never happen.");
2740 }
2741 }
2742 }
2743 if (size != NULL)
2744 *size = PyUnicode_WSTR_LENGTH(u);
2745 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002746}
2747
Alexander Belopolsky40018472011-02-26 01:02:56 +00002748Py_UNICODE *
2749PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754
Alexander Belopolsky40018472011-02-26 01:02:56 +00002755Py_ssize_t
2756PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
2758 if (!PyUnicode_Check(unicode)) {
2759 PyErr_BadArgument();
2760 goto onError;
2761 }
2762 return PyUnicode_GET_SIZE(unicode);
2763
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 return -1;
2766}
2767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768Py_ssize_t
2769PyUnicode_GetLength(PyObject *unicode)
2770{
2771 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2772 PyErr_BadArgument();
2773 return -1;
2774 }
2775
2776 return PyUnicode_GET_LENGTH(unicode);
2777}
2778
2779Py_UCS4
2780PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2781{
2782 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2783 return PyErr_BadArgument();
2784 return (Py_UCS4)-1;
2785 }
2786 return PyUnicode_READ_CHAR(unicode, index);
2787}
2788
2789int
2790PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2791{
2792 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2793 return PyErr_BadArgument();
2794 return -1;
2795 }
2796
2797 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2798 index, ch);
2799 return 0;
2800}
2801
/* Return the name of the default encoding ("utf-8").  The string has
   static storage duration; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2807
Victor Stinner554f3f02010-06-16 23:33:54 +00002808/* create or adjust a UnicodeDecodeError */
2809static void
2810make_decode_exception(PyObject **exceptionObject,
2811 const char *encoding,
2812 const char *input, Py_ssize_t length,
2813 Py_ssize_t startpos, Py_ssize_t endpos,
2814 const char *reason)
2815{
2816 if (*exceptionObject == NULL) {
2817 *exceptionObject = PyUnicodeDecodeError_Create(
2818 encoding, input, length, startpos, endpos, reason);
2819 }
2820 else {
2821 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2822 goto onError;
2823 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2824 goto onError;
2825 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2826 goto onError;
2827 }
2828 return;
2829
2830onError:
2831 Py_DECREF(*exceptionObject);
2832 *exceptionObject = NULL;
2833}
2834
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835/* error handling callback helper:
2836 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002837 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 and adjust various state variables.
2839 return 0 on success, -1 on error
2840*/
2841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842static int
2843unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2844 const char *encoding, const char *reason,
2845 const char **input, const char **inend, Py_ssize_t *startinpos,
2846 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2847 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002849 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850
2851 PyObject *restuple = NULL;
2852 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002854 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002855 Py_ssize_t requiredsize;
2856 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002857 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002858 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002859 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002860 int res = -1;
2861
2862 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 *errorHandler = PyCodec_LookupError(errors);
2864 if (*errorHandler == NULL)
2865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002866 }
2867
Victor Stinner554f3f02010-06-16 23:33:54 +00002868 make_decode_exception(exceptionObject,
2869 encoding,
2870 *input, *inend - *input,
2871 *startinpos, *endinpos,
2872 reason);
2873 if (*exceptionObject == NULL)
2874 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875
2876 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2877 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002880 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 }
2883 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002885
2886 /* Copy back the bytes variables, which might have been modified by the
2887 callback */
2888 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2889 if (!inputobj)
2890 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002891 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002893 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002894 *input = PyBytes_AS_STRING(inputobj);
2895 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002896 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002897 /* we can DECREF safely, as the exception has another reference,
2898 so the object won't go away. */
2899 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002900
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002901 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002903 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2905 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907
2908 /* need more space? (at least enough for what we
2909 have+the replacement+the rest of the string (starting
2910 at the new input position), so we won't have to check space
2911 when there are no errors in the rest of the string) */
2912 repptr = PyUnicode_AS_UNICODE(repunicode);
2913 repsize = PyUnicode_GET_SIZE(repunicode);
2914 requiredsize = *outpos + repsize + insize-newpos;
2915 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002916 if (requiredsize<2*outsize)
2917 requiredsize = 2*outsize;
2918 if (_PyUnicode_Resize(output, requiredsize) < 0)
2919 goto onError;
2920 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 }
2922 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002923 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 Py_UNICODE_COPY(*outptr, repptr, repsize);
2925 *outptr += repsize;
2926 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 /* we made it! */
2929 res = 0;
2930
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 Py_XDECREF(restuple);
2933 return res;
2934}
2935
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in range 1..127 to be encoded directly; the range test comes
 * first so that utf7_category is never indexed out of bounds.  The
 * directO and directWS arguments are parenthesized so that expression
 * arguments (e.g. "a || b") combine correctly with the && below. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_DecodeUTF7(const char *s,
3019 Py_ssize_t size,
3020 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003021{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003022 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3023}
3024
Antoine Pitrou244651a2009-05-04 18:56:13 +00003025/* The decoder. The only state we preserve is our read position,
3026 * i.e. how many characters we have consumed. So if we end in the
3027 * middle of a shift sequence we have to back off the read position
3028 * and the output to the beginning of the sequence, otherwise we lose
3029 * all the shift state (seen bits, number of bits seen, high
3030 * surrogate). */
3031
Alexander Belopolsky40018472011-02-26 01:02:56 +00003032PyObject *
3033PyUnicode_DecodeUTF7Stateful(const char *s,
3034 Py_ssize_t size,
3035 const char *errors,
3036 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003037{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003039 Py_ssize_t startinpos;
3040 Py_ssize_t endinpos;
3041 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003042 const char *e;
3043 PyUnicodeObject *unicode;
3044 Py_UNICODE *p;
3045 const char *errmsg = "";
3046 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003047 Py_UNICODE *shiftOutStart;
3048 unsigned int base64bits = 0;
3049 unsigned long base64buffer = 0;
3050 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 PyObject *errorHandler = NULL;
3052 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003053
3054 unicode = _PyUnicode_New(size);
3055 if (!unicode)
3056 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003057 if (size == 0) {
3058 if (consumed)
3059 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003060 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003061 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003063 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003064 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003065 e = s + size;
3066
3067 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003070 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003071
Antoine Pitrou244651a2009-05-04 18:56:13 +00003072 if (inShift) { /* in a base-64 section */
3073 if (IS_BASE64(ch)) { /* consume a base-64 character */
3074 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3075 base64bits += 6;
3076 s++;
3077 if (base64bits >= 16) {
3078 /* we have enough bits for a UTF-16 value */
3079 Py_UNICODE outCh = (Py_UNICODE)
3080 (base64buffer >> (base64bits-16));
3081 base64bits -= 16;
3082 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3083 if (surrogate) {
3084 /* expecting a second surrogate */
3085 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3086#ifdef Py_UNICODE_WIDE
3087 *p++ = (((surrogate & 0x3FF)<<10)
3088 | (outCh & 0x3FF)) + 0x10000;
3089#else
3090 *p++ = surrogate;
3091 *p++ = outCh;
3092#endif
3093 surrogate = 0;
3094 }
3095 else {
3096 surrogate = 0;
3097 errmsg = "second surrogate missing";
3098 goto utf7Error;
3099 }
3100 }
3101 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3102 /* first surrogate */
3103 surrogate = outCh;
3104 }
3105 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3106 errmsg = "unexpected second surrogate";
3107 goto utf7Error;
3108 }
3109 else {
3110 *p++ = outCh;
3111 }
3112 }
3113 }
3114 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003115 inShift = 0;
3116 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003117 if (surrogate) {
3118 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003119 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003120 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003121 if (base64bits > 0) { /* left-over bits */
3122 if (base64bits >= 6) {
3123 /* We've seen at least one base-64 character */
3124 errmsg = "partial character in shift sequence";
3125 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003126 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003127 else {
3128 /* Some bits remain; they should be zero */
3129 if (base64buffer != 0) {
3130 errmsg = "non-zero padding bits in shift sequence";
3131 goto utf7Error;
3132 }
3133 }
3134 }
3135 if (ch != '-') {
3136 /* '-' is absorbed; other terminating
3137 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003138 *p++ = ch;
3139 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003140 }
3141 }
3142 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003144 s++; /* consume '+' */
3145 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003146 s++;
3147 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003148 }
3149 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003150 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003151 shiftOutStart = p;
3152 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 }
3154 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003155 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003156 *p++ = ch;
3157 s++;
3158 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003159 else {
3160 startinpos = s-starts;
3161 s++;
3162 errmsg = "unexpected special character";
3163 goto utf7Error;
3164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003165 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003166utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 outpos = p-PyUnicode_AS_UNICODE(unicode);
3168 endinpos = s-starts;
3169 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 errors, &errorHandler,
3171 "utf7", errmsg,
3172 &starts, &e, &startinpos, &endinpos, &exc, &s,
3173 &unicode, &outpos, &p))
3174 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003175 }
3176
Antoine Pitrou244651a2009-05-04 18:56:13 +00003177 /* end of string */
3178
3179 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3180 /* if we're in an inconsistent state, that's an error */
3181 if (surrogate ||
3182 (base64bits >= 6) ||
3183 (base64bits > 0 && base64buffer != 0)) {
3184 outpos = p-PyUnicode_AS_UNICODE(unicode);
3185 endinpos = size;
3186 if (unicode_decode_call_errorhandler(
3187 errors, &errorHandler,
3188 "utf7", "unterminated shift sequence",
3189 &starts, &e, &startinpos, &endinpos, &exc, &s,
3190 &unicode, &outpos, &p))
3191 goto onError;
3192 if (s < e)
3193 goto restart;
3194 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003195 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003196
3197 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003198 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003199 if (inShift) {
3200 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003201 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003202 }
3203 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003204 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003205 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003206 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003207
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003208 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003209 goto onError;
3210
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 Py_XDECREF(errorHandler);
3212 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003213 if (PyUnicode_READY(unicode) == -1) {
3214 Py_DECREF(unicode);
3215 return NULL;
3216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 return (PyObject *)unicode;
3218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 Py_XDECREF(errorHandler);
3221 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003222 Py_DECREF(unicode);
3223 return NULL;
3224}
3225
3226
Alexander Belopolsky40018472011-02-26 01:02:56 +00003227PyObject *
3228PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3229 Py_ssize_t size,
3230 int base64SetO,
3231 int base64WhiteSpace,
3232 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003233{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003234 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003236 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003237 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003238 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003239 unsigned int base64bits = 0;
3240 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003241 char * out;
3242 char * start;
3243
3244 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003246
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003247 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003248 return PyErr_NoMemory();
3249
Antoine Pitrou244651a2009-05-04 18:56:13 +00003250 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003251 if (v == NULL)
3252 return NULL;
3253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003254 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003255 for (;i < size; ++i) {
3256 Py_UNICODE ch = s[i];
3257
Antoine Pitrou244651a2009-05-04 18:56:13 +00003258 if (inShift) {
3259 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3260 /* shifting out */
3261 if (base64bits) { /* output remaining bits */
3262 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3263 base64buffer = 0;
3264 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003265 }
3266 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003267 /* Characters not in the BASE64 set implicitly unshift the sequence
3268 so no '-' is required, except if the character is itself a '-' */
3269 if (IS_BASE64(ch) || ch == '-') {
3270 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003271 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003272 *out++ = (char) ch;
3273 }
3274 else {
3275 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003276 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003277 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003278 else { /* not in a shift sequence */
3279 if (ch == '+') {
3280 *out++ = '+';
3281 *out++ = '-';
3282 }
3283 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3284 *out++ = (char) ch;
3285 }
3286 else {
3287 *out++ = '+';
3288 inShift = 1;
3289 goto encode_char;
3290 }
3291 }
3292 continue;
3293encode_char:
3294#ifdef Py_UNICODE_WIDE
3295 if (ch >= 0x10000) {
3296 /* code first surrogate */
3297 base64bits += 16;
3298 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3299 while (base64bits >= 6) {
3300 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3301 base64bits -= 6;
3302 }
3303 /* prepare second surrogate */
3304 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3305 }
3306#endif
3307 base64bits += 16;
3308 base64buffer = (base64buffer << 16) | ch;
3309 while (base64bits >= 6) {
3310 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3311 base64bits -= 6;
3312 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003314 if (base64bits)
3315 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3316 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003317 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003318 if (_PyBytes_Resize(&v, out - start) < 0)
3319 return NULL;
3320 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003321}
3322
Antoine Pitrou244651a2009-05-04 18:56:13 +00003323#undef IS_BASE64
3324#undef FROM_BASE64
3325#undef TO_BASE64
3326#undef DECODE_DIRECT
3327#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003328
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329/* --- UTF-8 Codec -------------------------------------------------------- */
3330
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong lead bytes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3352
Alexander Belopolsky40018472011-02-26 01:02:56 +00003353PyObject *
3354PyUnicode_DecodeUTF8(const char *s,
3355 Py_ssize_t size,
3356 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357{
Walter Dörwald69652032004-09-07 20:24:22 +00003358 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3359}
3360
Antoine Pitrouab868312009-01-10 15:40:25 +00003361/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3362#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3363
3364/* Mask to quickly check whether a C 'long' contains a
3365 non-ASCII, UTF8-encoded char. */
3366#if (SIZEOF_LONG == 8)
3367# define ASCII_CHAR_MASK 0x8080808080808080L
3368#elif (SIZEOF_LONG == 4)
3369# define ASCII_CHAR_MASK 0x80808080L
3370#else
3371# error C 'long' size should be either 4 or 8!
3372#endif
3373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003374/* Scans a UTF-8 string and returns the maximum character to be expected,
3375 the size of the decoded unicode string and if any major errors were
3376 encountered.
3377
3378 This function does check basic UTF-8 sanity, it does however NOT CHECK
3379 if the string contains surrogates, and if all continuation bytes are
3380 within the correct ranges, these checks are performed in
3381 PyUnicode_DecodeUTF8Stateful.
3382
3383 If it sets has_errors to 1, it means the value of unicode_size and max_char
3384 will be bogus and you should not rely on useful information in them.
3385 */
3386static Py_UCS4
3387utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3388 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3389 int *has_errors)
3390{
3391 Py_ssize_t n;
3392 Py_ssize_t char_count = 0;
3393 Py_UCS4 max_char = 127, new_max;
3394 Py_UCS4 upper_bound;
3395 const unsigned char *p = (const unsigned char *)s;
3396 const unsigned char *end = p + string_size;
3397 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3398 int err = 0;
3399
3400 for (; p < end && !err; ++p, ++char_count) {
3401 /* Only check value if it's not a ASCII char... */
3402 if (*p < 0x80) {
3403 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3404 an explanation. */
3405 if (!((size_t) p & LONG_PTR_MASK)) {
3406 /* Help register allocation */
3407 register const unsigned char *_p = p;
3408 while (_p < aligned_end) {
3409 unsigned long value = *(unsigned long *) _p;
3410 if (value & ASCII_CHAR_MASK)
3411 break;
3412 _p += SIZEOF_LONG;
3413 char_count += SIZEOF_LONG;
3414 }
3415 p = _p;
3416 if (p == end)
3417 break;
3418 }
3419 }
3420 if (*p >= 0x80) {
3421 n = utf8_code_length[*p];
3422 new_max = max_char;
3423 switch (n) {
3424 /* invalid start byte */
3425 case 0:
3426 err = 1;
3427 break;
3428 case 2:
3429 /* Code points between 0x00FF and 0x07FF inclusive.
3430 Approximate the upper bound of the code point,
3431 if this flips over 255 we can be sure it will be more
3432 than 255 and the string will need 2 bytes per code coint,
3433 if it stays under or equal to 255, we can be sure 1 byte
3434 is enough.
3435 ((*p & 0b00011111) << 6) | 0b00111111 */
3436 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3437 if (max_char < upper_bound)
3438 new_max = upper_bound;
3439 /* Ensure we track at least that we left ASCII space. */
3440 if (new_max < 128)
3441 new_max = 128;
3442 break;
3443 case 3:
3444 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3445 always > 255 and <= 65535 and will always need 2 bytes. */
3446 if (max_char < 65535)
3447 new_max = 65535;
3448 break;
3449 case 4:
3450 /* Code point will be above 0xFFFF for sure in this case. */
3451 new_max = 65537;
3452 break;
3453 /* Internal error, this should be caught by the first if */
3454 case 1:
3455 default:
3456 assert(0 && "Impossible case in utf8_max_char_and_size");
3457 err = 1;
3458 }
3459 /* Instead of number of overall bytes for this code point,
3460 n containts the number of following bytes: */
3461 --n;
3462 /* Check if the follow up chars are all valid continuation bytes */
3463 if (n >= 1) {
3464 const unsigned char *cont;
3465 if ((p + n) >= end) {
3466 if (consumed == 0)
3467 /* incomplete data, non-incremental decoding */
3468 err = 1;
3469 break;
3470 }
3471 for (cont = p + 1; cont < (p + n); ++cont) {
3472 if ((*cont & 0xc0) != 0x80) {
3473 err = 1;
3474 break;
3475 }
3476 }
3477 p += n;
3478 }
3479 else
3480 err = 1;
3481 max_char = new_max;
3482 }
3483 }
3484
3485 if (unicode_size)
3486 *unicode_size = char_count;
3487 if (has_errors)
3488 *has_errors = err;
3489 return max_char;
3490}
3491
/* Store value at position index of buf, interpreting buf according to
   kind.  Works like PyUnicode_WRITE, except that it also accepts
   PyUnicode_WCHAR_KIND, in which case buf is the Py_UNICODE (wstr) buffer
   of the legacy unicode representation.  Any kind that is not listed is
   written as Py_UCS4.  Only one branch runs, so each argument is evaluated
   exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((kind)) {                                                   \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3506
Alexander Belopolsky40018472011-02-26 01:02:56 +00003507PyObject *
3508PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509 Py_ssize_t size,
3510 const char *errors,
3511 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003515 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003516 Py_ssize_t startinpos;
3517 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003518 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003520 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 PyObject *errorHandler = NULL;
3522 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 Py_UCS4 maxchar = 0;
3524 Py_ssize_t unicode_size;
3525 Py_ssize_t i;
3526 int kind;
3527 void *data;
3528 int has_errors;
3529 Py_UNICODE *error_outptr;
3530#if SIZEOF_WCHAR_T == 2
3531 Py_ssize_t wchar_offset = 0;
3532#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533
Walter Dörwald69652032004-09-07 20:24:22 +00003534 if (size == 0) {
3535 if (consumed)
3536 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003539 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3540 consumed, &has_errors);
3541 if (has_errors) {
3542 unicode = _PyUnicode_New(size);
3543 if (!unicode)
3544 return NULL;
3545 kind = PyUnicode_WCHAR_KIND;
3546 data = PyUnicode_AS_UNICODE(unicode);
3547 assert(data != NULL);
3548 }
3549 else {
3550 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3551 if (!unicode)
3552 return NULL;
3553 /* When the string is ASCII only, just use memcpy and return.
3554 unicode_size may be != size if there is an incomplete UTF-8
3555 sequence at the end of the ASCII block. */
3556 if (maxchar < 128 && size == unicode_size) {
3557 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3558 return (PyObject *)unicode;
3559 }
3560 kind = PyUnicode_KIND(unicode);
3561 data = PyUnicode_DATA(unicode);
3562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003566 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567
3568 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003569 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570
3571 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003572 /* Fast path for runs of ASCII characters. Given that common UTF-8
3573 input will consist of an overwhelming majority of ASCII
3574 characters, we try to optimize for this case by checking
3575 as many characters as a C 'long' can contain.
3576 First, check if we can do an aligned read, as most CPUs have
3577 a penalty for unaligned reads.
3578 */
3579 if (!((size_t) s & LONG_PTR_MASK)) {
3580 /* Help register allocation */
3581 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003582 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003583 while (_s < aligned_end) {
3584 /* Read a whole long at a time (either 4 or 8 bytes),
3585 and do a fast unrolled copy if it only contains ASCII
3586 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003587 unsigned long value = *(unsigned long *) _s;
3588 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003589 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003590 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3591 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3592 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3593 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003594#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3596 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3597 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3598 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003599#endif
3600 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003601 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003602 }
3603 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003605 if (s == e)
3606 break;
3607 ch = (unsigned char)*s;
3608 }
3609 }
3610
3611 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003612 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 s++;
3614 continue;
3615 }
3616
3617 n = utf8_code_length[ch];
3618
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003619 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 if (consumed)
3621 break;
3622 else {
3623 errmsg = "unexpected end of data";
3624 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003625 endinpos = startinpos+1;
3626 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3627 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 goto utf8Error;
3629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
3632 switch (n) {
3633
3634 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003635 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 startinpos = s-starts;
3637 endinpos = startinpos+1;
3638 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639
3640 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003641 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 startinpos = s-starts;
3643 endinpos = startinpos+1;
3644 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645
3646 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003647 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003648 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003650 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 goto utf8Error;
3652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003654 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003655 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 break;
3657
3658 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003659 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3660 will result in surrogates in range d800-dfff. Surrogates are
3661 not valid UTF-8 so they are rejected.
3662 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3663 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003664 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003665 (s[2] & 0xc0) != 0x80 ||
3666 ((unsigned char)s[0] == 0xE0 &&
3667 (unsigned char)s[1] < 0xA0) ||
3668 ((unsigned char)s[0] == 0xED &&
3669 (unsigned char)s[1] > 0x9F)) {
3670 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003672 endinpos = startinpos + 1;
3673
3674 /* if s[1] first two bits are 1 and 0, then the invalid
3675 continuation byte is s[2], so increment endinpos by 1,
3676 if not, s[1] is invalid and endinpos doesn't need to
3677 be incremented. */
3678 if ((s[1] & 0xC0) == 0x80)
3679 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 goto utf8Error;
3681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003683 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003685 break;
3686
3687 case 4:
3688 if ((s[1] & 0xc0) != 0x80 ||
3689 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003690 (s[3] & 0xc0) != 0x80 ||
3691 ((unsigned char)s[0] == 0xF0 &&
3692 (unsigned char)s[1] < 0x90) ||
3693 ((unsigned char)s[0] == 0xF4 &&
3694 (unsigned char)s[1] > 0x8F)) {
3695 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003697 endinpos = startinpos + 1;
3698 if ((s[1] & 0xC0) == 0x80) {
3699 endinpos++;
3700 if ((s[2] & 0xC0) == 0x80)
3701 endinpos++;
3702 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 goto utf8Error;
3704 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003705 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003706 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3707 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 /* If the string is flexible or we have native UCS-4, write
3710 directly.. */
3711 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3712 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714 else {
3715 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 /* translate from 10000..10FFFF to 0..FFFF */
3718 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 /* high surrogate = top 10 bits added to D800 */
3721 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3722 (Py_UNICODE)(0xD800 + (ch >> 10)));
3723
3724 /* low surrogate = bottom 10 bits added to DC00 */
3725 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3726 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3727 }
3728#if SIZEOF_WCHAR_T == 2
3729 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003730#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 }
3733 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003735
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 /* If this is not yet a resizable string, make it one.. */
3738 if (kind != PyUnicode_WCHAR_KIND) {
3739 const Py_UNICODE *u;
3740 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3741 if (!new_unicode)
3742 goto onError;
3743 u = PyUnicode_AsUnicode((PyObject *)unicode);
3744 if (!u)
3745 goto onError;
3746#if SIZEOF_WCHAR_T == 2
3747 i += wchar_offset;
3748#endif
3749 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3750 Py_DECREF(unicode);
3751 unicode = new_unicode;
3752 kind = 0;
3753 data = PyUnicode_AS_UNICODE(new_unicode);
3754 assert(data != NULL);
3755 }
3756 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 if (unicode_decode_call_errorhandler(
3758 errors, &errorHandler,
3759 "utf8", errmsg,
3760 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 /* Update data because unicode_decode_call_errorhandler might have
3764 re-created or resized the unicode object. */
3765 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 /* Ensure the unicode_size calculation above was correct: */
3769 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3770
Walter Dörwald69652032004-09-07 20:24:22 +00003771 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 /* Adjust length and ready string when it contained errors and
3775 is of the old resizable kind. */
3776 if (kind == PyUnicode_WCHAR_KIND) {
3777 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3778 PyUnicode_READY(unicode) == -1)
3779 goto onError;
3780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 Py_XDECREF(errorHandler);
3783 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 if (PyUnicode_READY(unicode) == -1) {
3785 Py_DECREF(unicode);
3786 return NULL;
3787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 return (PyObject *)unicode;
3789
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 Py_XDECREF(errorHandler);
3792 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 Py_DECREF(unicode);
3794 return NULL;
3795}
3796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003798
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003799#ifdef __APPLE__
3800
3801/* Simplified UTF-8 decoder using surrogateescape error handler,
3802 used to decode the command line arguments on Mac OS X. */
3803
3804wchar_t*
3805_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3806{
3807 int n;
3808 const char *e;
3809 wchar_t *unicode, *p;
3810
3811 /* Note: size will always be longer than the resulting Unicode
3812 character count */
3813 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3814 PyErr_NoMemory();
3815 return NULL;
3816 }
3817 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3818 if (!unicode)
3819 return NULL;
3820
3821 /* Unpack UTF-8 encoded data */
3822 p = unicode;
3823 e = s + size;
3824 while (s < e) {
3825 Py_UCS4 ch = (unsigned char)*s;
3826
3827 if (ch < 0x80) {
3828 *p++ = (wchar_t)ch;
3829 s++;
3830 continue;
3831 }
3832
3833 n = utf8_code_length[ch];
3834 if (s + n > e) {
3835 goto surrogateescape;
3836 }
3837
3838 switch (n) {
3839 case 0:
3840 case 1:
3841 goto surrogateescape;
3842
3843 case 2:
3844 if ((s[1] & 0xc0) != 0x80)
3845 goto surrogateescape;
3846 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3847 assert ((ch > 0x007F) && (ch <= 0x07FF));
3848 *p++ = (wchar_t)ch;
3849 break;
3850
3851 case 3:
3852 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3853 will result in surrogates in range d800-dfff. Surrogates are
3854 not valid UTF-8 so they are rejected.
3855 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3856 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3857 if ((s[1] & 0xc0) != 0x80 ||
3858 (s[2] & 0xc0) != 0x80 ||
3859 ((unsigned char)s[0] == 0xE0 &&
3860 (unsigned char)s[1] < 0xA0) ||
3861 ((unsigned char)s[0] == 0xED &&
3862 (unsigned char)s[1] > 0x9F)) {
3863
3864 goto surrogateescape;
3865 }
3866 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3867 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003869 break;
3870
3871 case 4:
3872 if ((s[1] & 0xc0) != 0x80 ||
3873 (s[2] & 0xc0) != 0x80 ||
3874 (s[3] & 0xc0) != 0x80 ||
3875 ((unsigned char)s[0] == 0xF0 &&
3876 (unsigned char)s[1] < 0x90) ||
3877 ((unsigned char)s[0] == 0xF4 &&
3878 (unsigned char)s[1] > 0x8F)) {
3879 goto surrogateescape;
3880 }
3881 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3882 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3883 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3884
3885#if SIZEOF_WCHAR_T == 4
3886 *p++ = (wchar_t)ch;
3887#else
3888 /* compute and append the two surrogates: */
3889
3890 /* translate from 10000..10FFFF to 0..FFFF */
3891 ch -= 0x10000;
3892
3893 /* high surrogate = top 10 bits added to D800 */
3894 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3895
3896 /* low surrogate = bottom 10 bits added to DC00 */
3897 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3898#endif
3899 break;
3900 }
3901 s += n;
3902 continue;
3903
3904 surrogateescape:
3905 *p++ = 0xDC00 + ch;
3906 s++;
3907 }
3908 *p = L'\0';
3909 return unicode;
3910}
3911
3912#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914/* Primary internal function which creates utf8 encoded bytes objects.
3915
3916 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003917 and allocate exactly as much space needed at the end. Else allocate the
3918 maximum possible needed (4 result bytes per Unicode character), and return
3919 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003920*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923{
Tim Peters602f7402002-04-27 18:03:26 +00003924#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003925
Guido van Rossum98297ee2007-11-06 21:34:58 +00003926 Py_ssize_t i; /* index into s of next input byte */
3927 PyObject *result; /* result string object */
3928 char *p; /* next free byte in output buffer */
3929 Py_ssize_t nallocated; /* number of result bytes allocated */
3930 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003931 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003932 PyObject *errorHandler = NULL;
3933 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 int kind;
3935 void *data;
3936 Py_ssize_t size;
3937 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3938#if SIZEOF_WCHAR_T == 2
3939 Py_ssize_t wchar_offset = 0;
3940#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 if (!PyUnicode_Check(unicode)) {
3943 PyErr_BadArgument();
3944 return NULL;
3945 }
3946
3947 if (PyUnicode_READY(unicode) == -1)
3948 return NULL;
3949
3950 if (_PyUnicode_UTF8(unicode))
3951 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3952 _PyUnicode_UTF8_LENGTH(unicode));
3953
3954 kind = PyUnicode_KIND(unicode);
3955 data = PyUnicode_DATA(unicode);
3956 size = PyUnicode_GET_LENGTH(unicode);
3957
Tim Peters602f7402002-04-27 18:03:26 +00003958 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959
Tim Peters602f7402002-04-27 18:03:26 +00003960 if (size <= MAX_SHORT_UNICHARS) {
3961 /* Write into the stack buffer; nallocated can't overflow.
3962 * At the end, we'll allocate exactly as much heap space as it
3963 * turns out we need.
3964 */
3965 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003966 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003967 p = stackbuf;
3968 }
3969 else {
3970 /* Overallocate on the heap, and give the excess back at the end. */
3971 nallocated = size * 4;
3972 if (nallocated / 4 != size) /* overflow! */
3973 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003974 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003975 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003976 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003977 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003978 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003979
Tim Peters602f7402002-04-27 18:03:26 +00003980 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003982
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003983 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003984 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003986
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003988 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003989 *p++ = (char)(0xc0 | (ch >> 6));
3990 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003991 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 Py_ssize_t newpos;
3993 PyObject *rep;
3994 Py_ssize_t repsize, k, startpos;
3995 startpos = i-1;
3996#if SIZEOF_WCHAR_T == 2
3997 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003998#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 rep = unicode_encode_call_errorhandler(
4000 errors, &errorHandler, "utf-8", "surrogates not allowed",
4001 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4002 &exc, startpos, startpos+1, &newpos);
4003 if (!rep)
4004 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 if (PyBytes_Check(rep))
4007 repsize = PyBytes_GET_SIZE(rep);
4008 else
4009 repsize = PyUnicode_GET_SIZE(rep);
4010
4011 if (repsize > 4) {
4012 Py_ssize_t offset;
4013
4014 if (result == NULL)
4015 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004016 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4020 /* integer overflow */
4021 PyErr_NoMemory();
4022 goto error;
4023 }
4024 nallocated += repsize - 4;
4025 if (result != NULL) {
4026 if (_PyBytes_Resize(&result, nallocated) < 0)
4027 goto error;
4028 } else {
4029 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004030 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 goto error;
4032 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4033 }
4034 p = PyBytes_AS_STRING(result) + offset;
4035 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 if (PyBytes_Check(rep)) {
4038 char *prep = PyBytes_AS_STRING(rep);
4039 for(k = repsize; k > 0; k--)
4040 *p++ = *prep++;
4041 } else /* rep is unicode */ {
4042 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4043 Py_UNICODE c;
4044
4045 for(k=0; k<repsize; k++) {
4046 c = prep[k];
4047 if (0x80 <= c) {
4048 raise_encode_exception(&exc, "utf-8",
4049 PyUnicode_AS_UNICODE(unicode),
4050 size, i-1, i,
4051 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004052 goto error;
4053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004055 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004058 } else if (ch < 0x10000) {
4059 *p++ = (char)(0xe0 | (ch >> 12));
4060 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4061 *p++ = (char)(0x80 | (ch & 0x3f));
4062 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004063 /* Encode UCS4 Unicode ordinals */
4064 *p++ = (char)(0xf0 | (ch >> 18));
4065 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4067 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068#if SIZEOF_WCHAR_T == 2
4069 wchar_offset++;
4070#endif
Tim Peters602f7402002-04-27 18:03:26 +00004071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004073
Guido van Rossum98297ee2007-11-06 21:34:58 +00004074 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004075 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004076 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004077 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004078 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004079 }
4080 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004081 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004082 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004083 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004084 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004087 Py_XDECREF(errorHandler);
4088 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004089 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004090 error:
4091 Py_XDECREF(errorHandler);
4092 Py_XDECREF(exc);
4093 Py_XDECREF(result);
4094 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004095
Tim Peters602f7402002-04-27 18:03:26 +00004096#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097}
4098
Alexander Belopolsky40018472011-02-26 01:02:56 +00004099PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4101 Py_ssize_t size,
4102 const char *errors)
4103{
4104 PyObject *v, *unicode;
4105
4106 unicode = PyUnicode_FromUnicode(s, size);
4107 if (unicode == NULL)
4108 return NULL;
4109 v = _PyUnicode_AsUTF8String(unicode, errors);
4110 Py_DECREF(unicode);
4111 return v;
4112}
4113
4114PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004115PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118}
4119
Walter Dörwald41980ca2007-08-16 21:55:45 +00004120/* --- UTF-32 Codec ------------------------------------------------------- */
4121
4122PyObject *
4123PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 Py_ssize_t size,
4125 const char *errors,
4126 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004127{
4128 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4129}
4130
4131PyObject *
4132PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 Py_ssize_t size,
4134 const char *errors,
4135 int *byteorder,
4136 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004137{
4138 const char *starts = s;
4139 Py_ssize_t startinpos;
4140 Py_ssize_t endinpos;
4141 Py_ssize_t outpos;
4142 PyUnicodeObject *unicode;
4143 Py_UNICODE *p;
4144#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004145 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004146 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004147#else
4148 const int pairs = 0;
4149#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004150 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004151 int bo = 0; /* assume native ordering by default */
4152 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004153 /* Offsets from q for retrieving bytes in the right order. */
4154#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4155 int iorder[] = {0, 1, 2, 3};
4156#else
4157 int iorder[] = {3, 2, 1, 0};
4158#endif
4159 PyObject *errorHandler = NULL;
4160 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004161
Walter Dörwald41980ca2007-08-16 21:55:45 +00004162 q = (unsigned char *)s;
4163 e = q + size;
4164
4165 if (byteorder)
4166 bo = *byteorder;
4167
4168 /* Check for BOM marks (U+FEFF) in the input and adjust current
4169 byte order setting accordingly. In native mode, the leading BOM
4170 mark is skipped, in all other modes, it is copied to the output
4171 stream as-is (giving a ZWNBSP character). */
4172 if (bo == 0) {
4173 if (size >= 4) {
4174 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004176#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 if (bom == 0x0000FEFF) {
4178 q += 4;
4179 bo = -1;
4180 }
4181 else if (bom == 0xFFFE0000) {
4182 q += 4;
4183 bo = 1;
4184 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004185#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 if (bom == 0x0000FEFF) {
4187 q += 4;
4188 bo = 1;
4189 }
4190 else if (bom == 0xFFFE0000) {
4191 q += 4;
4192 bo = -1;
4193 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004194#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004196 }
4197
4198 if (bo == -1) {
4199 /* force LE */
4200 iorder[0] = 0;
4201 iorder[1] = 1;
4202 iorder[2] = 2;
4203 iorder[3] = 3;
4204 }
4205 else if (bo == 1) {
4206 /* force BE */
4207 iorder[0] = 3;
4208 iorder[1] = 2;
4209 iorder[2] = 1;
4210 iorder[3] = 0;
4211 }
4212
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004213 /* On narrow builds we split characters outside the BMP into two
4214 codepoints => count how much extra space we need. */
4215#ifndef Py_UNICODE_WIDE
4216 for (qq = q; qq < e; qq += 4)
4217 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4218 pairs++;
4219#endif
4220
4221 /* This might be one to much, because of a BOM */
4222 unicode = _PyUnicode_New((size+3)/4+pairs);
4223 if (!unicode)
4224 return NULL;
4225 if (size == 0)
4226 return (PyObject *)unicode;
4227
4228 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004229 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004230
Walter Dörwald41980ca2007-08-16 21:55:45 +00004231 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 Py_UCS4 ch;
4233 /* remaining bytes at the end? (size should be divisible by 4) */
4234 if (e-q<4) {
4235 if (consumed)
4236 break;
4237 errmsg = "truncated data";
4238 startinpos = ((const char *)q)-starts;
4239 endinpos = ((const char *)e)-starts;
4240 goto utf32Error;
4241 /* The remaining input chars are ignored if the callback
4242 chooses to skip the input */
4243 }
4244 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4245 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 if (ch >= 0x110000)
4248 {
4249 errmsg = "codepoint not in range(0x110000)";
4250 startinpos = ((const char *)q)-starts;
4251 endinpos = startinpos+4;
4252 goto utf32Error;
4253 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004254#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 if (ch >= 0x10000)
4256 {
4257 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4258 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4259 }
4260 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004261#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 *p++ = ch;
4263 q += 4;
4264 continue;
4265 utf32Error:
4266 outpos = p-PyUnicode_AS_UNICODE(unicode);
4267 if (unicode_decode_call_errorhandler(
4268 errors, &errorHandler,
4269 "utf32", errmsg,
4270 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4271 &unicode, &outpos, &p))
4272 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273 }
4274
4275 if (byteorder)
4276 *byteorder = bo;
4277
4278 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004280
4281 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004282 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004283 goto onError;
4284
4285 Py_XDECREF(errorHandler);
4286 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004287 if (PyUnicode_READY(unicode) == -1) {
4288 Py_DECREF(unicode);
4289 return NULL;
4290 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004291 return (PyObject *)unicode;
4292
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004294 Py_DECREF(unicode);
4295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
4297 return NULL;
4298}
4299
4300PyObject *
4301PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 Py_ssize_t size,
4303 const char *errors,
4304 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004305{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004306 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004307 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004308 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004309#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004310 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004311#else
4312 const int pairs = 0;
4313#endif
4314 /* Offsets from p for storing byte pairs in the right order. */
4315#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4316 int iorder[] = {0, 1, 2, 3};
4317#else
4318 int iorder[] = {3, 2, 1, 0};
4319#endif
4320
Benjamin Peterson29060642009-01-31 22:14:21 +00004321#define STORECHAR(CH) \
4322 do { \
4323 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4324 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4325 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4326 p[iorder[0]] = (CH) & 0xff; \
4327 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004328 } while(0)
4329
4330 /* In narrow builds we can output surrogate pairs as one codepoint,
4331 so we need less space. */
4332#ifndef Py_UNICODE_WIDE
4333 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4335 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4336 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004337#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004338 nsize = (size - pairs + (byteorder == 0));
4339 bytesize = nsize * 4;
4340 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004342 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343 if (v == NULL)
4344 return NULL;
4345
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004346 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004347 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004349 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004350 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004351
4352 if (byteorder == -1) {
4353 /* force LE */
4354 iorder[0] = 0;
4355 iorder[1] = 1;
4356 iorder[2] = 2;
4357 iorder[3] = 3;
4358 }
4359 else if (byteorder == 1) {
4360 /* force BE */
4361 iorder[0] = 3;
4362 iorder[1] = 2;
4363 iorder[2] = 1;
4364 iorder[3] = 0;
4365 }
4366
4367 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004369#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4371 Py_UCS4 ch2 = *s;
4372 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4373 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4374 s++;
4375 size--;
4376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004377 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004378#endif
4379 STORECHAR(ch);
4380 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004381
4382 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004383 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004384#undef STORECHAR
4385}
4386
Alexander Belopolsky40018472011-02-26 01:02:56 +00004387PyObject *
4388PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004389{
4390 if (!PyUnicode_Check(unicode)) {
4391 PyErr_BadArgument();
4392 return NULL;
4393 }
4394 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 PyUnicode_GET_SIZE(unicode),
4396 NULL,
4397 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004398}
4399
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400/* --- UTF-16 Codec ------------------------------------------------------- */
4401
Tim Peters772747b2001-08-09 22:21:55 +00004402PyObject *
4403PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 Py_ssize_t size,
4405 const char *errors,
4406 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407{
Walter Dörwald69652032004-09-07 20:24:22 +00004408 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4409}
4410
Antoine Pitrouab868312009-01-10 15:40:25 +00004411/* Two masks for fast checking of whether a C 'long' may contain
4412 UTF16-encoded surrogate characters. This is an efficient heuristic,
4413 assuming that non-surrogate characters with a code point >= 0x8000 are
4414 rare in most input.
4415 FAST_CHAR_MASK is used when the input is in native byte ordering,
4416 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004417*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004418#if (SIZEOF_LONG == 8)
4419# define FAST_CHAR_MASK 0x8000800080008000L
4420# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4421#elif (SIZEOF_LONG == 4)
4422# define FAST_CHAR_MASK 0x80008000L
4423# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4424#else
4425# error C 'long' size should be either 4 or 8!
4426#endif
4427
Walter Dörwald69652032004-09-07 20:24:22 +00004428PyObject *
4429PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 Py_ssize_t size,
4431 const char *errors,
4432 int *byteorder,
4433 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004436 Py_ssize_t startinpos;
4437 Py_ssize_t endinpos;
4438 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 PyUnicodeObject *unicode;
4440 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004441 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004442 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004443 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004444 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004445 /* Offsets from q for retrieving byte pairs in the right order. */
4446#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4447 int ihi = 1, ilo = 0;
4448#else
4449 int ihi = 0, ilo = 1;
4450#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 PyObject *errorHandler = NULL;
4452 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453
4454 /* Note: size will always be longer than the resulting Unicode
4455 character count */
4456 unicode = _PyUnicode_New(size);
4457 if (!unicode)
4458 return NULL;
4459 if (size == 0)
4460 return (PyObject *)unicode;
4461
4462 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004464 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004465 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466
4467 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004468 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004470 /* Check for BOM marks (U+FEFF) in the input and adjust current
4471 byte order setting accordingly. In native mode, the leading BOM
4472 mark is skipped, in all other modes, it is copied to the output
4473 stream as-is (giving a ZWNBSP character). */
4474 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004475 if (size >= 2) {
4476 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004477#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 if (bom == 0xFEFF) {
4479 q += 2;
4480 bo = -1;
4481 }
4482 else if (bom == 0xFFFE) {
4483 q += 2;
4484 bo = 1;
4485 }
Tim Petersced69f82003-09-16 20:30:58 +00004486#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 if (bom == 0xFEFF) {
4488 q += 2;
4489 bo = 1;
4490 }
4491 else if (bom == 0xFFFE) {
4492 q += 2;
4493 bo = -1;
4494 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004495#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498
Tim Peters772747b2001-08-09 22:21:55 +00004499 if (bo == -1) {
4500 /* force LE */
4501 ihi = 1;
4502 ilo = 0;
4503 }
4504 else if (bo == 1) {
4505 /* force BE */
4506 ihi = 0;
4507 ilo = 1;
4508 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004509#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4510 native_ordering = ilo < ihi;
4511#else
4512 native_ordering = ilo > ihi;
4513#endif
Tim Peters772747b2001-08-09 22:21:55 +00004514
Antoine Pitrouab868312009-01-10 15:40:25 +00004515 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004516 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004518 /* First check for possible aligned read of a C 'long'. Unaligned
4519 reads are more expensive, better to defer to another iteration. */
4520 if (!((size_t) q & LONG_PTR_MASK)) {
4521 /* Fast path for runs of non-surrogate chars. */
4522 register const unsigned char *_q = q;
4523 Py_UNICODE *_p = p;
4524 if (native_ordering) {
4525 /* Native ordering is simple: as long as the input cannot
4526 possibly contain a surrogate char, do an unrolled copy
4527 of several 16-bit code points to the target object.
4528 The non-surrogate check is done on several input bytes
4529 at a time (as many as a C 'long' can contain). */
4530 while (_q < aligned_end) {
4531 unsigned long data = * (unsigned long *) _q;
4532 if (data & FAST_CHAR_MASK)
4533 break;
4534 _p[0] = ((unsigned short *) _q)[0];
4535 _p[1] = ((unsigned short *) _q)[1];
4536#if (SIZEOF_LONG == 8)
4537 _p[2] = ((unsigned short *) _q)[2];
4538 _p[3] = ((unsigned short *) _q)[3];
4539#endif
4540 _q += SIZEOF_LONG;
4541 _p += SIZEOF_LONG / 2;
4542 }
4543 }
4544 else {
4545 /* Byteswapped ordering is similar, but we must decompose
4546 the copy bytewise, and take care of zero'ing out the
4547 upper bytes if the target object is in 32-bit units
4548 (that is, in UCS-4 builds). */
4549 while (_q < aligned_end) {
4550 unsigned long data = * (unsigned long *) _q;
4551 if (data & SWAPPED_FAST_CHAR_MASK)
4552 break;
4553 /* Zero upper bytes in UCS-4 builds */
4554#if (Py_UNICODE_SIZE > 2)
4555 _p[0] = 0;
4556 _p[1] = 0;
4557#if (SIZEOF_LONG == 8)
4558 _p[2] = 0;
4559 _p[3] = 0;
4560#endif
4561#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004562 /* Issue #4916; UCS-4 builds on big endian machines must
4563 fill the two last bytes of each 4-byte unit. */
4564#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4565# define OFF 2
4566#else
4567# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004568#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004569 ((unsigned char *) _p)[OFF + 1] = _q[0];
4570 ((unsigned char *) _p)[OFF + 0] = _q[1];
4571 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4572 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4573#if (SIZEOF_LONG == 8)
4574 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4575 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4576 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4577 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4578#endif
4579#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004580 _q += SIZEOF_LONG;
4581 _p += SIZEOF_LONG / 2;
4582 }
4583 }
4584 p = _p;
4585 q = _q;
4586 if (q >= e)
4587 break;
4588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590
Benjamin Peterson14339b62009-01-31 16:36:08 +00004591 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004592
4593 if (ch < 0xD800 || ch > 0xDFFF) {
4594 *p++ = ch;
4595 continue;
4596 }
4597
4598 /* UTF-16 code pair: */
4599 if (q > e) {
4600 errmsg = "unexpected end of data";
4601 startinpos = (((const char *)q) - 2) - starts;
4602 endinpos = ((const char *)e) + 1 - starts;
4603 goto utf16Error;
4604 }
4605 if (0xD800 <= ch && ch <= 0xDBFF) {
4606 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4607 q += 2;
4608 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004609#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *p++ = ch;
4611 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004612#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004614#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 continue;
4616 }
4617 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004618 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 startinpos = (((const char *)q)-4)-starts;
4620 endinpos = startinpos+2;
4621 goto utf16Error;
4622 }
4623
Benjamin Peterson14339b62009-01-31 16:36:08 +00004624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 errmsg = "illegal encoding";
4626 startinpos = (((const char *)q)-2)-starts;
4627 endinpos = startinpos+2;
4628 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004629
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 utf16Error:
4631 outpos = p - PyUnicode_AS_UNICODE(unicode);
4632 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004633 errors,
4634 &errorHandler,
4635 "utf16", errmsg,
4636 &starts,
4637 (const char **)&e,
4638 &startinpos,
4639 &endinpos,
4640 &exc,
4641 (const char **)&q,
4642 &unicode,
4643 &outpos,
4644 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004647 /* remaining byte at the end? (size should be even) */
4648 if (e == q) {
4649 if (!consumed) {
4650 errmsg = "truncated data";
4651 startinpos = ((const char *)q) - starts;
4652 endinpos = ((const char *)e) + 1 - starts;
4653 outpos = p - PyUnicode_AS_UNICODE(unicode);
4654 if (unicode_decode_call_errorhandler(
4655 errors,
4656 &errorHandler,
4657 "utf16", errmsg,
4658 &starts,
4659 (const char **)&e,
4660 &startinpos,
4661 &endinpos,
4662 &exc,
4663 (const char **)&q,
4664 &unicode,
4665 &outpos,
4666 &p))
4667 goto onError;
4668 /* The remaining input chars are ignored if the callback
4669 chooses to skip the input */
4670 }
4671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672
4673 if (byteorder)
4674 *byteorder = bo;
4675
Walter Dörwald69652032004-09-07 20:24:22 +00004676 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004678
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 goto onError;
4682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 Py_XDECREF(errorHandler);
4684 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685 if (PyUnicode_READY(unicode) == -1) {
4686 Py_DECREF(unicode);
4687 return NULL;
4688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 return (PyObject *)unicode;
4690
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 Py_XDECREF(errorHandler);
4694 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 return NULL;
4696}
4697
Antoine Pitrouab868312009-01-10 15:40:25 +00004698#undef FAST_CHAR_MASK
4699#undef SWAPPED_FAST_CHAR_MASK
4700
Tim Peters772747b2001-08-09 22:21:55 +00004701PyObject *
4702PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 Py_ssize_t size,
4704 const char *errors,
4705 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004707 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004708 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004709 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004710#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004711 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004712#else
4713 const int pairs = 0;
4714#endif
Tim Peters772747b2001-08-09 22:21:55 +00004715 /* Offsets from p for storing byte pairs in the right order. */
4716#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4717 int ihi = 1, ilo = 0;
4718#else
4719 int ihi = 0, ilo = 1;
4720#endif
4721
Benjamin Peterson29060642009-01-31 22:14:21 +00004722#define STORECHAR(CH) \
4723 do { \
4724 p[ihi] = ((CH) >> 8) & 0xff; \
4725 p[ilo] = (CH) & 0xff; \
4726 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004727 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004729#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004730 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 if (s[i] >= 0x10000)
4732 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004733#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004734 /* 2 * (size + pairs + (byteorder == 0)) */
4735 if (size > PY_SSIZE_T_MAX ||
4736 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004737 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004738 nsize = size + pairs + (byteorder == 0);
4739 bytesize = nsize * 2;
4740 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004742 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 if (v == NULL)
4744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004746 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004749 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004750 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004751
4752 if (byteorder == -1) {
4753 /* force LE */
4754 ihi = 1;
4755 ilo = 0;
4756 }
4757 else if (byteorder == 1) {
4758 /* force BE */
4759 ihi = 0;
4760 ilo = 1;
4761 }
4762
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004763 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 Py_UNICODE ch = *s++;
4765 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004766#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 if (ch >= 0x10000) {
4768 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4769 ch = 0xD800 | ((ch-0x10000) >> 10);
4770 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004771#endif
Tim Peters772747b2001-08-09 22:21:55 +00004772 STORECHAR(ch);
4773 if (ch2)
4774 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004775 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004776
4777 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004778 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004779#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780}
4781
Alexander Belopolsky40018472011-02-26 01:02:56 +00004782PyObject *
4783PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784{
4785 if (!PyUnicode_Check(unicode)) {
4786 PyErr_BadArgument();
4787 return NULL;
4788 }
4789 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 PyUnicode_GET_SIZE(unicode),
4791 NULL,
4792 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795/* --- Unicode Escape Codec ----------------------------------------------- */
4796
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input s (size bytes) decodes to a pure ASCII string.

   Returns the number of characters the decoded string will contain, or -1
   if the decoded string is not guaranteed to be ASCII.  -1 is returned
   when:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present.

   Backslash + newline contributes zero characters; an unrecognised escape
   such as \z is kept verbatim and contributes two.
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size: computing a pointer
       in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4851
4852/* Similar to PyUnicode_WRITE but either write into wstr field
4853 or treat string as ASCII. */
4854#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
4855 do { \
4856 if ((kind) != PyUnicode_WCHAR_KIND) \
4857 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
4858 else \
4859 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
4860 } while (0)
4861
4862#define WRITE_WSTR(buf, index, value) \
4863 assert(kind == PyUnicode_WCHAR_KIND), \
4864 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
4865
4866
Fredrik Lundh06d12682001-01-24 07:59:11 +00004867static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004868
Alexander Belopolsky40018472011-02-26 01:02:56 +00004869PyObject *
4870PyUnicode_DecodeUnicodeEscape(const char *s,
4871 Py_ssize_t size,
4872 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004875 Py_ssize_t startinpos;
4876 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004881 char* message;
4882 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 PyObject *errorHandler = NULL;
4884 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004885 Py_ssize_t ascii_length;
4886 Py_ssize_t i;
4887 int kind;
4888 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 ascii_length = length_of_escaped_ascii_string(s, size);
4891
4892 /* After length_of_escaped_ascii_string() there are two alternatives,
4893 either the string is pure ASCII with named escapes like \n, etc.
4894 and we determined it's exact size (common case)
4895 or it contains \x, \u, ... escape sequences. then we create a
4896 legacy wchar string and resize it at the end of this function. */
4897 if (ascii_length >= 0) {
4898 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4899 if (!v)
4900 goto onError;
4901 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4902 kind = PyUnicode_1BYTE_KIND;
4903 data = PyUnicode_DATA(v);
4904 }
4905 else {
4906 /* Escaped strings will always be longer than the resulting
4907 Unicode string, so we start with size here and then reduce the
4908 length after conversion to the true value.
4909 (but if the error callback returns a long replacement string
4910 we'll have to allocate more space) */
4911 v = _PyUnicode_New(size);
4912 if (!v)
4913 goto onError;
4914 kind = PyUnicode_WCHAR_KIND;
4915 data = PyUnicode_AS_UNICODE(v);
4916 }
4917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 if (size == 0)
4919 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004920 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004922
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 while (s < end) {
4924 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004925 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004928 if (kind == PyUnicode_WCHAR_KIND) {
4929 assert(i < _PyUnicode_WSTR_LENGTH(v));
4930 }
4931 else {
4932 /* The only case in which i == ascii_length is a backslash
4933 followed by a newline. */
4934 assert(i <= ascii_length);
4935 }
4936
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 /* Non-escape characters are interpreted as Unicode ordinals */
4938 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940 continue;
4941 }
4942
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 /* \ - Escapes */
4945 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004946 c = *s++;
4947 if (s > end)
4948 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949
4950 if (kind == PyUnicode_WCHAR_KIND) {
4951 assert(i < _PyUnicode_WSTR_LENGTH(v));
4952 }
4953 else {
4954 /* The only case in which i == ascii_length is a backslash
4955 followed by a newline. */
4956 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4957 }
4958
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004959 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004963 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4964 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4965 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4966 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4967 /* FF */
4968 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4969 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4970 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4971 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4972 /* VT */
4973 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4974 /* BEL, not classic C */
4975 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978 case '0': case '1': case '2': case '3':
4979 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004980 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004981 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004982 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004983 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004984 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004986 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 break;
4988
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 /* hex escapes */
4990 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004992 digits = 2;
4993 message = "truncated \\xXX escape";
4994 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004998 digits = 4;
4999 message = "truncated \\uXXXX escape";
5000 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005003 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005004 digits = 8;
5005 message = "truncated \\UXXXXXXXX escape";
5006 hexescape:
5007 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005008 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 if (s+digits>end) {
5010 endinpos = size;
5011 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 errors, &errorHandler,
5013 "unicodeescape", "end of string in escape sequence",
5014 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005015 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005017 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005018 goto nextByte;
5019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 for (j = 0; j < digits; ++j) {
5021 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005022 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005023 endinpos = (s+j+1)-starts;
5024 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 errors, &errorHandler,
5027 "unicodeescape", message,
5028 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005029 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005030 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005031 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005033 }
5034 chr = (chr<<4) & ~0xF;
5035 if (c >= '0' && c <= '9')
5036 chr += c - '0';
5037 else if (c >= 'a' && c <= 'f')
5038 chr += 10 + c - 'a';
5039 else
5040 chr += 10 + c - 'A';
5041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005042 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005043 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 /* _decoding_error will have already written into the
5045 target buffer. */
5046 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005047 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005048 /* when we get here, chr is a 32-bit unicode character */
5049 if (chr <= 0xffff)
5050 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005052 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005053 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005054 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005055#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005056 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005057#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005058 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005059 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5060 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005061#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005062 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005063 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005064 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 errors, &errorHandler,
5067 "unicodeescape", "illegal Unicode character",
5068 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005070 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005072 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005073 break;
5074
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005076 case 'N':
5077 message = "malformed \\N character escape";
5078 if (ucnhash_CAPI == NULL) {
5079 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5081 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005082 if (ucnhash_CAPI == NULL)
5083 goto ucnhashError;
5084 }
5085 if (*s == '{') {
5086 const char *start = s+1;
5087 /* look for the closing brace */
5088 while (*s != '}' && s < end)
5089 s++;
5090 if (s > start && s < end && *s == '}') {
5091 /* found a name. look it up in the unicode database */
5092 message = "unknown Unicode character name";
5093 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005094 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5095 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005096 goto store;
5097 }
5098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005100 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005101 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 errors, &errorHandler,
5103 "unicodeescape", message,
5104 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005105 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005106 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005107 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005108 break;
5109
5110 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005111 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005112 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 message = "\\ at end of string";
5114 s--;
5115 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005116 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 errors, &errorHandler,
5119 "unicodeescape", message,
5120 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005122 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005124 }
5125 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5127 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005128 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005129 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 /* Ensure the length prediction worked in case of ASCII strings */
5135 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5136
5137 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5138 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005139 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005140 Py_XDECREF(errorHandler);
5141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005143
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005145 PyErr_SetString(
5146 PyExc_UnicodeError,
5147 "\\N escapes not supported (can't load unicodedata module)"
5148 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005149 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 Py_XDECREF(errorHandler);
5151 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005152 return NULL;
5153
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005156 Py_XDECREF(errorHandler);
5157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 return NULL;
5159}
5160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161#undef WRITE_ASCII_OR_WSTR
5162#undef WRITE_WSTR
5163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164/* Return a Unicode-Escape string version of the Unicode object.
5165
5166 If quotes is true, the string is enclosed in u"" or u'' quotes as
5167 appropriate.
5168
5169*/
5170
Walter Dörwald79e913e2007-05-12 11:08:06 +00005171static const char *hexdigits = "0123456789abcdef";
5172
Alexander Belopolsky40018472011-02-26 01:02:56 +00005173PyObject *
5174PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005177 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005180#ifdef Py_UNICODE_WIDE
5181 const Py_ssize_t expandsize = 10;
5182#else
5183 const Py_ssize_t expandsize = 6;
5184#endif
5185
Thomas Wouters89f507f2006-12-13 04:49:30 +00005186 /* XXX(nnorwitz): rather than over-allocating, it would be
5187 better to choose a different scheme. Perhaps scan the
5188 first N-chars of the string and allocate based on that size.
5189 */
5190 /* Initial allocation is based on the longest-possible unichr
5191 escape.
5192
5193 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5194 unichr, so in this case it's the longest unichr escape. In
5195 narrow (UTF-16) builds this is five chars per source unichr
5196 since there are two unichrs in the surrogate pair, so in narrow
5197 (UTF-16) builds it's not the longest unichr escape.
5198
5199 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5200 so in the narrow (UTF-16) build case it's the longest unichr
5201 escape.
5202 */
5203
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005204 if (size == 0)
5205 return PyBytes_FromStringAndSize(NULL, 0);
5206
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005207 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005209
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005210 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 2
5212 + expandsize*size
5213 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 if (repr == NULL)
5215 return NULL;
5216
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005217 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 while (size-- > 0) {
5220 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005221
Walter Dörwald79e913e2007-05-12 11:08:06 +00005222 /* Escape backslashes */
5223 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 *p++ = '\\';
5225 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005226 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005227 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005228
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005230 /* Map 21-bit characters to '\U00xxxxxx' */
5231 else if (ch >= 0x10000) {
5232 *p++ = '\\';
5233 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005234 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5235 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5236 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5237 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5238 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5239 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5240 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5241 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005243 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005244#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5246 else if (ch >= 0xD800 && ch < 0xDC00) {
5247 Py_UNICODE ch2;
5248 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005249
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 ch2 = *s++;
5251 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005252 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5254 *p++ = '\\';
5255 *p++ = 'U';
5256 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5257 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5258 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5259 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5260 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5261 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5262 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5263 *p++ = hexdigits[ucs & 0x0000000F];
5264 continue;
5265 }
5266 /* Fall through: isolated surrogates are copied as-is */
5267 s--;
5268 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005270#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005271
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005273 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 *p++ = '\\';
5275 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005276 *p++ = hexdigits[(ch >> 12) & 0x000F];
5277 *p++ = hexdigits[(ch >> 8) & 0x000F];
5278 *p++ = hexdigits[(ch >> 4) & 0x000F];
5279 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005281
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005282 /* Map special whitespace to '\t', \n', '\r' */
5283 else if (ch == '\t') {
5284 *p++ = '\\';
5285 *p++ = 't';
5286 }
5287 else if (ch == '\n') {
5288 *p++ = '\\';
5289 *p++ = 'n';
5290 }
5291 else if (ch == '\r') {
5292 *p++ = '\\';
5293 *p++ = 'r';
5294 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005295
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005296 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005297 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005299 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005300 *p++ = hexdigits[(ch >> 4) & 0x000F];
5301 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005302 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005303
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 /* Copy everything else as-is */
5305 else
5306 *p++ = (char) ch;
5307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005309 assert(p - PyBytes_AS_STRING(repr) > 0);
5310 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5311 return NULL;
5312 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313}
5314
Alexander Belopolsky40018472011-02-26 01:02:56 +00005315PyObject *
5316PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005318 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 if (!PyUnicode_Check(unicode)) {
5320 PyErr_BadArgument();
5321 return NULL;
5322 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005323 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5324 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005325 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326}
5327
5328/* --- Raw Unicode Escape Codec ------------------------------------------- */
5329
Alexander Belopolsky40018472011-02-26 01:02:56 +00005330PyObject *
5331PyUnicode_DecodeRawUnicodeEscape(const char *s,
5332 Py_ssize_t size,
5333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t startinpos;
5337 Py_ssize_t endinpos;
5338 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 const char *end;
5342 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005343 PyObject *errorHandler = NULL;
5344 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 /* Escaped strings will always be longer than the resulting
5347 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 length after conversion to the true value. (But decoding error
5349 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 v = _PyUnicode_New(size);
5351 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 end = s + size;
5357 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 unsigned char c;
5359 Py_UCS4 x;
5360 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005361 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 /* Non-escape characters are interpreted as Unicode ordinals */
5364 if (*s != '\\') {
5365 *p++ = (unsigned char)*s++;
5366 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005367 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 startinpos = s-starts;
5369
5370 /* \u-escapes are only interpreted iff the number of leading
5371 backslashes if odd */
5372 bs = s;
5373 for (;s < end;) {
5374 if (*s != '\\')
5375 break;
5376 *p++ = (unsigned char)*s++;
5377 }
5378 if (((s - bs) & 1) == 0 ||
5379 s >= end ||
5380 (*s != 'u' && *s != 'U')) {
5381 continue;
5382 }
5383 p--;
5384 count = *s=='u' ? 4 : 8;
5385 s++;
5386
5387 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5388 outpos = p-PyUnicode_AS_UNICODE(v);
5389 for (x = 0, i = 0; i < count; ++i, ++s) {
5390 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005391 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 endinpos = s-starts;
5393 if (unicode_decode_call_errorhandler(
5394 errors, &errorHandler,
5395 "rawunicodeescape", "truncated \\uXXXX",
5396 &starts, &end, &startinpos, &endinpos, &exc, &s,
5397 &v, &outpos, &p))
5398 goto onError;
5399 goto nextByte;
5400 }
5401 x = (x<<4) & ~0xF;
5402 if (c >= '0' && c <= '9')
5403 x += c - '0';
5404 else if (c >= 'a' && c <= 'f')
5405 x += 10 + c - 'a';
5406 else
5407 x += 10 + c - 'A';
5408 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005409 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 /* UCS-2 character */
5411 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005412 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 /* UCS-4 character. Either store directly, or as
5414 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005415#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005417#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 x -= 0x10000L;
5419 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5420 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005421#endif
5422 } else {
5423 endinpos = s-starts;
5424 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005425 if (unicode_decode_call_errorhandler(
5426 errors, &errorHandler,
5427 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 &starts, &end, &startinpos, &endinpos, &exc, &s,
5429 &v, &outpos, &p))
5430 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 nextByte:
5433 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005435 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 Py_XDECREF(errorHandler);
5438 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 if (PyUnicode_READY(v) == -1) {
5440 Py_DECREF(v);
5441 return NULL;
5442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 Py_XDECREF(errorHandler);
5448 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return NULL;
5450}
5451
Alexander Belopolsky40018472011-02-26 01:02:56 +00005452PyObject *
5453PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5454 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005456 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 char *p;
5458 char *q;
5459
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005460#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005461 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005462#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005463 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005464#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005465
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005466 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005468
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 if (repr == NULL)
5471 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005472 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005473 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005475 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 while (size-- > 0) {
5477 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005478#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 /* Map 32-bit characters to '\Uxxxxxxxx' */
5480 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005481 *p++ = '\\';
5482 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005483 *p++ = hexdigits[(ch >> 28) & 0xf];
5484 *p++ = hexdigits[(ch >> 24) & 0xf];
5485 *p++ = hexdigits[(ch >> 20) & 0xf];
5486 *p++ = hexdigits[(ch >> 16) & 0xf];
5487 *p++ = hexdigits[(ch >> 12) & 0xf];
5488 *p++ = hexdigits[(ch >> 8) & 0xf];
5489 *p++ = hexdigits[(ch >> 4) & 0xf];
5490 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005491 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005492 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005493#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5495 if (ch >= 0xD800 && ch < 0xDC00) {
5496 Py_UNICODE ch2;
5497 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005498
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 ch2 = *s++;
5500 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005501 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5503 *p++ = '\\';
5504 *p++ = 'U';
5505 *p++ = hexdigits[(ucs >> 28) & 0xf];
5506 *p++ = hexdigits[(ucs >> 24) & 0xf];
5507 *p++ = hexdigits[(ucs >> 20) & 0xf];
5508 *p++ = hexdigits[(ucs >> 16) & 0xf];
5509 *p++ = hexdigits[(ucs >> 12) & 0xf];
5510 *p++ = hexdigits[(ucs >> 8) & 0xf];
5511 *p++ = hexdigits[(ucs >> 4) & 0xf];
5512 *p++ = hexdigits[ucs & 0xf];
5513 continue;
5514 }
5515 /* Fall through: isolated surrogates are copied as-is */
5516 s--;
5517 size++;
5518 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005519#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 /* Map 16-bit characters to '\uxxxx' */
5521 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 *p++ = '\\';
5523 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005524 *p++ = hexdigits[(ch >> 12) & 0xf];
5525 *p++ = hexdigits[(ch >> 8) & 0xf];
5526 *p++ = hexdigits[(ch >> 4) & 0xf];
5527 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 /* Copy everything else as-is */
5530 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531 *p++ = (char) ch;
5532 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005533 size = p - q;
5534
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005535 assert(size > 0);
5536 if (_PyBytes_Resize(&repr, size) < 0)
5537 return NULL;
5538 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539}
5540
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyObject *
5542PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005544 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005546 PyErr_BadArgument();
5547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005549 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5550 PyUnicode_GET_SIZE(unicode));
5551
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005552 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553}
5554
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005555/* --- Unicode Internal Codec ------------------------------------------- */
5556
Alexander Belopolsky40018472011-02-26 01:02:56 +00005557PyObject *
5558_PyUnicode_DecodeUnicodeInternal(const char *s,
5559 Py_ssize_t size,
5560 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005561{
5562 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t startinpos;
5564 Py_ssize_t endinpos;
5565 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005566 PyUnicodeObject *v;
5567 Py_UNICODE *p;
5568 const char *end;
5569 const char *reason;
5570 PyObject *errorHandler = NULL;
5571 PyObject *exc = NULL;
5572
Neal Norwitzd43069c2006-01-08 01:12:10 +00005573#ifdef Py_UNICODE_WIDE
5574 Py_UNICODE unimax = PyUnicode_GetMax();
5575#endif
5576
Thomas Wouters89f507f2006-12-13 04:49:30 +00005577 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005578 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5579 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005581 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5582 as string was created with the old API. */
5583 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005585 p = PyUnicode_AS_UNICODE(v);
5586 end = s + size;
5587
5588 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005589 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005590 /* We have to sanity check the raw data, otherwise doom looms for
5591 some malformed UCS-4 data. */
5592 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005593#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005594 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005595#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005596 end-s < Py_UNICODE_SIZE
5597 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005599 startinpos = s - starts;
5600 if (end-s < Py_UNICODE_SIZE) {
5601 endinpos = end-starts;
5602 reason = "truncated input";
5603 }
5604 else {
5605 endinpos = s - starts + Py_UNICODE_SIZE;
5606 reason = "illegal code point (> 0x10FFFF)";
5607 }
5608 outpos = p - PyUnicode_AS_UNICODE(v);
5609 if (unicode_decode_call_errorhandler(
5610 errors, &errorHandler,
5611 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005612 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005613 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005614 goto onError;
5615 }
5616 }
5617 else {
5618 p++;
5619 s += Py_UNICODE_SIZE;
5620 }
5621 }
5622
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005623 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005624 goto onError;
5625 Py_XDECREF(errorHandler);
5626 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 if (PyUnicode_READY(v) == -1) {
5628 Py_DECREF(v);
5629 return NULL;
5630 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005631 return (PyObject *)v;
5632
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005634 Py_XDECREF(v);
5635 Py_XDECREF(errorHandler);
5636 Py_XDECREF(exc);
5637 return NULL;
5638}
5639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640/* --- Latin-1 Codec ------------------------------------------------------ */
5641
Alexander Belopolsky40018472011-02-26 01:02:56 +00005642PyObject *
5643PyUnicode_DecodeLatin1(const char *s,
5644 Py_ssize_t size,
5645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 return PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649}
5650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005652static void
5653make_encode_exception(PyObject **exceptionObject,
5654 const char *encoding,
5655 const Py_UNICODE *unicode, Py_ssize_t size,
5656 Py_ssize_t startpos, Py_ssize_t endpos,
5657 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 *exceptionObject = PyUnicodeEncodeError_Create(
5661 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 }
5663 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5665 goto onError;
5666 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5667 goto onError;
5668 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5669 goto onError;
5670 return;
5671 onError:
5672 Py_DECREF(*exceptionObject);
5673 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
5675}
5676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005678static void
5679raise_encode_exception(PyObject **exceptionObject,
5680 const char *encoding,
5681 const Py_UNICODE *unicode, Py_ssize_t size,
5682 Py_ssize_t startpos, Py_ssize_t endpos,
5683 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684{
5685 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689}
5690
5691/* error handling callback helper:
5692 build arguments, call the callback and check the arguments,
5693 put the result into newpos and return the replacement string, which
5694 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005695static PyObject *
5696unicode_encode_call_errorhandler(const char *errors,
5697 PyObject **errorHandler,
5698 const char *encoding, const char *reason,
5699 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5700 Py_ssize_t startpos, Py_ssize_t endpos,
5701 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005703 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704
5705 PyObject *restuple;
5706 PyObject *resunicode;
5707
5708 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 }
5713
5714 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718
5719 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005721 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005724 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 Py_DECREF(restuple);
5726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005728 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 &resunicode, newpos)) {
5730 Py_DECREF(restuple);
5731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005733 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5734 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5735 Py_DECREF(restuple);
5736 return NULL;
5737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005740 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5742 Py_DECREF(restuple);
5743 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 Py_INCREF(resunicode);
5746 Py_DECREF(restuple);
5747 return resunicode;
5748}
5749
Alexander Belopolsky40018472011-02-26 01:02:56 +00005750static PyObject *
5751unicode_encode_ucs1(const Py_UNICODE *p,
5752 Py_ssize_t size,
5753 const char *errors,
5754 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755{
5756 /* output object */
5757 PyObject *res;
5758 /* pointers to the beginning and end+1 of input */
5759 const Py_UNICODE *startp = p;
5760 const Py_UNICODE *endp = p + size;
5761 /* pointer to the beginning of the unencodable characters */
5762 /* const Py_UNICODE *badp = NULL; */
5763 /* pointer into the output */
5764 char *str;
5765 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005766 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005767 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5768 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 PyObject *errorHandler = NULL;
5770 PyObject *exc = NULL;
5771 /* the following variable is used for caching string comparisons
5772 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5773 int known_errorHandler = -1;
5774
5775 /* allocate enough for a simple encoding without
5776 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005777 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005778 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005779 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005781 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005782 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 ressize = size;
5784
5785 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 /* can we encode this? */
5789 if (c<limit) {
5790 /* no overflow check, because we know that the space is enough */
5791 *str++ = (char)c;
5792 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005793 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 else {
5795 Py_ssize_t unicodepos = p-startp;
5796 Py_ssize_t requiredsize;
5797 PyObject *repunicode;
5798 Py_ssize_t repsize;
5799 Py_ssize_t newpos;
5800 Py_ssize_t respos;
5801 Py_UNICODE *uni2;
5802 /* startpos for collecting unencodable chars */
5803 const Py_UNICODE *collstart = p;
5804 const Py_UNICODE *collend = p;
5805 /* find all unecodable characters */
5806 while ((collend < endp) && ((*collend)>=limit))
5807 ++collend;
5808 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5809 if (known_errorHandler==-1) {
5810 if ((errors==NULL) || (!strcmp(errors, "strict")))
5811 known_errorHandler = 1;
5812 else if (!strcmp(errors, "replace"))
5813 known_errorHandler = 2;
5814 else if (!strcmp(errors, "ignore"))
5815 known_errorHandler = 3;
5816 else if (!strcmp(errors, "xmlcharrefreplace"))
5817 known_errorHandler = 4;
5818 else
5819 known_errorHandler = 0;
5820 }
5821 switch (known_errorHandler) {
5822 case 1: /* strict */
5823 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5824 goto onError;
5825 case 2: /* replace */
5826 while (collstart++<collend)
5827 *str++ = '?'; /* fall through */
5828 case 3: /* ignore */
5829 p = collend;
5830 break;
5831 case 4: /* xmlcharrefreplace */
5832 respos = str - PyBytes_AS_STRING(res);
5833 /* determine replacement size (temporarily (mis)uses p) */
5834 for (p = collstart, repsize = 0; p < collend; ++p) {
5835 if (*p<10)
5836 repsize += 2+1+1;
5837 else if (*p<100)
5838 repsize += 2+2+1;
5839 else if (*p<1000)
5840 repsize += 2+3+1;
5841 else if (*p<10000)
5842 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005843#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 else
5845 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005846#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 else if (*p<100000)
5848 repsize += 2+5+1;
5849 else if (*p<1000000)
5850 repsize += 2+6+1;
5851 else
5852 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005853#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 }
5855 requiredsize = respos+repsize+(endp-collend);
5856 if (requiredsize > ressize) {
5857 if (requiredsize<2*ressize)
5858 requiredsize = 2*ressize;
5859 if (_PyBytes_Resize(&res, requiredsize))
5860 goto onError;
5861 str = PyBytes_AS_STRING(res) + respos;
5862 ressize = requiredsize;
5863 }
5864 /* generate replacement (temporarily (mis)uses p) */
5865 for (p = collstart; p < collend; ++p) {
5866 str += sprintf(str, "&#%d;", (int)*p);
5867 }
5868 p = collend;
5869 break;
5870 default:
5871 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5872 encoding, reason, startp, size, &exc,
5873 collstart-startp, collend-startp, &newpos);
5874 if (repunicode == NULL)
5875 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005876 if (PyBytes_Check(repunicode)) {
5877 /* Directly copy bytes result to output. */
5878 repsize = PyBytes_Size(repunicode);
5879 if (repsize > 1) {
5880 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005881 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005882 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5883 Py_DECREF(repunicode);
5884 goto onError;
5885 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005886 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005887 ressize += repsize-1;
5888 }
5889 memcpy(str, PyBytes_AsString(repunicode), repsize);
5890 str += repsize;
5891 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005892 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005893 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 /* need more space? (at least enough for what we
5896 have+the replacement+the rest of the string, so
5897 we won't have to check space for encodable characters) */
5898 respos = str - PyBytes_AS_STRING(res);
5899 repsize = PyUnicode_GET_SIZE(repunicode);
5900 requiredsize = respos+repsize+(endp-collend);
5901 if (requiredsize > ressize) {
5902 if (requiredsize<2*ressize)
5903 requiredsize = 2*ressize;
5904 if (_PyBytes_Resize(&res, requiredsize)) {
5905 Py_DECREF(repunicode);
5906 goto onError;
5907 }
5908 str = PyBytes_AS_STRING(res) + respos;
5909 ressize = requiredsize;
5910 }
5911 /* check if there is anything unencodable in the replacement
5912 and copy it to the output */
5913 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5914 c = *uni2;
5915 if (c >= limit) {
5916 raise_encode_exception(&exc, encoding, startp, size,
5917 unicodepos, unicodepos+1, reason);
5918 Py_DECREF(repunicode);
5919 goto onError;
5920 }
5921 *str = (char)c;
5922 }
5923 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005924 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005926 }
5927 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005928 /* Resize if we allocated to much */
5929 size = str - PyBytes_AS_STRING(res);
5930 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005931 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005932 if (_PyBytes_Resize(&res, size) < 0)
5933 goto onError;
5934 }
5935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 Py_XDECREF(errorHandler);
5937 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 return res;
5939
5940 onError:
5941 Py_XDECREF(res);
5942 Py_XDECREF(errorHandler);
5943 Py_XDECREF(exc);
5944 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945}
5946
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947PyObject *
5948PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5949 Py_ssize_t size,
5950 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005952 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953}
5954
Alexander Belopolsky40018472011-02-26 01:02:56 +00005955PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
5958 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 PyErr_BadArgument();
5960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005962 if (PyUnicode_READY(unicode) == -1)
5963 return NULL;
5964 /* Fast path: if it is a one-byte string, construct
5965 bytes object directly. */
5966 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5967 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5968 PyUnicode_GET_LENGTH(unicode));
5969 /* Non-Latin-1 characters present. Defer to above function to
5970 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005973 errors);
5974}
5975
5976PyObject*
5977PyUnicode_AsLatin1String(PyObject *unicode)
5978{
5979 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980}
5981
5982/* --- 7-bit ASCII Codec -------------------------------------------------- */
5983
Alexander Belopolsky40018472011-02-26 01:02:56 +00005984PyObject *
5985PyUnicode_DecodeASCII(const char *s,
5986 Py_ssize_t size,
5987 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 PyUnicodeObject *v;
5991 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005992 Py_ssize_t startinpos;
5993 Py_ssize_t endinpos;
5994 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005995 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 PyObject *errorHandler = NULL;
5998 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005999 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006002 if (size == 1 && *(unsigned char*)s < 128)
6003 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6004
6005 /* Fast path. Assume the input actually *is* ASCII, and allocate
6006 a single-block Unicode object with that assumption. If there is
6007 an error, drop the object and start over. */
6008 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6009 if (v == NULL)
6010 goto onError;
6011 d = PyUnicode_1BYTE_DATA(v);
6012 for (i = 0; i < size; i++) {
6013 unsigned char ch = ((unsigned char*)s)[i];
6014 if (ch < 128)
6015 d[i] = ch;
6016 else
6017 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006019 if (i == size)
6020 return (PyObject*)v;
6021 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006022
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 v = _PyUnicode_New(size);
6024 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 e = s + size;
6030 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 register unsigned char c = (unsigned char)*s;
6032 if (c < 128) {
6033 *p++ = c;
6034 ++s;
6035 }
6036 else {
6037 startinpos = s-starts;
6038 endinpos = startinpos + 1;
6039 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6040 if (unicode_decode_call_errorhandler(
6041 errors, &errorHandler,
6042 "ascii", "ordinal not in range(128)",
6043 &starts, &e, &startinpos, &endinpos, &exc, &s,
6044 &v, &outpos, &p))
6045 goto onError;
6046 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006048 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6050 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 Py_XDECREF(errorHandler);
6052 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 if (PyUnicode_READY(v) == -1) {
6054 Py_DECREF(v);
6055 return NULL;
6056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 Py_XDECREF(errorHandler);
6062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 return NULL;
6064}
6065
Alexander Belopolsky40018472011-02-26 01:02:56 +00006066PyObject *
6067PyUnicode_EncodeASCII(const Py_UNICODE *p,
6068 Py_ssize_t size,
6069 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072}
6073
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006075_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076{
6077 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 PyErr_BadArgument();
6079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006081 if (PyUnicode_READY(unicode) == -1)
6082 return NULL;
6083 /* Fast path: if it is an ASCII-only string, construct bytes object
6084 directly. Else defer to above function to raise the exception. */
6085 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6086 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6087 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090 errors);
6091}
6092
6093PyObject *
6094PyUnicode_AsASCIIString(PyObject *unicode)
6095{
6096 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097}
6098
Victor Stinner99b95382011-07-04 14:23:54 +02006099#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006100
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006101/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006102
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006103#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006104#define NEED_RETRY
6105#endif
6106
6107/* XXX This code is limited to "true" double-byte encodings, as
6108 a) it assumes an incomplete character consists of a single byte, and
6109 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006111
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the character before it (located with CharPrev()) does not
   already claim it as its trail byte. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6123
6124/*
6125 * Decode MBCS string into unicode object. If 'final' is set, converts
6126 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6127 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006128static int
6129decode_mbcs(PyUnicodeObject **v,
6130 const char *s, /* MBCS string */
6131 int size, /* sizeof MBCS string */
6132 int final,
6133 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006134{
6135 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006136 Py_ssize_t n;
6137 DWORD usize;
6138 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006139
6140 assert(size >= 0);
6141
Victor Stinner554f3f02010-06-16 23:33:54 +00006142 /* check and handle 'errors' arg */
6143 if (errors==NULL || strcmp(errors, "strict")==0)
6144 flags = MB_ERR_INVALID_CHARS;
6145 else if (strcmp(errors, "ignore")==0)
6146 flags = 0;
6147 else {
6148 PyErr_Format(PyExc_ValueError,
6149 "mbcs encoding does not support errors='%s'",
6150 errors);
6151 return -1;
6152 }
6153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006154 /* Skip trailing lead-byte unless 'final' is set */
6155 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006157
6158 /* First get the size of the result */
6159 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006160 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6161 if (usize==0)
6162 goto mbcs_decode_error;
6163 } else
6164 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006165
6166 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 /* Create unicode object */
6168 *v = _PyUnicode_New(usize);
6169 if (*v == NULL)
6170 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006171 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006172 }
6173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 /* Extend unicode object */
6175 n = PyUnicode_GET_SIZE(*v);
6176 if (_PyUnicode_Resize(v, n + usize) < 0)
6177 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006178 }
6179
6180 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006181 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006183 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6184 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006187 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006188
6189mbcs_decode_error:
6190 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6191 we raise a UnicodeDecodeError - else it is a 'generic'
6192 windows error
6193 */
6194 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6195 /* Ideally, we should get reason from FormatMessage - this
6196 is the Windows 2000 English version of the message
6197 */
6198 PyObject *exc = NULL;
6199 const char *reason = "No mapping for the Unicode character exists "
6200 "in the target multi-byte code page.";
6201 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6202 if (exc != NULL) {
6203 PyCodec_StrictErrors(exc);
6204 Py_DECREF(exc);
6205 }
6206 } else {
6207 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6208 }
6209 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006210}
6211
Alexander Belopolsky40018472011-02-26 01:02:56 +00006212PyObject *
6213PyUnicode_DecodeMBCSStateful(const char *s,
6214 Py_ssize_t size,
6215 const char *errors,
6216 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006217{
6218 PyUnicodeObject *v = NULL;
6219 int done;
6220
6221 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006223
6224#ifdef NEED_RETRY
6225 retry:
6226 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006227 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006228 else
6229#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006230 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231
6232 if (done < 0) {
6233 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006235 }
6236
6237 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006239
6240#ifdef NEED_RETRY
6241 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 s += done;
6243 size -= done;
6244 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006245 }
6246#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006247 if (PyUnicode_READY(v) == -1) {
6248 Py_DECREF(v);
6249 return NULL;
6250 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 return (PyObject *)v;
6252}
6253
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254PyObject *
6255PyUnicode_DecodeMBCS(const char *s,
6256 Py_ssize_t size,
6257 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006258{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006259 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6260}
6261
6262/*
6263 * Convert unicode into string object (MBCS).
6264 * Returns 0 if succeed, -1 otherwise.
6265 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266static int
6267encode_mbcs(PyObject **repr,
6268 const Py_UNICODE *p, /* unicode */
6269 int size, /* size of unicode */
6270 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006271{
Victor Stinner554f3f02010-06-16 23:33:54 +00006272 BOOL usedDefaultChar = FALSE;
6273 BOOL *pusedDefaultChar;
6274 int mbcssize;
6275 Py_ssize_t n;
6276 PyObject *exc = NULL;
6277 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006278
6279 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006280
Victor Stinner554f3f02010-06-16 23:33:54 +00006281 /* check and handle 'errors' arg */
6282 if (errors==NULL || strcmp(errors, "strict")==0) {
6283 flags = WC_NO_BEST_FIT_CHARS;
6284 pusedDefaultChar = &usedDefaultChar;
6285 } else if (strcmp(errors, "replace")==0) {
6286 flags = 0;
6287 pusedDefaultChar = NULL;
6288 } else {
6289 PyErr_Format(PyExc_ValueError,
6290 "mbcs encoding does not support errors='%s'",
6291 errors);
6292 return -1;
6293 }
6294
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006295 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006296 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006297 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6298 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 if (mbcssize == 0) {
6300 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6301 return -1;
6302 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006303 /* If we used a default char, then we failed! */
6304 if (pusedDefaultChar && *pusedDefaultChar)
6305 goto mbcs_encode_error;
6306 } else {
6307 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006308 }
6309
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006310 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* Create string object */
6312 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6313 if (*repr == NULL)
6314 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006315 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006316 }
6317 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 /* Extend string object */
6319 n = PyBytes_Size(*repr);
6320 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6321 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006322 }
6323
6324 /* Do the conversion */
6325 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006327 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6328 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6330 return -1;
6331 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006332 if (pusedDefaultChar && *pusedDefaultChar)
6333 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006334 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006335 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006336
6337mbcs_encode_error:
6338 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6339 Py_XDECREF(exc);
6340 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006341}
6342
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343PyObject *
6344PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6345 Py_ssize_t size,
6346 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006347{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 PyObject *repr = NULL;
6349 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006350
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006351#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006353 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006354 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006355 else
6356#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006357 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006358
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006359 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 Py_XDECREF(repr);
6361 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006362 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363
6364#ifdef NEED_RETRY
6365 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 p += INT_MAX;
6367 size -= INT_MAX;
6368 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006369 }
6370#endif
6371
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006372 return repr;
6373}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006374
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375PyObject *
6376PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006377{
6378 if (!PyUnicode_Check(unicode)) {
6379 PyErr_BadArgument();
6380 return NULL;
6381 }
6382 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 PyUnicode_GET_SIZE(unicode),
6384 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006385}
6386
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006387#undef NEED_RETRY
6388
Victor Stinner99b95382011-07-04 14:23:54 +02006389#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006390
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391/* --- Character Mapping Codec -------------------------------------------- */
6392
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
6394PyUnicode_DecodeCharmap(const char *s,
6395 Py_ssize_t size,
6396 PyObject *mapping,
6397 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006400 Py_ssize_t startinpos;
6401 Py_ssize_t endinpos;
6402 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 PyUnicodeObject *v;
6405 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006406 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 PyObject *errorHandler = NULL;
6408 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006409 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 /* Default to Latin-1 */
6413 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416 v = _PyUnicode_New(size);
6417 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006423 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 mapstring = PyUnicode_AS_UNICODE(mapping);
6425 maplen = PyUnicode_GET_SIZE(mapping);
6426 while (s < e) {
6427 unsigned char ch = *s;
6428 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (ch < maplen)
6431 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 if (x == 0xfffe) {
6434 /* undefined mapping */
6435 outpos = p-PyUnicode_AS_UNICODE(v);
6436 startinpos = s-starts;
6437 endinpos = startinpos+1;
6438 if (unicode_decode_call_errorhandler(
6439 errors, &errorHandler,
6440 "charmap", "character maps to <undefined>",
6441 &starts, &e, &startinpos, &endinpos, &exc, &s,
6442 &v, &outpos, &p)) {
6443 goto onError;
6444 }
6445 continue;
6446 }
6447 *p++ = x;
6448 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006449 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006450 }
6451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 while (s < e) {
6453 unsigned char ch = *s;
6454 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006455
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6457 w = PyLong_FromLong((long)ch);
6458 if (w == NULL)
6459 goto onError;
6460 x = PyObject_GetItem(mapping, w);
6461 Py_DECREF(w);
6462 if (x == NULL) {
6463 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6464 /* No mapping found means: mapping is undefined. */
6465 PyErr_Clear();
6466 x = Py_None;
6467 Py_INCREF(x);
6468 } else
6469 goto onError;
6470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006471
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 /* Apply mapping */
6473 if (PyLong_Check(x)) {
6474 long value = PyLong_AS_LONG(x);
6475 if (value < 0 || value > 65535) {
6476 PyErr_SetString(PyExc_TypeError,
6477 "character mapping must be in range(65536)");
6478 Py_DECREF(x);
6479 goto onError;
6480 }
6481 *p++ = (Py_UNICODE)value;
6482 }
6483 else if (x == Py_None) {
6484 /* undefined mapping */
6485 outpos = p-PyUnicode_AS_UNICODE(v);
6486 startinpos = s-starts;
6487 endinpos = startinpos+1;
6488 if (unicode_decode_call_errorhandler(
6489 errors, &errorHandler,
6490 "charmap", "character maps to <undefined>",
6491 &starts, &e, &startinpos, &endinpos, &exc, &s,
6492 &v, &outpos, &p)) {
6493 Py_DECREF(x);
6494 goto onError;
6495 }
6496 Py_DECREF(x);
6497 continue;
6498 }
6499 else if (PyUnicode_Check(x)) {
6500 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 if (targetsize == 1)
6503 /* 1-1 mapping */
6504 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006505
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 else if (targetsize > 1) {
6507 /* 1-n mapping */
6508 if (targetsize > extrachars) {
6509 /* resize first */
6510 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6511 Py_ssize_t needed = (targetsize - extrachars) + \
6512 (targetsize << 2);
6513 extrachars += needed;
6514 /* XXX overflow detection missing */
6515 if (_PyUnicode_Resize(&v,
6516 PyUnicode_GET_SIZE(v) + needed) < 0) {
6517 Py_DECREF(x);
6518 goto onError;
6519 }
6520 p = PyUnicode_AS_UNICODE(v) + oldpos;
6521 }
6522 Py_UNICODE_COPY(p,
6523 PyUnicode_AS_UNICODE(x),
6524 targetsize);
6525 p += targetsize;
6526 extrachars -= targetsize;
6527 }
6528 /* 1-0 mapping: skip the character */
6529 }
6530 else {
6531 /* wrong return value */
6532 PyErr_SetString(PyExc_TypeError,
6533 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006534 Py_DECREF(x);
6535 goto onError;
6536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 Py_DECREF(x);
6538 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
6541 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6543 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 Py_XDECREF(errorHandler);
6545 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006546 if (PyUnicode_READY(v) == -1) {
6547 Py_DECREF(v);
6548 return NULL;
6549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006551
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006553 Py_XDECREF(errorHandler);
6554 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 Py_XDECREF(v);
6556 return NULL;
6557}
6558
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006559/* Charmap encoding: the lookup table */
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 PyObject_HEAD
6563 unsigned char level1[32];
6564 int count2, count3;
6565 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006566};
6567
6568static PyObject*
6569encoding_map_size(PyObject *obj, PyObject* args)
6570{
6571 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006572 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006574}
6575
6576static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006577 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 PyDoc_STR("Return the size (in bytes) of this object") },
6579 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006580};
6581
6582static void
6583encoding_map_dealloc(PyObject* o)
6584{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006586}
6587
6588static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 "EncodingMap", /*tp_name*/
6591 sizeof(struct encoding_map), /*tp_basicsize*/
6592 0, /*tp_itemsize*/
6593 /* methods */
6594 encoding_map_dealloc, /*tp_dealloc*/
6595 0, /*tp_print*/
6596 0, /*tp_getattr*/
6597 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006598 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 0, /*tp_repr*/
6600 0, /*tp_as_number*/
6601 0, /*tp_as_sequence*/
6602 0, /*tp_as_mapping*/
6603 0, /*tp_hash*/
6604 0, /*tp_call*/
6605 0, /*tp_str*/
6606 0, /*tp_getattro*/
6607 0, /*tp_setattro*/
6608 0, /*tp_as_buffer*/
6609 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6610 0, /*tp_doc*/
6611 0, /*tp_traverse*/
6612 0, /*tp_clear*/
6613 0, /*tp_richcompare*/
6614 0, /*tp_weaklistoffset*/
6615 0, /*tp_iter*/
6616 0, /*tp_iternext*/
6617 encoding_map_methods, /*tp_methods*/
6618 0, /*tp_members*/
6619 0, /*tp_getset*/
6620 0, /*tp_base*/
6621 0, /*tp_dict*/
6622 0, /*tp_descr_get*/
6623 0, /*tp_descr_set*/
6624 0, /*tp_dictoffset*/
6625 0, /*tp_init*/
6626 0, /*tp_alloc*/
6627 0, /*tp_new*/
6628 0, /*tp_free*/
6629 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006630};
6631
6632PyObject*
6633PyUnicode_BuildEncodingMap(PyObject* string)
6634{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006635 PyObject *result;
6636 struct encoding_map *mresult;
6637 int i;
6638 int need_dict = 0;
6639 unsigned char level1[32];
6640 unsigned char level2[512];
6641 unsigned char *mlevel1, *mlevel2, *mlevel3;
6642 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643 int kind;
6644 void *data;
6645 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006647 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006648 PyErr_BadArgument();
6649 return NULL;
6650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006651 kind = PyUnicode_KIND(string);
6652 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006653 memset(level1, 0xFF, sizeof level1);
6654 memset(level2, 0xFF, sizeof level2);
6655
6656 /* If there isn't a one-to-one mapping of NULL to \0,
6657 or if there are non-BMP characters, we need to use
6658 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006660 need_dict = 1;
6661 for (i = 1; i < 256; i++) {
6662 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 ch = PyUnicode_READ(kind, data, i);
6664 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006665 need_dict = 1;
6666 break;
6667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006669 /* unmapped character */
6670 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671 l1 = ch >> 11;
6672 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006673 if (level1[l1] == 0xFF)
6674 level1[l1] = count2++;
6675 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006676 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677 }
6678
6679 if (count2 >= 0xFF || count3 >= 0xFF)
6680 need_dict = 1;
6681
6682 if (need_dict) {
6683 PyObject *result = PyDict_New();
6684 PyObject *key, *value;
6685 if (!result)
6686 return NULL;
6687 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006688 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006689 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006690 if (!key || !value)
6691 goto failed1;
6692 if (PyDict_SetItem(result, key, value) == -1)
6693 goto failed1;
6694 Py_DECREF(key);
6695 Py_DECREF(value);
6696 }
6697 return result;
6698 failed1:
6699 Py_XDECREF(key);
6700 Py_XDECREF(value);
6701 Py_DECREF(result);
6702 return NULL;
6703 }
6704
6705 /* Create a three-level trie */
6706 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6707 16*count2 + 128*count3 - 1);
6708 if (!result)
6709 return PyErr_NoMemory();
6710 PyObject_Init(result, &EncodingMapType);
6711 mresult = (struct encoding_map*)result;
6712 mresult->count2 = count2;
6713 mresult->count3 = count3;
6714 mlevel1 = mresult->level1;
6715 mlevel2 = mresult->level23;
6716 mlevel3 = mresult->level23 + 16*count2;
6717 memcpy(mlevel1, level1, 32);
6718 memset(mlevel2, 0xFF, 16*count2);
6719 memset(mlevel3, 0, 128*count3);
6720 count3 = 0;
6721 for (i = 1; i < 256; i++) {
6722 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006724 /* unmapped character */
6725 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006726 o1 = PyUnicode_READ(kind, data, i)>>11;
6727 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006728 i2 = 16*mlevel1[o1] + o2;
6729 if (mlevel2[i2] == 0xFF)
6730 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006731 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006732 i3 = 128*mlevel2[i2] + o3;
6733 mlevel3[i3] = i;
6734 }
6735 return result;
6736}
6737
6738static int
6739encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6740{
6741 struct encoding_map *map = (struct encoding_map*)mapping;
6742 int l1 = c>>11;
6743 int l2 = (c>>7) & 0xF;
6744 int l3 = c & 0x7F;
6745 int i;
6746
6747#ifdef Py_UNICODE_WIDE
6748 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006750 }
6751#endif
6752 if (c == 0)
6753 return 0;
6754 /* level 1*/
6755 i = map->level1[l1];
6756 if (i == 0xFF) {
6757 return -1;
6758 }
6759 /* level 2*/
6760 i = map->level23[16*i+l2];
6761 if (i == 0xFF) {
6762 return -1;
6763 }
6764 /* level 3 */
6765 i = map->level23[16*map->count2 + 128*i + l3];
6766 if (i == 0) {
6767 return -1;
6768 }
6769 return i;
6770}
6771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006772/* Lookup the character ch in the mapping. If the character
6773 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006774 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006775static PyObject *
6776charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777{
Christian Heimes217cfd12007-12-02 14:31:20 +00006778 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006779 PyObject *x;
6780
6781 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783 x = PyObject_GetItem(mapping, w);
6784 Py_DECREF(w);
6785 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6787 /* No mapping found means: mapping is undefined. */
6788 PyErr_Clear();
6789 x = Py_None;
6790 Py_INCREF(x);
6791 return x;
6792 } else
6793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006795 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006797 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 long value = PyLong_AS_LONG(x);
6799 if (value < 0 || value > 255) {
6800 PyErr_SetString(PyExc_TypeError,
6801 "character mapping must be in range(256)");
6802 Py_DECREF(x);
6803 return NULL;
6804 }
6805 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006807 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 /* wrong return value */
6811 PyErr_Format(PyExc_TypeError,
6812 "character mapping must return integer, bytes or None, not %.400s",
6813 x->ob_type->tp_name);
6814 Py_DECREF(x);
6815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 }
6817}
6818
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006819static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006820charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006821{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006822 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6823 /* exponentially overallocate to minimize reallocations */
6824 if (requiredsize < 2*outsize)
6825 requiredsize = 2*outsize;
6826 if (_PyBytes_Resize(outobj, requiredsize))
6827 return -1;
6828 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006829}
6830
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006835 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006836 space is available. Return a new reference to the object that
6837 was put in the output buffer, or Py_None, if the mapping was undefined
6838 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006839 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840static charmapencode_result
6841charmapencode_output(Py_UNICODE c, PyObject *mapping,
6842 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006844 PyObject *rep;
6845 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006846 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847
Christian Heimes90aa7642007-12-19 02:45:37 +00006848 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006849 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006851 if (res == -1)
6852 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 if (outsize<requiredsize)
6854 if (charmapencode_resize(outobj, outpos, requiredsize))
6855 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006856 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 outstart[(*outpos)++] = (char)res;
6858 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006859 }
6860
6861 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 Py_DECREF(rep);
6866 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006867 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 if (PyLong_Check(rep)) {
6869 Py_ssize_t requiredsize = *outpos+1;
6870 if (outsize<requiredsize)
6871 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6872 Py_DECREF(rep);
6873 return enc_EXCEPTION;
6874 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006875 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 else {
6879 const char *repchars = PyBytes_AS_STRING(rep);
6880 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6881 Py_ssize_t requiredsize = *outpos+repsize;
6882 if (outsize<requiredsize)
6883 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6884 Py_DECREF(rep);
6885 return enc_EXCEPTION;
6886 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006887 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 memcpy(outstart + *outpos, repchars, repsize);
6889 *outpos += repsize;
6890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006892 Py_DECREF(rep);
6893 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894}
6895
6896/* handle an error in PyUnicode_EncodeCharmap
6897 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898static int
6899charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006900 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006902 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006903 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904{
6905 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006906 Py_ssize_t repsize;
6907 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 Py_UNICODE *uni2;
6909 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006910 Py_ssize_t collstartpos = *inpos;
6911 Py_ssize_t collendpos = *inpos+1;
6912 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 char *encoding = "charmap";
6914 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006915 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917 /* find all unencodable characters */
6918 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006919 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006920 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 int res = encoding_map_lookup(p[collendpos], mapping);
6922 if (res != -1)
6923 break;
6924 ++collendpos;
6925 continue;
6926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 rep = charmapencode_lookup(p[collendpos], mapping);
6929 if (rep==NULL)
6930 return -1;
6931 else if (rep!=Py_None) {
6932 Py_DECREF(rep);
6933 break;
6934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006935 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 }
6938 /* cache callback name lookup
6939 * (if not done yet, i.e. it's the first error) */
6940 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 if ((errors==NULL) || (!strcmp(errors, "strict")))
6942 *known_errorHandler = 1;
6943 else if (!strcmp(errors, "replace"))
6944 *known_errorHandler = 2;
6945 else if (!strcmp(errors, "ignore"))
6946 *known_errorHandler = 3;
6947 else if (!strcmp(errors, "xmlcharrefreplace"))
6948 *known_errorHandler = 4;
6949 else
6950 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951 }
6952 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006953 case 1: /* strict */
6954 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6955 return -1;
6956 case 2: /* replace */
6957 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 x = charmapencode_output('?', mapping, res, respos);
6959 if (x==enc_EXCEPTION) {
6960 return -1;
6961 }
6962 else if (x==enc_FAILED) {
6963 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6964 return -1;
6965 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006966 }
6967 /* fall through */
6968 case 3: /* ignore */
6969 *inpos = collendpos;
6970 break;
6971 case 4: /* xmlcharrefreplace */
6972 /* generate replacement (temporarily (mis)uses p) */
6973 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 char buffer[2+29+1+1];
6975 char *cp;
6976 sprintf(buffer, "&#%d;", (int)p[collpos]);
6977 for (cp = buffer; *cp; ++cp) {
6978 x = charmapencode_output(*cp, mapping, res, respos);
6979 if (x==enc_EXCEPTION)
6980 return -1;
6981 else if (x==enc_FAILED) {
6982 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6983 return -1;
6984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 }
6986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006987 *inpos = collendpos;
6988 break;
6989 default:
6990 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 encoding, reason, p, size, exceptionObject,
6992 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006993 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006995 if (PyBytes_Check(repunicode)) {
6996 /* Directly copy bytes result to output. */
6997 Py_ssize_t outsize = PyBytes_Size(*res);
6998 Py_ssize_t requiredsize;
6999 repsize = PyBytes_Size(repunicode);
7000 requiredsize = *respos + repsize;
7001 if (requiredsize > outsize)
7002 /* Make room for all additional bytes. */
7003 if (charmapencode_resize(res, respos, requiredsize)) {
7004 Py_DECREF(repunicode);
7005 return -1;
7006 }
7007 memcpy(PyBytes_AsString(*res) + *respos,
7008 PyBytes_AsString(repunicode), repsize);
7009 *respos += repsize;
7010 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007011 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007012 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 /* generate replacement */
7015 repsize = PyUnicode_GET_SIZE(repunicode);
7016 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 x = charmapencode_output(*uni2, mapping, res, respos);
7018 if (x==enc_EXCEPTION) {
7019 return -1;
7020 }
7021 else if (x==enc_FAILED) {
7022 Py_DECREF(repunicode);
7023 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7024 return -1;
7025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007026 }
7027 *inpos = newpos;
7028 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 }
7030 return 0;
7031}
7032
Alexander Belopolsky40018472011-02-26 01:02:56 +00007033PyObject *
7034PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7035 Py_ssize_t size,
7036 PyObject *mapping,
7037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039 /* output object */
7040 PyObject *res = NULL;
7041 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007042 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007043 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007044 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007045 PyObject *errorHandler = NULL;
7046 PyObject *exc = NULL;
7047 /* the following variable is used for caching string comparisons
7048 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7049 * 3=ignore, 4=xmlcharrefreplace */
7050 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
7052 /* Default to Latin-1 */
7053 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 /* allocate enough for a simple encoding without
7057 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007058 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 if (res == NULL)
7060 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007061 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 /* try to encode it */
7066 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7067 if (x==enc_EXCEPTION) /* error */
7068 goto onError;
7069 if (x==enc_FAILED) { /* unencodable character */
7070 if (charmap_encoding_error(p, size, &inpos, mapping,
7071 &exc,
7072 &known_errorHandler, &errorHandler, errors,
7073 &res, &respos)) {
7074 goto onError;
7075 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007076 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 else
7078 /* done with this character => adjust input position */
7079 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007083 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007084 if (_PyBytes_Resize(&res, respos) < 0)
7085 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007087 Py_XDECREF(exc);
7088 Py_XDECREF(errorHandler);
7089 return res;
7090
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007092 Py_XDECREF(res);
7093 Py_XDECREF(exc);
7094 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 return NULL;
7096}
7097
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
7099PyUnicode_AsCharmapString(PyObject *unicode,
7100 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101{
7102 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 PyErr_BadArgument();
7104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 }
7106 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 PyUnicode_GET_SIZE(unicode),
7108 mapping,
7109 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110}
7111
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007112/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113static void
7114make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007115 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007116 Py_ssize_t startpos, Py_ssize_t endpos,
7117 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007119 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007120 *exceptionObject = _PyUnicodeTranslateError_Create(
7121 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 }
7123 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7125 goto onError;
7126 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7127 goto onError;
7128 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7129 goto onError;
7130 return;
7131 onError:
7132 Py_DECREF(*exceptionObject);
7133 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 }
7135}
7136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138static void
7139raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007140 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007141 Py_ssize_t startpos, Py_ssize_t endpos,
7142 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143{
7144 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007145 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148}
7149
7150/* error handling callback helper:
7151 build arguments, call the callback and check the arguments,
7152 put the result into newpos and return the replacement string, which
7153 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007154static PyObject *
7155unicode_translate_call_errorhandler(const char *errors,
7156 PyObject **errorHandler,
7157 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007158 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159 Py_ssize_t startpos, Py_ssize_t endpos,
7160 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007162 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007163
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007164 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007165 PyObject *restuple;
7166 PyObject *resunicode;
7167
7168 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007170 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007172 }
7173
7174 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007175 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178
7179 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007181 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007183 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007184 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 Py_DECREF(restuple);
7186 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187 }
7188 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 &resunicode, &i_newpos)) {
7190 Py_DECREF(restuple);
7191 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007193 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007194 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007195 else
7196 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007197 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7199 Py_DECREF(restuple);
7200 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 Py_INCREF(resunicode);
7203 Py_DECREF(restuple);
7204 return resunicode;
7205}
7206
7207/* Lookup the character ch in the mapping and put the result in result,
7208 which must be decrefed by the caller.
7209 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007211charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007212{
Christian Heimes217cfd12007-12-02 14:31:20 +00007213 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214 PyObject *x;
7215
7216 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007218 x = PyObject_GetItem(mapping, w);
7219 Py_DECREF(w);
7220 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7222 /* No mapping found means: use 1:1 mapping. */
7223 PyErr_Clear();
7224 *result = NULL;
7225 return 0;
7226 } else
7227 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228 }
7229 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 *result = x;
7231 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007232 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007233 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 long value = PyLong_AS_LONG(x);
7235 long max = PyUnicode_GetMax();
7236 if (value < 0 || value > max) {
7237 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007238 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 Py_DECREF(x);
7240 return -1;
7241 }
7242 *result = x;
7243 return 0;
7244 }
7245 else if (PyUnicode_Check(x)) {
7246 *result = x;
7247 return 0;
7248 }
7249 else {
7250 /* wrong return value */
7251 PyErr_SetString(PyExc_TypeError,
7252 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007253 Py_DECREF(x);
7254 return -1;
7255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256}
7257/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 if not reallocate and adjust various state variables.
7259 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007260static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007265 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 /* exponentially overallocate to minimize reallocations */
7267 if (requiredsize < 2 * oldsize)
7268 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7270 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007272 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 }
7274 return 0;
7275}
7276/* lookup the character, put the result in the output string and adjust
7277 various state variables. Return a new reference to the object that
7278 was put in the output buffer in *result, or Py_None, if the mapping was
7279 undefined (in which case no character was written).
7280 The called must decref result.
7281 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007282static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007283charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7284 PyObject *mapping, Py_UCS4 **output,
7285 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007286 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007288 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7289 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007293 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294 }
7295 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007297 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007299 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300 }
7301 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007302 Py_ssize_t repsize;
7303 if (PyUnicode_READY(*res) == -1)
7304 return -1;
7305 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 if (repsize==1) {
7307 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 }
7310 else if (repsize!=0) {
7311 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312 Py_ssize_t requiredsize = *opos +
7313 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 Py_ssize_t i;
7316 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007318 for(i = 0; i < repsize; i++)
7319 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 }
7322 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324 return 0;
7325}
7326
Alexander Belopolsky40018472011-02-26 01:02:56 +00007327PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328_PyUnicode_TranslateCharmap(PyObject *input,
7329 PyObject *mapping,
7330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007332 /* input object */
7333 char *idata;
7334 Py_ssize_t size, i;
7335 int kind;
7336 /* output buffer */
7337 Py_UCS4 *output = NULL;
7338 Py_ssize_t osize;
7339 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007340 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007341 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 char *reason = "character maps to <undefined>";
7343 PyObject *errorHandler = NULL;
7344 PyObject *exc = NULL;
7345 /* the following variable is used for caching string comparisons
7346 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7347 * 3=ignore, 4=xmlcharrefreplace */
7348 int known_errorHandler = -1;
7349
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 PyErr_BadArgument();
7352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007355 if (PyUnicode_READY(input) == -1)
7356 return NULL;
7357 idata = (char*)PyUnicode_DATA(input);
7358 kind = PyUnicode_KIND(input);
7359 size = PyUnicode_GET_LENGTH(input);
7360 i = 0;
7361
7362 if (size == 0) {
7363 Py_INCREF(input);
7364 return input;
7365 }
7366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367 /* allocate enough for a simple 1:1 translation without
7368 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 osize = size;
7370 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7371 opos = 0;
7372 if (output == NULL) {
7373 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007377 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 /* try to encode it */
7379 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380 if (charmaptranslate_output(input, i, mapping,
7381 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 Py_XDECREF(x);
7383 goto onError;
7384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007385 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007387 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 else { /* untranslatable character */
7389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7390 Py_ssize_t repsize;
7391 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007392 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007394 Py_ssize_t collstart = i;
7395 Py_ssize_t collend = i+1;
7396 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399 while (collend < size) {
7400 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 goto onError;
7402 Py_XDECREF(x);
7403 if (x!=Py_None)
7404 break;
7405 ++collend;
7406 }
7407 /* cache callback name lookup
7408 * (if not done yet, i.e. it's the first error) */
7409 if (known_errorHandler==-1) {
7410 if ((errors==NULL) || (!strcmp(errors, "strict")))
7411 known_errorHandler = 1;
7412 else if (!strcmp(errors, "replace"))
7413 known_errorHandler = 2;
7414 else if (!strcmp(errors, "ignore"))
7415 known_errorHandler = 3;
7416 else if (!strcmp(errors, "xmlcharrefreplace"))
7417 known_errorHandler = 4;
7418 else
7419 known_errorHandler = 0;
7420 }
7421 switch (known_errorHandler) {
7422 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007423 raise_translate_exception(&exc, input, collstart,
7424 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007425 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 case 2: /* replace */
7427 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 for (coll = collstart; coll<collend; coll++)
7429 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 /* fall through */
7431 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 break;
7434 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007435 /* generate replacement (temporarily (mis)uses i) */
7436 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 char buffer[2+29+1+1];
7438 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007439 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7440 if (charmaptranslate_makespace(&output, &osize,
7441 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 goto onError;
7443 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007446 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 break;
7448 default:
7449 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007450 reason, input, &exc,
7451 collstart, collend, &newpos);
7452 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 goto onError;
7454 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007455 repsize = PyUnicode_GET_LENGTH(repunicode);
7456 if (charmaptranslate_makespace(&output, &osize,
7457 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 Py_DECREF(repunicode);
7459 goto onError;
7460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007461 for (uni2 = 0; repsize-->0; ++uni2)
7462 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7463 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 }
7467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7469 if (!res)
7470 goto onError;
7471 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472 Py_XDECREF(exc);
7473 Py_XDECREF(errorHandler);
7474 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007478 Py_XDECREF(exc);
7479 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 return NULL;
7481}
7482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483/* Deprecated. Use PyUnicode_Translate instead. */
7484PyObject *
7485PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7486 Py_ssize_t size,
7487 PyObject *mapping,
7488 const char *errors)
7489{
7490 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7491 if (!unicode)
7492 return NULL;
7493 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7494}
7495
Alexander Belopolsky40018472011-02-26 01:02:56 +00007496PyObject *
7497PyUnicode_Translate(PyObject *str,
7498 PyObject *mapping,
7499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500{
7501 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007502
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 str = PyUnicode_FromObject(str);
7504 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007506 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 Py_DECREF(str);
7508 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007509
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 Py_XDECREF(str);
7512 return NULL;
7513}
Tim Petersced69f82003-09-16 20:30:58 +00007514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007515static Py_UCS4
7516fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7517{
7518 /* No need to call PyUnicode_READY(self) because this function is only
7519 called as a callback from fixup() which does it already. */
7520 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7521 const int kind = PyUnicode_KIND(self);
7522 void *data = PyUnicode_DATA(self);
7523 Py_UCS4 maxchar = 0, ch, fixed;
7524 Py_ssize_t i;
7525
7526 for (i = 0; i < len; ++i) {
7527 ch = PyUnicode_READ(kind, data, i);
7528 fixed = 0;
7529 if (ch > 127) {
7530 if (Py_UNICODE_ISSPACE(ch))
7531 fixed = ' ';
7532 else {
7533 const int decimal = Py_UNICODE_TODECIMAL(ch);
7534 if (decimal >= 0)
7535 fixed = '0' + decimal;
7536 }
7537 if (fixed != 0) {
7538 if (fixed > maxchar)
7539 maxchar = fixed;
7540 PyUnicode_WRITE(kind, data, i, fixed);
7541 }
7542 else if (ch > maxchar)
7543 maxchar = ch;
7544 }
7545 else if (ch > maxchar)
7546 maxchar = ch;
7547 }
7548
7549 return maxchar;
7550}
7551
7552PyObject *
7553_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7554{
7555 if (!PyUnicode_Check(unicode)) {
7556 PyErr_BadInternalCall();
7557 return NULL;
7558 }
7559 if (PyUnicode_READY(unicode) == -1)
7560 return NULL;
7561 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7562 /* If the string is already ASCII, just return the same string */
7563 Py_INCREF(unicode);
7564 return unicode;
7565 }
7566 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7567}
7568
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007569PyObject *
7570PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7571 Py_ssize_t length)
7572{
7573 PyObject *result;
7574 Py_UNICODE *p; /* write pointer into result */
7575 Py_ssize_t i;
7576 /* Copy to a new string */
7577 result = (PyObject *)_PyUnicode_New(length);
7578 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7579 if (result == NULL)
7580 return result;
7581 p = PyUnicode_AS_UNICODE(result);
7582 /* Iterate over code points */
7583 for (i = 0; i < length; i++) {
7584 Py_UNICODE ch =s[i];
7585 if (ch > 127) {
7586 int decimal = Py_UNICODE_TODECIMAL(ch);
7587 if (decimal >= 0)
7588 p[i] = '0' + decimal;
7589 }
7590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007591 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7592 Py_DECREF(result);
7593 return NULL;
7594 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007595 return result;
7596}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007597/* --- Decimal Encoder ---------------------------------------------------- */
7598
Alexander Belopolsky40018472011-02-26 01:02:56 +00007599int
7600PyUnicode_EncodeDecimal(Py_UNICODE *s,
7601 Py_ssize_t length,
7602 char *output,
7603 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007604{
7605 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007606 PyObject *errorHandler = NULL;
7607 PyObject *exc = NULL;
7608 const char *encoding = "decimal";
7609 const char *reason = "invalid decimal Unicode string";
7610 /* the following variable is used for caching string comparisons
7611 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7612 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007613
7614 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 PyErr_BadArgument();
7616 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007617 }
7618
7619 p = s;
7620 end = s + length;
7621 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 register Py_UNICODE ch = *p;
7623 int decimal;
7624 PyObject *repunicode;
7625 Py_ssize_t repsize;
7626 Py_ssize_t newpos;
7627 Py_UNICODE *uni2;
7628 Py_UNICODE *collstart;
7629 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007630
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 ++p;
7634 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007635 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 decimal = Py_UNICODE_TODECIMAL(ch);
7637 if (decimal >= 0) {
7638 *output++ = '0' + decimal;
7639 ++p;
7640 continue;
7641 }
7642 if (0 < ch && ch < 256) {
7643 *output++ = (char)ch;
7644 ++p;
7645 continue;
7646 }
7647 /* All other characters are considered unencodable */
7648 collstart = p;
7649 collend = p+1;
7650 while (collend < end) {
7651 if ((0 < *collend && *collend < 256) ||
7652 !Py_UNICODE_ISSPACE(*collend) ||
7653 Py_UNICODE_TODECIMAL(*collend))
7654 break;
7655 }
7656 /* cache callback name lookup
7657 * (if not done yet, i.e. it's the first error) */
7658 if (known_errorHandler==-1) {
7659 if ((errors==NULL) || (!strcmp(errors, "strict")))
7660 known_errorHandler = 1;
7661 else if (!strcmp(errors, "replace"))
7662 known_errorHandler = 2;
7663 else if (!strcmp(errors, "ignore"))
7664 known_errorHandler = 3;
7665 else if (!strcmp(errors, "xmlcharrefreplace"))
7666 known_errorHandler = 4;
7667 else
7668 known_errorHandler = 0;
7669 }
7670 switch (known_errorHandler) {
7671 case 1: /* strict */
7672 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7673 goto onError;
7674 case 2: /* replace */
7675 for (p = collstart; p < collend; ++p)
7676 *output++ = '?';
7677 /* fall through */
7678 case 3: /* ignore */
7679 p = collend;
7680 break;
7681 case 4: /* xmlcharrefreplace */
7682 /* generate replacement (temporarily (mis)uses p) */
7683 for (p = collstart; p < collend; ++p)
7684 output += sprintf(output, "&#%d;", (int)*p);
7685 p = collend;
7686 break;
7687 default:
7688 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7689 encoding, reason, s, length, &exc,
7690 collstart-s, collend-s, &newpos);
7691 if (repunicode == NULL)
7692 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007693 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007694 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007695 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7696 Py_DECREF(repunicode);
7697 goto onError;
7698 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 /* generate replacement */
7700 repsize = PyUnicode_GET_SIZE(repunicode);
7701 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7702 Py_UNICODE ch = *uni2;
7703 if (Py_UNICODE_ISSPACE(ch))
7704 *output++ = ' ';
7705 else {
7706 decimal = Py_UNICODE_TODECIMAL(ch);
7707 if (decimal >= 0)
7708 *output++ = '0' + decimal;
7709 else if (0 < ch && ch < 256)
7710 *output++ = (char)ch;
7711 else {
7712 Py_DECREF(repunicode);
7713 raise_encode_exception(&exc, encoding,
7714 s, length, collstart-s, collend-s, reason);
7715 goto onError;
7716 }
7717 }
7718 }
7719 p = s + newpos;
7720 Py_DECREF(repunicode);
7721 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007722 }
7723 /* 0-terminate the output string */
7724 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 Py_XDECREF(exc);
7726 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007727 return 0;
7728
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 Py_XDECREF(exc);
7731 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007732 return -1;
7733}
7734
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735/* --- Helpers ------------------------------------------------------------ */
7736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007737#include "stringlib/ucs1lib.h"
7738#include "stringlib/fastsearch.h"
7739#include "stringlib/partition.h"
7740#include "stringlib/split.h"
7741#include "stringlib/count.h"
7742#include "stringlib/find.h"
7743#include "stringlib/localeutil.h"
7744#include "stringlib/undef.h"
7745
7746#include "stringlib/ucs2lib.h"
7747#include "stringlib/fastsearch.h"
7748#include "stringlib/partition.h"
7749#include "stringlib/split.h"
7750#include "stringlib/count.h"
7751#include "stringlib/find.h"
7752#include "stringlib/localeutil.h"
7753#include "stringlib/undef.h"
7754
7755#include "stringlib/ucs4lib.h"
7756#include "stringlib/fastsearch.h"
7757#include "stringlib/partition.h"
7758#include "stringlib/split.h"
7759#include "stringlib/count.h"
7760#include "stringlib/find.h"
7761#include "stringlib/localeutil.h"
7762#include "stringlib/undef.h"
7763
7764static Py_ssize_t
7765any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7766 const Py_UCS1*, Py_ssize_t,
7767 Py_ssize_t, Py_ssize_t),
7768 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7769 const Py_UCS2*, Py_ssize_t,
7770 Py_ssize_t, Py_ssize_t),
7771 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7772 const Py_UCS4*, Py_ssize_t,
7773 Py_ssize_t, Py_ssize_t),
7774 PyObject* s1, PyObject* s2,
7775 Py_ssize_t start,
7776 Py_ssize_t end)
7777{
7778 int kind1, kind2, kind;
7779 void *buf1, *buf2;
7780 Py_ssize_t len1, len2, result;
7781
7782 kind1 = PyUnicode_KIND(s1);
7783 kind2 = PyUnicode_KIND(s2);
7784 kind = kind1 > kind2 ? kind1 : kind2;
7785 buf1 = PyUnicode_DATA(s1);
7786 buf2 = PyUnicode_DATA(s2);
7787 if (kind1 != kind)
7788 buf1 = _PyUnicode_AsKind(s1, kind);
7789 if (!buf1)
7790 return -2;
7791 if (kind2 != kind)
7792 buf2 = _PyUnicode_AsKind(s2, kind);
7793 if (!buf2) {
7794 if (kind1 != kind) PyMem_Free(buf1);
7795 return -2;
7796 }
7797 len1 = PyUnicode_GET_LENGTH(s1);
7798 len2 = PyUnicode_GET_LENGTH(s2);
7799
7800 switch(kind) {
7801 case PyUnicode_1BYTE_KIND:
7802 result = ucs1(buf1, len1, buf2, len2, start, end);
7803 break;
7804 case PyUnicode_2BYTE_KIND:
7805 result = ucs2(buf1, len1, buf2, len2, start, end);
7806 break;
7807 case PyUnicode_4BYTE_KIND:
7808 result = ucs4(buf1, len1, buf2, len2, start, end);
7809 break;
7810 default:
7811 assert(0); result = -2;
7812 }
7813
7814 if (kind1 != kind)
7815 PyMem_Free(buf1);
7816 if (kind2 != kind)
7817 PyMem_Free(buf2);
7818
7819 return result;
7820}
7821
7822Py_ssize_t
7823_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7824 Py_ssize_t n_buffer,
7825 void *digits, Py_ssize_t n_digits,
7826 Py_ssize_t min_width,
7827 const char *grouping,
7828 const char *thousands_sep)
7829{
7830 switch(kind) {
7831 case PyUnicode_1BYTE_KIND:
7832 return _PyUnicode_ucs1_InsertThousandsGrouping(
7833 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7834 min_width, grouping, thousands_sep);
7835 case PyUnicode_2BYTE_KIND:
7836 return _PyUnicode_ucs2_InsertThousandsGrouping(
7837 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7838 min_width, grouping, thousands_sep);
7839 case PyUnicode_4BYTE_KIND:
7840 return _PyUnicode_ucs4_InsertThousandsGrouping(
7841 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7842 min_width, grouping, thousands_sep);
7843 }
7844 assert(0);
7845 return -1;
7846}
7847
7848
Eric Smith8c663262007-08-25 02:26:07 +00007849#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007850#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007851
Thomas Wouters477c8d52006-05-27 19:21:47 +00007852#include "stringlib/count.h"
7853#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007854
/* helper macro to fixup start/end slice values

   `end` is clamped to at most `len`; a negative `end` has `len` added to
   it and is then raised to 0 if it is still negative.  A negative `start`
   is treated the same way, but `start` is never clamped against `len`.

   `start` and `end` are assigned to, and every argument is expanded more
   than once, so pass plain variables for the first two and an expression
   without side effects for `len`.  The expansion is a bare sequence of
   `if` statements (not wrapped in do { } while (0)). */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007869
Alexander Belopolsky40018472011-02-26 01:02:56 +00007870Py_ssize_t
7871PyUnicode_Count(PyObject *str,
7872 PyObject *substr,
7873 Py_ssize_t start,
7874 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007876 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007877 PyUnicodeObject* str_obj;
7878 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 int kind1, kind2, kind;
7880 void *buf1 = NULL, *buf2 = NULL;
7881 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007882
Thomas Wouters477c8d52006-05-27 19:21:47 +00007883 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007886 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 Py_DECREF(str_obj);
7889 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 }
Tim Petersced69f82003-09-16 20:30:58 +00007891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 kind1 = PyUnicode_KIND(str_obj);
7893 kind2 = PyUnicode_KIND(sub_obj);
7894 kind = kind1 > kind2 ? kind1 : kind2;
7895 buf1 = PyUnicode_DATA(str_obj);
7896 if (kind1 != kind)
7897 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7898 if (!buf1)
7899 goto onError;
7900 buf2 = PyUnicode_DATA(sub_obj);
7901 if (kind2 != kind)
7902 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7903 if (!buf2)
7904 goto onError;
7905 len1 = PyUnicode_GET_LENGTH(str_obj);
7906 len2 = PyUnicode_GET_LENGTH(sub_obj);
7907
7908 ADJUST_INDICES(start, end, len1);
7909 switch(kind) {
7910 case PyUnicode_1BYTE_KIND:
7911 result = ucs1lib_count(
7912 ((Py_UCS1*)buf1) + start, end - start,
7913 buf2, len2, PY_SSIZE_T_MAX
7914 );
7915 break;
7916 case PyUnicode_2BYTE_KIND:
7917 result = ucs2lib_count(
7918 ((Py_UCS2*)buf1) + start, end - start,
7919 buf2, len2, PY_SSIZE_T_MAX
7920 );
7921 break;
7922 case PyUnicode_4BYTE_KIND:
7923 result = ucs4lib_count(
7924 ((Py_UCS4*)buf1) + start, end - start,
7925 buf2, len2, PY_SSIZE_T_MAX
7926 );
7927 break;
7928 default:
7929 assert(0); result = 0;
7930 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007931
7932 Py_DECREF(sub_obj);
7933 Py_DECREF(str_obj);
7934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 if (kind1 != kind)
7936 PyMem_Free(buf1);
7937 if (kind2 != kind)
7938 PyMem_Free(buf2);
7939
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007941 onError:
7942 Py_DECREF(sub_obj);
7943 Py_DECREF(str_obj);
7944 if (kind1 != kind && buf1)
7945 PyMem_Free(buf1);
7946 if (kind2 != kind && buf2)
7947 PyMem_Free(buf2);
7948 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949}
7950
Alexander Belopolsky40018472011-02-26 01:02:56 +00007951Py_ssize_t
7952PyUnicode_Find(PyObject *str,
7953 PyObject *sub,
7954 Py_ssize_t start,
7955 Py_ssize_t end,
7956 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007958 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007959
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007961 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007963 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_DECREF(str);
7966 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 }
Tim Petersced69f82003-09-16 20:30:58 +00007968
Thomas Wouters477c8d52006-05-27 19:21:47 +00007969 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007970 result = any_find_slice(
7971 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7972 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007973 );
7974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007975 result = any_find_slice(
7976 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7977 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007978 );
7979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007981 Py_DECREF(sub);
7982
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 return result;
7984}
7985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986Py_ssize_t
7987PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7988 Py_ssize_t start, Py_ssize_t end,
7989 int direction)
7990{
7991 char *result;
7992 int kind;
7993 if (PyUnicode_READY(str) == -1)
7994 return -2;
7995 if (end > PyUnicode_GET_LENGTH(str))
7996 end = PyUnicode_GET_LENGTH(str);
7997 kind = PyUnicode_KIND(str);
7998 result = findchar(PyUnicode_1BYTE_DATA(str)
7999 + PyUnicode_KIND_SIZE(kind, start),
8000 kind,
8001 end-start, ch, direction);
8002 if (!result)
8003 return -1;
8004 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8005}
8006
Alexander Belopolsky40018472011-02-26 01:02:56 +00008007static int
8008tailmatch(PyUnicodeObject *self,
8009 PyUnicodeObject *substring,
8010 Py_ssize_t start,
8011 Py_ssize_t end,
8012 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 int kind_self;
8015 int kind_sub;
8016 void *data_self;
8017 void *data_sub;
8018 Py_ssize_t offset;
8019 Py_ssize_t i;
8020 Py_ssize_t end_sub;
8021
8022 if (PyUnicode_READY(self) == -1 ||
8023 PyUnicode_READY(substring) == -1)
8024 return 0;
8025
8026 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 return 1;
8028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8030 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 kind_self = PyUnicode_KIND(self);
8035 data_self = PyUnicode_DATA(self);
8036 kind_sub = PyUnicode_KIND(substring);
8037 data_sub = PyUnicode_DATA(substring);
8038 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8039
8040 if (direction > 0)
8041 offset = end;
8042 else
8043 offset = start;
8044
8045 if (PyUnicode_READ(kind_self, data_self, offset) ==
8046 PyUnicode_READ(kind_sub, data_sub, 0) &&
8047 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8048 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8049 /* If both are of the same kind, memcmp is sufficient */
8050 if (kind_self == kind_sub) {
8051 return ! memcmp((char *)data_self +
8052 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8053 data_sub,
8054 PyUnicode_GET_LENGTH(substring) *
8055 PyUnicode_CHARACTER_SIZE(substring));
8056 }
8057 /* otherwise we have to compare each character by first accesing it */
8058 else {
8059 /* We do not need to compare 0 and len(substring)-1 because
8060 the if statement above ensured already that they are equal
8061 when we end up here. */
8062 // TODO: honor direction and do a forward or backwards search
8063 for (i = 1; i < end_sub; ++i) {
8064 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8065 PyUnicode_READ(kind_sub, data_sub, i))
8066 return 0;
8067 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 }
8071
8072 return 0;
8073}
8074
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075Py_ssize_t
8076PyUnicode_Tailmatch(PyObject *str,
8077 PyObject *substr,
8078 Py_ssize_t start,
8079 Py_ssize_t end,
8080 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008082 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008083
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 str = PyUnicode_FromObject(str);
8085 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087 substr = PyUnicode_FromObject(substr);
8088 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 Py_DECREF(str);
8090 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
Tim Petersced69f82003-09-16 20:30:58 +00008092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 (PyUnicodeObject *)substr,
8095 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096 Py_DECREF(str);
8097 Py_DECREF(substr);
8098 return result;
8099}
8100
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101/* Apply fixfct filter to the Unicode object self and return a
8102 reference to the modified object */
8103
Alexander Belopolsky40018472011-02-26 01:02:56 +00008104static PyObject *
8105fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 PyObject *u;
8109 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 if (PyUnicode_READY(self) == -1)
8112 return NULL;
8113 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8114 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8115 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8120 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 /* fix functions return the new maximum character in a string,
8123 if the kind of the resulting unicode object does not change,
8124 everything is fine. Otherwise we need to change the string kind
8125 and re-run the fix function. */
8126 maxchar_new = fixfct((PyUnicodeObject*)u);
8127 if (maxchar_new == 0)
8128 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8129 else if (maxchar_new <= 127)
8130 maxchar_new = 127;
8131 else if (maxchar_new <= 255)
8132 maxchar_new = 255;
8133 else if (maxchar_new <= 65535)
8134 maxchar_new = 65535;
8135 else
8136 maxchar_new = 1114111; /* 0x10ffff */
8137
8138 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 /* fixfct should return TRUE if it modified the buffer. If
8140 FALSE, return a reference to the original buffer instead
8141 (to save space, not time) */
8142 Py_INCREF(self);
8143 Py_DECREF(u);
8144 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 else if (maxchar_new == maxchar_old) {
8147 return u;
8148 }
8149 else {
8150 /* In case the maximum character changed, we need to
8151 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008152 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 if (v == NULL) {
8154 Py_DECREF(u);
8155 return NULL;
8156 }
8157 if (maxchar_new > maxchar_old) {
8158 /* If the maxchar increased so that the kind changed, not all
8159 characters are representable anymore and we need to fix the
8160 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008161 if (PyUnicode_CopyCharacters(v, 0,
8162 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008163 PyUnicode_GET_LENGTH(self)) < 0)
8164 {
8165 Py_DECREF(u);
8166 return NULL;
8167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 maxchar_old = fixfct((PyUnicodeObject*)v);
8169 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8170 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008171 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008172 if (PyUnicode_CopyCharacters(v, 0,
8173 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008174 PyUnicode_GET_LENGTH(self)) < 0)
8175 {
8176 Py_DECREF(u);
8177 return NULL;
8178 }
8179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180
8181 Py_DECREF(u);
8182 return v;
8183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184}
8185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008187fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 /* No need to call PyUnicode_READY(self) because this function is only
8190 called as a callback from fixup() which does it already. */
8191 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8192 const int kind = PyUnicode_KIND(self);
8193 void *data = PyUnicode_DATA(self);
8194 int touched = 0;
8195 Py_UCS4 maxchar = 0;
8196 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 for (i = 0; i < len; ++i) {
8199 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8200 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8201 if (up != ch) {
8202 if (up > maxchar)
8203 maxchar = up;
8204 PyUnicode_WRITE(kind, data, i, up);
8205 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 else if (ch > maxchar)
8208 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 }
8210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 if (touched)
8212 return maxchar;
8213 else
8214 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215}
8216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8221 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8222 const int kind = PyUnicode_KIND(self);
8223 void *data = PyUnicode_DATA(self);
8224 int touched = 0;
8225 Py_UCS4 maxchar = 0;
8226 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 for(i = 0; i < len; ++i) {
8229 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8230 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8231 if (lo != ch) {
8232 if (lo > maxchar)
8233 maxchar = lo;
8234 PyUnicode_WRITE(kind, data, i, lo);
8235 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 else if (ch > maxchar)
8238 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 }
8240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 if (touched)
8242 return maxchar;
8243 else
8244 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245}
8246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008248fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8251 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8252 const int kind = PyUnicode_KIND(self);
8253 void *data = PyUnicode_DATA(self);
8254 int touched = 0;
8255 Py_UCS4 maxchar = 0;
8256 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 for(i = 0; i < len; ++i) {
8259 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8260 Py_UCS4 nu = 0;
8261
8262 if (Py_UNICODE_ISUPPER(ch))
8263 nu = Py_UNICODE_TOLOWER(ch);
8264 else if (Py_UNICODE_ISLOWER(ch))
8265 nu = Py_UNICODE_TOUPPER(ch);
8266
8267 if (nu != 0) {
8268 if (nu > maxchar)
8269 maxchar = nu;
8270 PyUnicode_WRITE(kind, data, i, nu);
8271 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 else if (ch > maxchar)
8274 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 }
8276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 if (touched)
8278 return maxchar;
8279 else
8280 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281}
8282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008284fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8287 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8288 const int kind = PyUnicode_KIND(self);
8289 void *data = PyUnicode_DATA(self);
8290 int touched = 0;
8291 Py_UCS4 maxchar = 0;
8292 Py_ssize_t i = 0;
8293 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008294
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008295 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297
8298 ch = PyUnicode_READ(kind, data, i);
8299 if (!Py_UNICODE_ISUPPER(ch)) {
8300 maxchar = Py_UNICODE_TOUPPER(ch);
8301 PyUnicode_WRITE(kind, data, i, maxchar);
8302 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 ++i;
8305 for(; i < len; ++i) {
8306 ch = PyUnicode_READ(kind, data, i);
8307 if (!Py_UNICODE_ISLOWER(ch)) {
8308 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8309 if (lo > maxchar)
8310 maxchar = lo;
8311 PyUnicode_WRITE(kind, data, i, lo);
8312 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 else if (ch > maxchar)
8315 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317
8318 if (touched)
8319 return maxchar;
8320 else
8321 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322}
8323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8328 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8329 const int kind = PyUnicode_KIND(self);
8330 void *data = PyUnicode_DATA(self);
8331 Py_UCS4 maxchar = 0;
8332 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 int previous_is_cased;
8334
8335 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 if (len == 1) {
8337 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8338 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8339 if (ti != ch) {
8340 PyUnicode_WRITE(kind, data, i, ti);
8341 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 }
8343 else
8344 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 for(; i < len; ++i) {
8348 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8349 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008350
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 nu = Py_UNICODE_TOTITLE(ch);
8355
8356 if (nu > maxchar)
8357 maxchar = nu;
8358 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008359
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 if (Py_UNICODE_ISLOWER(ch) ||
8361 Py_UNICODE_ISUPPER(ch) ||
8362 Py_UNICODE_ISTITLE(ch))
8363 previous_is_cased = 1;
8364 else
8365 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368}
8369
Tim Peters8ce9f162004-08-27 01:49:32 +00008370PyObject *
8371PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008376 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008377 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8378 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008379 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 Py_ssize_t sz, i, res_offset;
8381 Py_UCS4 maxchar = 0;
8382 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383
Tim Peters05eba1f2004-08-27 21:32:02 +00008384 fseq = PySequence_Fast(seq, "");
8385 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008386 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008387 }
8388
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008389 /* NOTE: the following code can't call back into Python code,
8390 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008391 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008392
Tim Peters05eba1f2004-08-27 21:32:02 +00008393 seqlen = PySequence_Fast_GET_SIZE(fseq);
8394 /* If empty sequence, return u"". */
8395 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008397 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008398 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008399 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008400 /* If singleton sequence with an exact Unicode, return that. */
8401 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 item = items[0];
8403 if (PyUnicode_CheckExact(item)) {
8404 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 goto Done;
8407 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008408 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008409 else {
8410 /* Set up sep and seplen */
8411 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 /* fall back to a blank space separator */
8413 sep = PyUnicode_FromOrdinal(' ');
8414 if (!sep || PyUnicode_READY(sep) == -1)
8415 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008416 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008417 else {
8418 if (!PyUnicode_Check(separator)) {
8419 PyErr_Format(PyExc_TypeError,
8420 "separator: expected str instance,"
8421 " %.80s found",
8422 Py_TYPE(separator)->tp_name);
8423 goto onError;
8424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 if (PyUnicode_READY(separator) == -1)
8426 goto onError;
8427 sep = separator;
8428 seplen = PyUnicode_GET_LENGTH(separator);
8429 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8430 /* inc refcount to keep this code path symetric with the
8431 above case of a blank separator */
8432 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008433 }
8434 }
8435
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008436 /* There are at least two things to join, or else we have a subclass
8437 * of str in the sequence.
8438 * Do a pre-pass to figure out the total amount of space we'll
8439 * need (sz), and see whether all argument are strings.
8440 */
8441 sz = 0;
8442 for (i = 0; i < seqlen; i++) {
8443 const Py_ssize_t old_sz = sz;
8444 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if (!PyUnicode_Check(item)) {
8446 PyErr_Format(PyExc_TypeError,
8447 "sequence item %zd: expected str instance,"
8448 " %.80s found",
8449 i, Py_TYPE(item)->tp_name);
8450 goto onError;
8451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 if (PyUnicode_READY(item) == -1)
8453 goto onError;
8454 sz += PyUnicode_GET_LENGTH(item);
8455 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8456 if (item_maxchar > maxchar)
8457 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008458 if (i != 0)
8459 sz += seplen;
8460 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8461 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008463 goto onError;
8464 }
8465 }
Tim Petersced69f82003-09-16 20:30:58 +00008466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008468 if (res == NULL)
8469 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008470
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008471 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008473 Py_ssize_t itemlen;
8474 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 /* Copy item, and maybe the separator. */
8477 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008478 if (PyUnicode_CopyCharacters(res, res_offset,
8479 sep, 0, seplen) < 0)
8480 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008483 if (PyUnicode_CopyCharacters(res, res_offset,
8484 item, 0, itemlen) < 0)
8485 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008489
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008491 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 Py_XDECREF(sep);
8493 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008496 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008498 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 return NULL;
8500}
8501
/* Store `length` copies of `value` into the character buffer `data`,
   starting at element index `start`.  `kind` selects the element width
   (1-, 2- or 4-byte; anything other than the 1- and 2-byte kinds is handled
   as 4-byte) and must not be PyUnicode_WCHAR_KIND.  For the 1-byte kind the
   value is truncated to an unsigned char.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  Arguments are still evaluated more than
   once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8524
Alexander Belopolsky40018472011-02-26 01:02:56 +00008525static PyUnicodeObject *
8526pad(PyUnicodeObject *self,
8527 Py_ssize_t left,
8528 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 PyObject *u;
8532 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008533 int kind;
8534 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535
8536 if (left < 0)
8537 left = 0;
8538 if (right < 0)
8539 right = 0;
8540
Tim Peters7a29bd52001-09-12 03:03:31 +00008541 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 Py_INCREF(self);
8543 return self;
8544 }
8545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8547 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008548 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8549 return NULL;
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8552 if (fill > maxchar)
8553 maxchar = fill;
8554 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008555 if (!u)
8556 return NULL;
8557
8558 kind = PyUnicode_KIND(u);
8559 data = PyUnicode_DATA(u);
8560 if (left)
8561 FILL(kind, data, fill, 0, left);
8562 if (right)
8563 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008564 if (PyUnicode_CopyCharacters(u, left,
8565 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008566 _PyUnicode_LENGTH(self)) < 0)
8567 {
8568 Py_DECREF(u);
8569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 }
8571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576PyObject *
8577PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
8581 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 switch(PyUnicode_KIND(string)) {
8586 case PyUnicode_1BYTE_KIND:
8587 list = ucs1lib_splitlines(
8588 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8589 PyUnicode_GET_LENGTH(string), keepends);
8590 break;
8591 case PyUnicode_2BYTE_KIND:
8592 list = ucs2lib_splitlines(
8593 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8594 PyUnicode_GET_LENGTH(string), keepends);
8595 break;
8596 case PyUnicode_4BYTE_KIND:
8597 list = ucs4lib_splitlines(
8598 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8599 PyUnicode_GET_LENGTH(string), keepends);
8600 break;
8601 default:
8602 assert(0);
8603 list = 0;
8604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 Py_DECREF(string);
8606 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607}
8608
Alexander Belopolsky40018472011-02-26 01:02:56 +00008609static PyObject *
8610split(PyUnicodeObject *self,
8611 PyUnicodeObject *substring,
8612 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 int kind1, kind2, kind;
8615 void *buf1, *buf2;
8616 Py_ssize_t len1, len2;
8617 PyObject* out;
8618
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008620 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 if (PyUnicode_READY(self) == -1)
8623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 if (substring == NULL)
8626 switch(PyUnicode_KIND(self)) {
8627 case PyUnicode_1BYTE_KIND:
8628 return ucs1lib_split_whitespace(
8629 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8630 PyUnicode_GET_LENGTH(self), maxcount
8631 );
8632 case PyUnicode_2BYTE_KIND:
8633 return ucs2lib_split_whitespace(
8634 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8635 PyUnicode_GET_LENGTH(self), maxcount
8636 );
8637 case PyUnicode_4BYTE_KIND:
8638 return ucs4lib_split_whitespace(
8639 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8640 PyUnicode_GET_LENGTH(self), maxcount
8641 );
8642 default:
8643 assert(0);
8644 return NULL;
8645 }
8646
8647 if (PyUnicode_READY(substring) == -1)
8648 return NULL;
8649
8650 kind1 = PyUnicode_KIND(self);
8651 kind2 = PyUnicode_KIND(substring);
8652 kind = kind1 > kind2 ? kind1 : kind2;
8653 buf1 = PyUnicode_DATA(self);
8654 buf2 = PyUnicode_DATA(substring);
8655 if (kind1 != kind)
8656 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8657 if (!buf1)
8658 return NULL;
8659 if (kind2 != kind)
8660 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8661 if (!buf2) {
8662 if (kind1 != kind) PyMem_Free(buf1);
8663 return NULL;
8664 }
8665 len1 = PyUnicode_GET_LENGTH(self);
8666 len2 = PyUnicode_GET_LENGTH(substring);
8667
8668 switch(kind) {
8669 case PyUnicode_1BYTE_KIND:
8670 out = ucs1lib_split(
8671 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8672 break;
8673 case PyUnicode_2BYTE_KIND:
8674 out = ucs2lib_split(
8675 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8676 break;
8677 case PyUnicode_4BYTE_KIND:
8678 out = ucs4lib_split(
8679 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8680 break;
8681 default:
8682 out = NULL;
8683 }
8684 if (kind1 != kind)
8685 PyMem_Free(buf1);
8686 if (kind2 != kind)
8687 PyMem_Free(buf2);
8688 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689}
8690
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691static PyObject *
8692rsplit(PyUnicodeObject *self,
8693 PyUnicodeObject *substring,
8694 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008695{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 int kind1, kind2, kind;
8697 void *buf1, *buf2;
8698 Py_ssize_t len1, len2;
8699 PyObject* out;
8700
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008701 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008702 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 if (PyUnicode_READY(self) == -1)
8705 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 if (substring == NULL)
8708 switch(PyUnicode_KIND(self)) {
8709 case PyUnicode_1BYTE_KIND:
8710 return ucs1lib_rsplit_whitespace(
8711 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8712 PyUnicode_GET_LENGTH(self), maxcount
8713 );
8714 case PyUnicode_2BYTE_KIND:
8715 return ucs2lib_rsplit_whitespace(
8716 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8717 PyUnicode_GET_LENGTH(self), maxcount
8718 );
8719 case PyUnicode_4BYTE_KIND:
8720 return ucs4lib_rsplit_whitespace(
8721 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8722 PyUnicode_GET_LENGTH(self), maxcount
8723 );
8724 default:
8725 assert(0);
8726 return NULL;
8727 }
8728
8729 if (PyUnicode_READY(substring) == -1)
8730 return NULL;
8731
8732 kind1 = PyUnicode_KIND(self);
8733 kind2 = PyUnicode_KIND(substring);
8734 kind = kind1 > kind2 ? kind1 : kind2;
8735 buf1 = PyUnicode_DATA(self);
8736 buf2 = PyUnicode_DATA(substring);
8737 if (kind1 != kind)
8738 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8739 if (!buf1)
8740 return NULL;
8741 if (kind2 != kind)
8742 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8743 if (!buf2) {
8744 if (kind1 != kind) PyMem_Free(buf1);
8745 return NULL;
8746 }
8747 len1 = PyUnicode_GET_LENGTH(self);
8748 len2 = PyUnicode_GET_LENGTH(substring);
8749
8750 switch(kind) {
8751 case PyUnicode_1BYTE_KIND:
8752 out = ucs1lib_rsplit(
8753 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8754 break;
8755 case PyUnicode_2BYTE_KIND:
8756 out = ucs2lib_rsplit(
8757 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8758 break;
8759 case PyUnicode_4BYTE_KIND:
8760 out = ucs4lib_rsplit(
8761 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8762 break;
8763 default:
8764 out = NULL;
8765 }
8766 if (kind1 != kind)
8767 PyMem_Free(buf1);
8768 if (kind2 != kind)
8769 PyMem_Free(buf2);
8770 return out;
8771}
8772
8773static Py_ssize_t
8774anylib_find(int kind, void *buf1, Py_ssize_t len1,
8775 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8776{
8777 switch(kind) {
8778 case PyUnicode_1BYTE_KIND:
8779 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8780 case PyUnicode_2BYTE_KIND:
8781 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8782 case PyUnicode_4BYTE_KIND:
8783 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8784 }
8785 assert(0);
8786 return -1;
8787}
8788
8789static Py_ssize_t
8790anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8791 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8792{
8793 switch(kind) {
8794 case PyUnicode_1BYTE_KIND:
8795 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8796 case PyUnicode_2BYTE_KIND:
8797 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8798 case PyUnicode_4BYTE_KIND:
8799 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8800 }
8801 assert(0);
8802 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008803}
8804
Alexander Belopolsky40018472011-02-26 01:02:56 +00008805static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806replace(PyObject *self, PyObject *str1,
8807 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 PyObject *u;
8810 char *sbuf = PyUnicode_DATA(self);
8811 char *buf1 = PyUnicode_DATA(str1);
8812 char *buf2 = PyUnicode_DATA(str2);
8813 int srelease = 0, release1 = 0, release2 = 0;
8814 int skind = PyUnicode_KIND(self);
8815 int kind1 = PyUnicode_KIND(str1);
8816 int kind2 = PyUnicode_KIND(str2);
8817 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8818 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8819 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820
8821 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008824 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 if (skind < kind1)
8827 /* substring too wide to be present */
8828 goto nothing;
8829
8830 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008831 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008832 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008834 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008836 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 Py_UCS4 u1, u2, maxchar;
8838 int mayshrink, rkind;
8839 u1 = PyUnicode_READ_CHAR(str1, 0);
8840 if (!findchar(sbuf, PyUnicode_KIND(self),
8841 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008842 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 u2 = PyUnicode_READ_CHAR(str2, 0);
8844 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8845 /* Replacing u1 with u2 may cause a maxchar reduction in the
8846 result string. */
8847 mayshrink = maxchar > 127;
8848 if (u2 > maxchar) {
8849 maxchar = u2;
8850 mayshrink = 0;
8851 }
8852 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008853 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008855 if (PyUnicode_CopyCharacters(u, 0,
8856 (PyObject*)self, 0, slen) < 0)
8857 {
8858 Py_DECREF(u);
8859 return NULL;
8860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 rkind = PyUnicode_KIND(u);
8862 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8863 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008864 if (--maxcount < 0)
8865 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 if (mayshrink) {
8869 PyObject *tmp = u;
8870 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8871 PyUnicode_GET_LENGTH(tmp));
8872 Py_DECREF(tmp);
8873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 int rkind = skind;
8876 char *res;
8877 if (kind1 < rkind) {
8878 /* widen substring */
8879 buf1 = _PyUnicode_AsKind(str1, rkind);
8880 if (!buf1) goto error;
8881 release1 = 1;
8882 }
8883 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008884 if (i < 0)
8885 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 if (rkind > kind2) {
8887 /* widen replacement */
8888 buf2 = _PyUnicode_AsKind(str2, rkind);
8889 if (!buf2) goto error;
8890 release2 = 1;
8891 }
8892 else if (rkind < kind2) {
8893 /* widen self and buf1 */
8894 rkind = kind2;
8895 if (release1) PyMem_Free(buf1);
8896 sbuf = _PyUnicode_AsKind(self, rkind);
8897 if (!sbuf) goto error;
8898 srelease = 1;
8899 buf1 = _PyUnicode_AsKind(str1, rkind);
8900 if (!buf1) goto error;
8901 release1 = 1;
8902 }
8903 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8904 if (!res) {
8905 PyErr_NoMemory();
8906 goto error;
8907 }
8908 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008909 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8911 buf2,
8912 PyUnicode_KIND_SIZE(rkind, len2));
8913 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008914
8915 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8917 slen-i,
8918 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008919 if (i == -1)
8920 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8922 buf2,
8923 PyUnicode_KIND_SIZE(rkind, len2));
8924 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926
8927 u = PyUnicode_FromKindAndData(rkind, res, slen);
8928 PyMem_Free(res);
8929 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 Py_ssize_t n, i, j, ires;
8934 Py_ssize_t product, new_size;
8935 int rkind = skind;
8936 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 if (kind1 < rkind) {
8939 buf1 = _PyUnicode_AsKind(str1, rkind);
8940 if (!buf1) goto error;
8941 release1 = 1;
8942 }
8943 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008944 if (n == 0)
8945 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 if (kind2 < rkind) {
8947 buf2 = _PyUnicode_AsKind(str2, rkind);
8948 if (!buf2) goto error;
8949 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 else if (kind2 > rkind) {
8952 rkind = kind2;
8953 sbuf = _PyUnicode_AsKind(self, rkind);
8954 if (!sbuf) goto error;
8955 srelease = 1;
8956 if (release1) PyMem_Free(buf1);
8957 buf1 = _PyUnicode_AsKind(str1, rkind);
8958 if (!buf1) goto error;
8959 release1 = 1;
8960 }
8961 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8962 PyUnicode_GET_LENGTH(str1))); */
8963 product = n * (len2-len1);
8964 if ((product / (len2-len1)) != n) {
8965 PyErr_SetString(PyExc_OverflowError,
8966 "replace string is too long");
8967 goto error;
8968 }
8969 new_size = slen + product;
8970 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8971 PyErr_SetString(PyExc_OverflowError,
8972 "replace string is too long");
8973 goto error;
8974 }
8975 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8976 if (!res)
8977 goto error;
8978 ires = i = 0;
8979 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008980 while (n-- > 0) {
8981 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 j = anylib_find(rkind,
8983 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8984 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008985 if (j == -1)
8986 break;
8987 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008988 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8990 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8991 PyUnicode_KIND_SIZE(rkind, j-i));
8992 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008993 }
8994 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (len2 > 0) {
8996 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8997 buf2,
8998 PyUnicode_KIND_SIZE(rkind, len2));
8999 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009004 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9006 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9007 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008 } else {
9009 /* interleave */
9010 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9012 buf2,
9013 PyUnicode_KIND_SIZE(rkind, len2));
9014 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009015 if (--n <= 0)
9016 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9018 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9019 PyUnicode_KIND_SIZE(rkind, 1));
9020 ires++;
9021 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9024 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9025 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 if (srelease)
9030 PyMem_FREE(sbuf);
9031 if (release1)
9032 PyMem_FREE(buf1);
9033 if (release2)
9034 PyMem_FREE(buf2);
9035 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009036
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 if (srelease)
9040 PyMem_FREE(sbuf);
9041 if (release1)
9042 PyMem_FREE(buf1);
9043 if (release2)
9044 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009045 if (PyUnicode_CheckExact(self)) {
9046 Py_INCREF(self);
9047 return (PyObject *) self;
9048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9050 PyUnicode_DATA(self),
9051 PyUnicode_GET_LENGTH(self));
9052 error:
9053 if (srelease && sbuf)
9054 PyMem_FREE(sbuf);
9055 if (release1 && buf1)
9056 PyMem_FREE(buf1);
9057 if (release2 && buf2)
9058 PyMem_FREE(buf2);
9059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060}
9061
9062/* --- Unicode Object Methods --------------------------------------------- */
9063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009064PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066\n\
9067Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009068characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069
9070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009071unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 return fixup(self, fixtitle);
9074}
9075
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009076PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078\n\
9079Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009080have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081
9082static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009083unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 return fixup(self, fixcapitalize);
9086}
9087
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into words, replaces every word in the list by its
   capitalized form and joins the list again.  Returns the joined string,
   or NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                     fixcapitalize);
        if (word == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, word);
        pos++;
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);

  onError:
    /* word is NULL here when a capitalization (or the join) failed */
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
9125
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009126/* Argument converter. Coerces to a single unicode character */
9127
9128static int
9129convert_uc(PyObject *obj, void *addr)
9130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009133
Benjamin Peterson14339b62009-01-31 16:36:08 +00009134 uniobj = PyUnicode_FromObject(obj);
9135 if (uniobj == NULL) {
9136 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009138 return 0;
9139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009141 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009143 Py_DECREF(uniobj);
9144 return 0;
9145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 if (PyUnicode_READY(uniobj)) {
9147 Py_DECREF(uniobj);
9148 return 0;
9149 }
9150 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009151 Py_DECREF(uniobj);
9152 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009153}
9154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009155PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009158Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009159done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160
9161static PyObject *
9162unicode_center(PyUnicodeObject *self, PyObject *args)
9163{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009164 Py_ssize_t marg, left;
9165 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 Py_UCS4 fillchar = ' ';
9167
9168 if (PyUnicode_READY(self) == -1)
9169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170
Thomas Woutersde017742006-02-16 19:34:37 +00009171 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 return NULL;
9173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 Py_INCREF(self);
9176 return (PyObject*) self;
9177 }
9178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 left = marg / 2 + (marg & width & 1);
9181
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009182 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183}
9184
Marc-André Lemburge5034372000-08-08 08:04:29 +00009185#if 0
9186
9187/* This code should go into some future Unicode collation support
9188 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009189 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009190
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009191/* speedy UTF-16 code point order comparison */
9192/* gleaned from: */
9193/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9194
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009195static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009196{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009197 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009198 0, 0, 0, 0, 0, 0, 0, 0,
9199 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009200 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009201};
9202
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203static int
9204unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009206 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 Py_UNICODE *s1 = str1->str;
9209 Py_UNICODE *s2 = str2->str;
9210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 len1 = str1->_base._base.length;
9212 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009215 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009216
9217 c1 = *s1++;
9218 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009219
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 if (c1 > (1<<11) * 26)
9221 c1 += utf16Fixup[c1>>11];
9222 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009223 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009224 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009225
9226 if (c1 != c2)
9227 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009228
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009229 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 }
9231
9232 return (len1 < len2) ? -1 : (len1 != len2);
9233}
9234
Marc-André Lemburge5034372000-08-08 08:04:29 +00009235#else
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237/* This function assumes that str1 and str2 are readied by the caller. */
9238
Marc-André Lemburge5034372000-08-08 08:04:29 +00009239static int
9240unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 int kind1, kind2;
9243 void *data1, *data2;
9244 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 kind1 = PyUnicode_KIND(str1);
9247 kind2 = PyUnicode_KIND(str2);
9248 data1 = PyUnicode_DATA(str1);
9249 data2 = PyUnicode_DATA(str2);
9250 len1 = PyUnicode_GET_LENGTH(str1);
9251 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 for (i = 0; i < len1 && i < len2; ++i) {
9254 Py_UCS4 c1, c2;
9255 c1 = PyUnicode_READ(kind1, data1, i);
9256 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009257
9258 if (c1 != c2)
9259 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009260 }
9261
9262 return (len1 < len2) ? -1 : (len1 != len2);
9263}
9264
9265#endif
9266
Alexander Belopolsky40018472011-02-26 01:02:56 +00009267int
9268PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9271 if (PyUnicode_READY(left) == -1 ||
9272 PyUnicode_READY(right) == -1)
9273 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009274 return unicode_compare((PyUnicodeObject *)left,
9275 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009277 PyErr_Format(PyExc_TypeError,
9278 "Can't compare %.100s and %.100s",
9279 left->ob_type->tp_name,
9280 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281 return -1;
9282}
9283
Martin v. Löwis5b222132007-06-10 09:51:05 +00009284int
9285PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 Py_ssize_t i;
9288 int kind;
9289 void *data;
9290 Py_UCS4 chr;
9291
Martin v. Löwis5b222132007-06-10 09:51:05 +00009292 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (PyUnicode_READY(uni) == -1)
9294 return -1;
9295 kind = PyUnicode_KIND(uni);
9296 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009297 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9299 if (chr != str[i])
9300 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009301 /* This check keeps Python strings that end in '\0' from comparing equal
9302 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009305 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009307 return 0;
9308}
9309
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009310
Benjamin Peterson29060642009-01-31 22:14:21 +00009311#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009312 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009313
Alexander Belopolsky40018472011-02-26 01:02:56 +00009314PyObject *
9315PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009316{
9317 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009318
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009319 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9320 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 if (PyUnicode_READY(left) == -1 ||
9322 PyUnicode_READY(right) == -1)
9323 return NULL;
9324 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9325 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009326 if (op == Py_EQ) {
9327 Py_INCREF(Py_False);
9328 return Py_False;
9329 }
9330 if (op == Py_NE) {
9331 Py_INCREF(Py_True);
9332 return Py_True;
9333 }
9334 }
9335 if (left == right)
9336 result = 0;
9337 else
9338 result = unicode_compare((PyUnicodeObject *)left,
9339 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009340
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009341 /* Convert the return value to a Boolean */
9342 switch (op) {
9343 case Py_EQ:
9344 v = TEST_COND(result == 0);
9345 break;
9346 case Py_NE:
9347 v = TEST_COND(result != 0);
9348 break;
9349 case Py_LE:
9350 v = TEST_COND(result <= 0);
9351 break;
9352 case Py_GE:
9353 v = TEST_COND(result >= 0);
9354 break;
9355 case Py_LT:
9356 v = TEST_COND(result == -1);
9357 break;
9358 case Py_GT:
9359 v = TEST_COND(result == 1);
9360 break;
9361 default:
9362 PyErr_BadArgument();
9363 return NULL;
9364 }
9365 Py_INCREF(v);
9366 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368
Brian Curtindfc80e32011-08-10 20:28:54 -05009369 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009370}
9371
Alexander Belopolsky40018472011-02-26 01:02:56 +00009372int
9373PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009374{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009375 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 int kind1, kind2, kind;
9377 void *buf1, *buf2;
9378 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009379 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009380
9381 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009382 sub = PyUnicode_FromObject(element);
9383 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 PyErr_Format(PyExc_TypeError,
9385 "'in <string>' requires string as left operand, not %s",
9386 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009387 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 if (PyUnicode_READY(sub) == -1)
9390 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009391
Thomas Wouters477c8d52006-05-27 19:21:47 +00009392 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009394 Py_DECREF(sub);
9395 return -1;
9396 }
9397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 kind1 = PyUnicode_KIND(str);
9399 kind2 = PyUnicode_KIND(sub);
9400 kind = kind1 > kind2 ? kind1 : kind2;
9401 buf1 = PyUnicode_DATA(str);
9402 buf2 = PyUnicode_DATA(sub);
9403 if (kind1 != kind)
9404 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9405 if (!buf1) {
9406 Py_DECREF(sub);
9407 return -1;
9408 }
9409 if (kind2 != kind)
9410 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9411 if (!buf2) {
9412 Py_DECREF(sub);
9413 if (kind1 != kind) PyMem_Free(buf1);
9414 return -1;
9415 }
9416 len1 = PyUnicode_GET_LENGTH(str);
9417 len2 = PyUnicode_GET_LENGTH(sub);
9418
9419 switch(kind) {
9420 case PyUnicode_1BYTE_KIND:
9421 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9422 break;
9423 case PyUnicode_2BYTE_KIND:
9424 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9425 break;
9426 case PyUnicode_4BYTE_KIND:
9427 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9428 break;
9429 default:
9430 result = -1;
9431 assert(0);
9432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009433
9434 Py_DECREF(str);
9435 Py_DECREF(sub);
9436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 if (kind1 != kind)
9438 PyMem_Free(buf1);
9439 if (kind2 != kind)
9440 PyMem_Free(buf2);
9441
Guido van Rossum403d68b2000-03-13 15:55:09 +00009442 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009443}
9444
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445/* Concat to string or Unicode object giving a new Unicode object. */
9446
Alexander Belopolsky40018472011-02-26 01:02:56 +00009447PyObject *
9448PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 PyObject *u = NULL, *v = NULL, *w;
9451 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452
9453 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460
9461 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 }
9470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9472 goto onError;
9473
9474 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009475 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 w = PyUnicode_New(
9479 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9480 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009483 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9484 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009485 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009486 v, 0,
9487 PyUnicode_GET_LENGTH(v)) < 0)
9488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 Py_DECREF(u);
9490 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 Py_XDECREF(u);
9495 Py_XDECREF(v);
9496 return NULL;
9497}
9498
Walter Dörwald1ab83302007-05-18 17:15:44 +00009499void
9500PyUnicode_Append(PyObject **pleft, PyObject *right)
9501{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 PyObject *new;
9503 if (*pleft == NULL)
9504 return;
9505 if (right == NULL || !PyUnicode_Check(*pleft)) {
9506 Py_DECREF(*pleft);
9507 *pleft = NULL;
9508 return;
9509 }
9510 new = PyUnicode_Concat(*pleft, right);
9511 Py_DECREF(*pleft);
9512 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009513}
9514
9515void
9516PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9517{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009518 PyUnicode_Append(pleft, right);
9519 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009520}
9521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009522PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009525Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009526string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009527interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528
9529static PyObject *
9530unicode_count(PyUnicodeObject *self, PyObject *args)
9531{
9532 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009533 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009534 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 int kind1, kind2, kind;
9537 void *buf1, *buf2;
9538 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Jesus Ceaac451502011-04-20 17:09:23 +02009540 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9541 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 kind1 = PyUnicode_KIND(self);
9545 kind2 = PyUnicode_KIND(substring);
9546 kind = kind1 > kind2 ? kind1 : kind2;
9547 buf1 = PyUnicode_DATA(self);
9548 buf2 = PyUnicode_DATA(substring);
9549 if (kind1 != kind)
9550 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9551 if (!buf1) {
9552 Py_DECREF(substring);
9553 return NULL;
9554 }
9555 if (kind2 != kind)
9556 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9557 if (!buf2) {
9558 Py_DECREF(substring);
9559 if (kind1 != kind) PyMem_Free(buf1);
9560 return NULL;
9561 }
9562 len1 = PyUnicode_GET_LENGTH(self);
9563 len2 = PyUnicode_GET_LENGTH(substring);
9564
9565 ADJUST_INDICES(start, end, len1);
9566 switch(kind) {
9567 case PyUnicode_1BYTE_KIND:
9568 iresult = ucs1lib_count(
9569 ((Py_UCS1*)buf1) + start, end - start,
9570 buf2, len2, PY_SSIZE_T_MAX
9571 );
9572 break;
9573 case PyUnicode_2BYTE_KIND:
9574 iresult = ucs2lib_count(
9575 ((Py_UCS2*)buf1) + start, end - start,
9576 buf2, len2, PY_SSIZE_T_MAX
9577 );
9578 break;
9579 case PyUnicode_4BYTE_KIND:
9580 iresult = ucs4lib_count(
9581 ((Py_UCS4*)buf1) + start, end - start,
9582 buf2, len2, PY_SSIZE_T_MAX
9583 );
9584 break;
9585 default:
9586 assert(0); iresult = 0;
9587 }
9588
9589 result = PyLong_FromSsize_t(iresult);
9590
9591 if (kind1 != kind)
9592 PyMem_Free(buf1);
9593 if (kind2 != kind)
9594 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595
9596 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009597
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598 return result;
9599}
9600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009601PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009602 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009604Encode S using the codec registered for encoding. Default encoding\n\
9605is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009606handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009607a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9608'xmlcharrefreplace' as well as any other name registered with\n\
9609codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610
9611static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009612unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009614 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615 char *encoding = NULL;
9616 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009617
Benjamin Peterson308d6372009-09-18 21:42:35 +00009618 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9619 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009621 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009622}
9623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009624PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626\n\
9627Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009628If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629
9630static PyObject*
9631unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9632{
9633 Py_UNICODE *e;
9634 Py_UNICODE *p;
9635 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009636 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 PyUnicodeObject *u;
9639 int tabsize = 8;
9640
9641 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9645 return NULL;
9646
Thomas Wouters7e474022000-07-16 12:04:32 +00009647 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009648 i = 0; /* chars up to and including most recent \n or \r */
9649 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9651 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 if (tabsize > 0) {
9654 incr = tabsize - (j % tabsize); /* cannot overflow */
9655 if (j > PY_SSIZE_T_MAX - incr)
9656 goto overflow1;
9657 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 if (j > PY_SSIZE_T_MAX - 1)
9662 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 j++;
9664 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 if (i > PY_SSIZE_T_MAX - j)
9666 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009668 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669 }
9670 }
9671
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009672 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009674
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 /* Second pass: create output string and fill it */
9676 u = _PyUnicode_New(i + j);
9677 if (!u)
9678 return NULL;
9679
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009680 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 q = _PyUnicode_WSTR(u); /* next output char */
9682 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 if (tabsize > 0) {
9687 i = tabsize - (j % tabsize);
9688 j += i;
9689 while (i--) {
9690 if (q >= qe)
9691 goto overflow2;
9692 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009693 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 else {
9697 if (q >= qe)
9698 goto overflow2;
9699 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009700 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701 if (*p == '\n' || *p == '\r')
9702 j = 0;
9703 }
9704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (PyUnicode_READY(u) == -1) {
9706 Py_DECREF(u);
9707 return NULL;
9708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009710
9711 overflow2:
9712 Py_DECREF(u);
9713 overflow1:
9714 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
9717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009718PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720\n\
9721Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009722such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723arguments start and end are interpreted as in slice notation.\n\
9724\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009725Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
9727static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729{
Jesus Ceaac451502011-04-20 17:09:23 +02009730 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009731 Py_ssize_t start;
9732 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009733 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Jesus Ceaac451502011-04-20 17:09:23 +02009735 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9736 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (PyUnicode_READY(self) == -1)
9740 return NULL;
9741 if (PyUnicode_READY(substring) == -1)
9742 return NULL;
9743
9744 result = any_find_slice(
9745 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9746 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009747 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748
9749 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (result == -2)
9752 return NULL;
9753
Christian Heimes217cfd12007-12-02 14:31:20 +00009754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
9757static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009758unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 Py_UCS4 ch;
9761
9762 if (PyUnicode_READY(self) == -1)
9763 return NULL;
9764 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765 PyErr_SetString(PyExc_IndexError, "string index out of range");
9766 return NULL;
9767 }
9768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9770 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
9772
Guido van Rossumc2504932007-09-18 19:42:40 +00009773/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009774 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009775static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009776unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777{
Guido van Rossumc2504932007-09-18 19:42:40 +00009778 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009779 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (_PyUnicode_HASH(self) != -1)
9782 return _PyUnicode_HASH(self);
9783 if (PyUnicode_READY(self) == -1)
9784 return -1;
9785 len = PyUnicode_GET_LENGTH(self);
9786
9787 /* The hash function as a macro, gets expanded three times below. */
9788#define HASH(P) \
9789 x = (Py_uhash_t)*P << 7; \
9790 while (--len >= 0) \
9791 x = (1000003*x) ^ (Py_uhash_t)*P++;
9792
9793 switch (PyUnicode_KIND(self)) {
9794 case PyUnicode_1BYTE_KIND: {
9795 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9796 HASH(c);
9797 break;
9798 }
9799 case PyUnicode_2BYTE_KIND: {
9800 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9801 HASH(s);
9802 break;
9803 }
9804 default: {
9805 Py_UCS4 *l;
9806 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9807 "Impossible switch case in unicode_hash");
9808 l = PyUnicode_4BYTE_DATA(self);
9809 HASH(l);
9810 break;
9811 }
9812 }
9813 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9814
Guido van Rossumc2504932007-09-18 19:42:40 +00009815 if (x == -1)
9816 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009818 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009822PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009825Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826
9827static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009830 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009831 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009832 Py_ssize_t start;
9833 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
Jesus Ceaac451502011-04-20 17:09:23 +02009835 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9836 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 if (PyUnicode_READY(self) == -1)
9840 return NULL;
9841 if (PyUnicode_READY(substring) == -1)
9842 return NULL;
9843
9844 result = any_find_slice(
9845 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9846 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009847 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848
9849 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (result == -2)
9852 return NULL;
9853
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 if (result < 0) {
9855 PyErr_SetString(PyExc_ValueError, "substring not found");
9856 return NULL;
9857 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009858
Christian Heimes217cfd12007-12-02 14:31:20 +00009859 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860}
9861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009862PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009863 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009865Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009866at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
9868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009869unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 Py_ssize_t i, length;
9872 int kind;
9873 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874 int cased;
9875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 if (PyUnicode_READY(self) == -1)
9877 return NULL;
9878 length = PyUnicode_GET_LENGTH(self);
9879 kind = PyUnicode_KIND(self);
9880 data = PyUnicode_DATA(self);
9881
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 if (length == 1)
9884 return PyBool_FromLong(
9885 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009887 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009889 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009890
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 for (i = 0; i < length; i++) {
9893 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009894
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9896 return PyBool_FromLong(0);
9897 else if (!cased && Py_UNICODE_ISLOWER(ch))
9898 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009900 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901}
9902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009903PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009906Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009907at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908
9909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009910unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 Py_ssize_t i, length;
9913 int kind;
9914 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915 int cased;
9916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (PyUnicode_READY(self) == -1)
9918 return NULL;
9919 length = PyUnicode_GET_LENGTH(self);
9920 kind = PyUnicode_KIND(self);
9921 data = PyUnicode_DATA(self);
9922
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 if (length == 1)
9925 return PyBool_FromLong(
9926 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009928 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009931
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 for (i = 0; i < length; i++) {
9934 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009935
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9937 return PyBool_FromLong(0);
9938 else if (!cased && Py_UNICODE_ISUPPER(ch))
9939 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009941 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942}
9943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009944PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009947Return True if S is a titlecased string and there is at least one\n\
9948character in S, i.e. upper- and titlecase characters may only\n\
9949follow uncased characters and lowercase characters only cased ones.\n\
9950Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951
9952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009953unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 Py_ssize_t i, length;
9956 int kind;
9957 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958 int cased, previous_is_cased;
9959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 if (PyUnicode_READY(self) == -1)
9961 return NULL;
9962 length = PyUnicode_GET_LENGTH(self);
9963 kind = PyUnicode_KIND(self);
9964 data = PyUnicode_DATA(self);
9965
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (length == 1) {
9968 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9969 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9970 (Py_UNICODE_ISUPPER(ch) != 0));
9971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009973 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009976
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977 cased = 0;
9978 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 for (i = 0; i < length; i++) {
9980 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009981
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9983 if (previous_is_cased)
9984 return PyBool_FromLong(0);
9985 previous_is_cased = 1;
9986 cased = 1;
9987 }
9988 else if (Py_UNICODE_ISLOWER(ch)) {
9989 if (!previous_is_cased)
9990 return PyBool_FromLong(0);
9991 previous_is_cased = 1;
9992 cased = 1;
9993 }
9994 else
9995 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009997 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998}
9999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010000PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010001 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010003Return True if all characters in S are whitespace\n\
10004and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005
10006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010007unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 Py_ssize_t i, length;
10010 int kind;
10011 void *data;
10012
10013 if (PyUnicode_READY(self) == -1)
10014 return NULL;
10015 length = PyUnicode_GET_LENGTH(self);
10016 kind = PyUnicode_KIND(self);
10017 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (length == 1)
10021 return PyBool_FromLong(
10022 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010024 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 for (i = 0; i < length; i++) {
10029 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010030 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034}
10035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010036PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010038\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010039Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010040and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010041
10042static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010043unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 Py_ssize_t i, length;
10046 int kind;
10047 void *data;
10048
10049 if (PyUnicode_READY(self) == -1)
10050 return NULL;
10051 length = PyUnicode_GET_LENGTH(self);
10052 kind = PyUnicode_KIND(self);
10053 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010054
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010055 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (length == 1)
10057 return PyBool_FromLong(
10058 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010059
10060 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010062 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 for (i = 0; i < length; i++) {
10065 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010067 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010068 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010069}
10070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010071PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010072 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010073\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010074Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010075and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010076
10077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010078unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 int kind;
10081 void *data;
10082 Py_ssize_t len, i;
10083
10084 if (PyUnicode_READY(self) == -1)
10085 return NULL;
10086
10087 kind = PyUnicode_KIND(self);
10088 data = PyUnicode_DATA(self);
10089 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010090
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (len == 1) {
10093 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10094 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10095 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010096
10097 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 for (i = 0; i < len; i++) {
10102 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010103 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010106 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010107}
10108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010109PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010110 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010112Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010113False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114
10115static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010116unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 Py_ssize_t i, length;
10119 int kind;
10120 void *data;
10121
10122 if (PyUnicode_READY(self) == -1)
10123 return NULL;
10124 length = PyUnicode_GET_LENGTH(self);
10125 kind = PyUnicode_KIND(self);
10126 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (length == 1)
10130 return PyBool_FromLong(
10131 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010133 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 for (i = 0; i < length; i++) {
10138 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010141 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142}
10143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010144PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010147Return True if all characters in S are digits\n\
10148and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
10150static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010151unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t i, length;
10154 int kind;
10155 void *data;
10156
10157 if (PyUnicode_READY(self) == -1)
10158 return NULL;
10159 length = PyUnicode_GET_LENGTH(self);
10160 kind = PyUnicode_KIND(self);
10161 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (length == 1) {
10165 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10166 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010169 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 for (i = 0; i < length; i++) {
10174 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010175 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010177 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178}
10179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010180PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010183Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010184False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
10186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010187unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 Py_ssize_t i, length;
10190 int kind;
10191 void *data;
10192
10193 if (PyUnicode_READY(self) == -1)
10194 return NULL;
10195 length = PyUnicode_GET_LENGTH(self);
10196 kind = PyUnicode_KIND(self);
10197 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (length == 1)
10201 return PyBool_FromLong(
10202 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010204 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 for (i = 0; i < length; i++) {
10209 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010212 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213}
10214
Martin v. Löwis47383402007-08-15 07:32:56 +000010215int
10216PyUnicode_IsIdentifier(PyObject *self)
10217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 int kind;
10219 void *data;
10220 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010221 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (PyUnicode_READY(self) == -1) {
10224 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 }
10227
10228 /* Special case for empty strings */
10229 if (PyUnicode_GET_LENGTH(self) == 0)
10230 return 0;
10231 kind = PyUnicode_KIND(self);
10232 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010233
10234 /* PEP 3131 says that the first character must be in
10235 XID_Start and subsequent characters in XID_Continue,
10236 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010237 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010238 letters, digits, underscore). However, given the current
10239 definition of XID_Start and XID_Continue, it is sufficient
10240 to check just for these, except that _ must be allowed
10241 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010243 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010244 return 0;
10245
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010246 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010249 return 1;
10250}
10251
10252PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010253 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010254\n\
10255Return True if S is a valid identifier according\n\
10256to the language definition.");
10257
10258static PyObject*
10259unicode_isidentifier(PyObject *self)
10260{
10261 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10262}
10263
Georg Brandl559e5d72008-06-11 18:37:52 +000010264PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010266\n\
10267Return True if all characters in S are considered\n\
10268printable in repr() or S is empty, False otherwise.");
10269
10270static PyObject*
10271unicode_isprintable(PyObject *self)
10272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 Py_ssize_t i, length;
10274 int kind;
10275 void *data;
10276
10277 if (PyUnicode_READY(self) == -1)
10278 return NULL;
10279 length = PyUnicode_GET_LENGTH(self);
10280 kind = PyUnicode_KIND(self);
10281 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010282
10283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (length == 1)
10285 return PyBool_FromLong(
10286 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 for (i = 0; i < length; i++) {
10289 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010290 Py_RETURN_FALSE;
10291 }
10292 }
10293 Py_RETURN_TRUE;
10294}
10295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010296PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010297 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298\n\
10299Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010300iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010303unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010305 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
Martin v. Löwis18e16552006-02-15 17:27:45 +000010308static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309unicode_length(PyUnicodeObject *self)
10310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (PyUnicode_READY(self) == -1)
10312 return -1;
10313 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314}
10315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010316PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010319Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010320done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
10322static PyObject *
10323unicode_ljust(PyUnicodeObject *self, PyObject *args)
10324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010325 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 Py_UCS4 fillchar = ' ';
10327
10328 if (PyUnicode_READY(self) == -1)
10329 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010330
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010331 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 return NULL;
10333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335 Py_INCREF(self);
10336 return (PyObject*) self;
10337 }
10338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340}
10341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010345Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346
10347static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010348unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350 return fixup(self, fixlower);
10351}
10352
/* Strip modes: which end(s) of the string to strip.  The values double
   as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  NOTE(review): these look like
   PyArg_ParseTuple format strings ("|O:" followed by the method name);
   the parsing call itself is outside this excerpt. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Name part of entry i: skips the 3-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10361
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010362/* externally visible for str.strip(unicode) */
10363PyObject *
10364_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 void *data;
10367 int kind;
10368 Py_ssize_t i, j, len;
10369 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10372 return NULL;
10373
10374 kind = PyUnicode_KIND(self);
10375 data = PyUnicode_DATA(self);
10376 len = PyUnicode_GET_LENGTH(self);
10377 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10378 PyUnicode_DATA(sepobj),
10379 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010380
Benjamin Peterson14339b62009-01-31 16:36:08 +000010381 i = 0;
10382 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 while (i < len &&
10384 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 i++;
10386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010387 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010388
Benjamin Peterson14339b62009-01-31 16:36:08 +000010389 j = len;
10390 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010391 do {
10392 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 } while (j >= i &&
10394 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010395 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010396 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010397
Benjamin Peterson14339b62009-01-31 16:36:08 +000010398 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 Py_INCREF(self);
10400 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010401 }
10402 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010404}
10405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406/* Assumes an already ready self string. */
10407
10408static PyObject *
10409substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10410{
10411 const int kind = PyUnicode_KIND(self);
10412 void *data = PyUnicode_DATA(self);
10413 Py_UCS4 maxchar = 0;
10414 Py_ssize_t i;
10415 PyObject *unicode;
10416
10417 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10418 PyErr_BadInternalCall();
10419 return NULL;
10420 }
10421
10422 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10423 Py_INCREF(self);
10424 return (PyObject*)self;
10425 }
10426
10427 for (i = 0; i < len; ++i) {
10428 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10429 if (ch > maxchar)
10430 maxchar = ch;
10431 }
10432
10433 unicode = PyUnicode_New(len, maxchar);
10434 if (unicode == NULL)
10435 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010436 if (PyUnicode_CopyCharacters(unicode, 0,
10437 (PyObject*)self, start, len) < 0)
10438 {
10439 Py_DECREF(unicode);
10440 return NULL;
10441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 return unicode;
10443}
10444
10445PyObject*
10446PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10447{
10448 unsigned char *data;
10449 int kind;
10450
10451 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10452 && PyUnicode_CheckExact(self))
10453 {
10454 Py_INCREF(self);
10455 return (PyObject *)self;
10456 }
10457
10458 if ((end - start) == 1)
10459 return unicode_getitem((PyUnicodeObject*)self, start);
10460
10461 if (PyUnicode_READY(self) == -1)
10462 return NULL;
10463 kind = PyUnicode_KIND(self);
10464 data = PyUnicode_1BYTE_DATA(self);
10465 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10466 end-start);
10467}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468
10469static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010470do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 int kind;
10473 void *data;
10474 Py_ssize_t len, i, j;
10475
10476 if (PyUnicode_READY(self) == -1)
10477 return NULL;
10478
10479 kind = PyUnicode_KIND(self);
10480 data = PyUnicode_DATA(self);
10481 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010482
Benjamin Peterson14339b62009-01-31 16:36:08 +000010483 i = 0;
10484 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010486 i++;
10487 }
10488 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010489
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490 j = len;
10491 if (striptype != LEFTSTRIP) {
10492 do {
10493 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495 j++;
10496 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010497
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10499 Py_INCREF(self);
10500 return (PyObject*)self;
10501 }
10502 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504}
10505
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010506
10507static PyObject *
10508do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10509{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010511
Benjamin Peterson14339b62009-01-31 16:36:08 +000010512 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10513 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010514
Benjamin Peterson14339b62009-01-31 16:36:08 +000010515 if (sep != NULL && sep != Py_None) {
10516 if (PyUnicode_Check(sep))
10517 return _PyUnicode_XStrip(self, striptype, sep);
10518 else {
10519 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 "%s arg must be None or str",
10521 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010522 return NULL;
10523 }
10524 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010525
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010527}
10528
10529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010530PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010531 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532\n\
10533Return a copy of the string S with leading and trailing\n\
10534whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010535If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010536
10537static PyObject *
10538unicode_strip(PyUnicodeObject *self, PyObject *args)
10539{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540 if (PyTuple_GET_SIZE(args) == 0)
10541 return do_strip(self, BOTHSTRIP); /* Common case */
10542 else
10543 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010544}
10545
10546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010547PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010548 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010549\n\
10550Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010551If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010552
10553static PyObject *
10554unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10555{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010556 if (PyTuple_GET_SIZE(args) == 0)
10557 return do_strip(self, LEFTSTRIP); /* Common case */
10558 else
10559 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010560}
10561
10562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010563PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565\n\
10566Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010567If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010568
10569static PyObject *
10570unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10571{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 if (PyTuple_GET_SIZE(args) == 0)
10573 return do_strip(self, RIGHTSTRIP); /* Common case */
10574 else
10575 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010576}
10577
10578
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010580unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581{
10582 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 Py_ssize_t nchars, n;
10584 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
Georg Brandl222de0f2009-04-12 12:01:50 +000010586 if (len < 1) {
10587 Py_INCREF(unicode_empty);
10588 return (PyObject *)unicode_empty;
10589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
Tim Peters7a29bd52001-09-12 03:03:31 +000010591 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 /* no repeat, return original string */
10593 Py_INCREF(str);
10594 return (PyObject*) str;
10595 }
Tim Peters8f422462000-09-09 06:13:41 +000010596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (PyUnicode_READY(str) == -1)
10598 return NULL;
10599
Tim Peters8f422462000-09-09 06:13:41 +000010600 /* ensure # of chars needed doesn't overflow int and # of bytes
10601 * needed doesn't overflow size_t
10602 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 nchars = len * PyUnicode_GET_LENGTH(str);
10604 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010605 PyErr_SetString(PyExc_OverflowError,
10606 "repeated string is too long");
10607 return NULL;
10608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 char_size = PyUnicode_CHARACTER_SIZE(str);
10610 nbytes = (nchars + 1) * char_size;
10611 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010612 PyErr_SetString(PyExc_OverflowError,
10613 "repeated string is too long");
10614 return NULL;
10615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (!u)
10618 return NULL;
10619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (PyUnicode_GET_LENGTH(str) == 1) {
10621 const int kind = PyUnicode_KIND(str);
10622 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10623 void *to = PyUnicode_DATA(u);
10624 for (n = 0; n < len; ++n)
10625 PyUnicode_WRITE(kind, to, n, fill_char);
10626 }
10627 else {
10628 /* number of characters copied this far */
10629 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10630 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10631 char *to = (char *) PyUnicode_DATA(u);
10632 Py_MEMCPY(to, PyUnicode_DATA(str),
10633 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 n = (done <= nchars-done) ? done : nchars-done;
10636 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 }
10640
10641 return (PyObject*) u;
10642}
10643
Alexander Belopolsky40018472011-02-26 01:02:56 +000010644PyObject *
10645PyUnicode_Replace(PyObject *obj,
10646 PyObject *subobj,
10647 PyObject *replobj,
10648 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649{
10650 PyObject *self;
10651 PyObject *str1;
10652 PyObject *str2;
10653 PyObject *result;
10654
10655 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010660 Py_DECREF(self);
10661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 }
10663 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 Py_DECREF(self);
10666 Py_DECREF(str1);
10667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 Py_DECREF(self);
10671 Py_DECREF(str1);
10672 Py_DECREF(str2);
10673 return result;
10674}
10675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010676PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010677 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678\n\
10679Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010680old replaced by new. If the optional argument count is\n\
10681given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
10683static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 PyObject *str1;
10687 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010688 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 PyObject *result;
10690
Martin v. Löwis18e16552006-02-15 17:27:45 +000010691 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 str1 = PyUnicode_FromObject(str1);
10696 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10697 return NULL;
10698 str2 = PyUnicode_FromObject(str2);
10699 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010700 Py_DECREF(str1);
10701 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703
10704 result = replace(self, str1, str2, maxcount);
10705
10706 Py_DECREF(str1);
10707 Py_DECREF(str2);
10708 return result;
10709}
10710
Alexander Belopolsky40018472011-02-26 01:02:56 +000010711static PyObject *
10712unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010714 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 Py_ssize_t isize;
10716 Py_ssize_t osize, squote, dquote, i, o;
10717 Py_UCS4 max, quote;
10718 int ikind, okind;
10719 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010722 return NULL;
10723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 isize = PyUnicode_GET_LENGTH(unicode);
10725 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 /* Compute length of output, quote characters, and
10728 maximum character */
10729 osize = 2; /* quotes */
10730 max = 127;
10731 squote = dquote = 0;
10732 ikind = PyUnicode_KIND(unicode);
10733 for (i = 0; i < isize; i++) {
10734 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10735 switch (ch) {
10736 case '\'': squote++; osize++; break;
10737 case '"': dquote++; osize++; break;
10738 case '\\': case '\t': case '\r': case '\n':
10739 osize += 2; break;
10740 default:
10741 /* Fast-path ASCII */
10742 if (ch < ' ' || ch == 0x7f)
10743 osize += 4; /* \xHH */
10744 else if (ch < 0x7f)
10745 osize++;
10746 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10747 osize++;
10748 max = ch > max ? ch : max;
10749 }
10750 else if (ch < 0x100)
10751 osize += 4; /* \xHH */
10752 else if (ch < 0x10000)
10753 osize += 6; /* \uHHHH */
10754 else
10755 osize += 10; /* \uHHHHHHHH */
10756 }
10757 }
10758
10759 quote = '\'';
10760 if (squote) {
10761 if (dquote)
10762 /* Both squote and dquote present. Use squote,
10763 and escape them */
10764 osize += squote;
10765 else
10766 quote = '"';
10767 }
10768
10769 repr = PyUnicode_New(osize, max);
10770 if (repr == NULL)
10771 return NULL;
10772 okind = PyUnicode_KIND(repr);
10773 odata = PyUnicode_DATA(repr);
10774
10775 PyUnicode_WRITE(okind, odata, 0, quote);
10776 PyUnicode_WRITE(okind, odata, osize-1, quote);
10777
10778 for (i = 0, o = 1; i < isize; i++) {
10779 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010780
10781 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if ((ch == quote) || (ch == '\\')) {
10783 PyUnicode_WRITE(okind, odata, o++, '\\');
10784 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010785 continue;
10786 }
10787
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010789 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 PyUnicode_WRITE(okind, odata, o++, '\\');
10791 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010792 }
10793 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 PyUnicode_WRITE(okind, odata, o++, '\\');
10795 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010796 }
10797 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 PyUnicode_WRITE(okind, odata, o++, '\\');
10799 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010800 }
10801
10802 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010803 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 PyUnicode_WRITE(okind, odata, o++, '\\');
10805 PyUnicode_WRITE(okind, odata, o++, 'x');
10806 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10807 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010808 }
10809
Georg Brandl559e5d72008-06-11 18:37:52 +000010810 /* Copy ASCII characters as-is */
10811 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010813 }
10814
Benjamin Peterson29060642009-01-31 22:14:21 +000010815 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010816 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010818 (categories Z* and C* except ASCII space)
10819 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010821 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (ch <= 0xff) {
10823 PyUnicode_WRITE(okind, odata, o++, '\\');
10824 PyUnicode_WRITE(okind, odata, o++, 'x');
10825 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10826 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010827 }
10828 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 else if (ch >= 0x10000) {
10830 PyUnicode_WRITE(okind, odata, o++, '\\');
10831 PyUnicode_WRITE(okind, odata, o++, 'U');
10832 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10837 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10839 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010840 }
10841 /* Map 16-bit characters to '\uxxxx' */
10842 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 PyUnicode_WRITE(okind, odata, o++, '\\');
10844 PyUnicode_WRITE(okind, odata, o++, 'u');
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010849 }
10850 }
10851 /* Copy characters as-is */
10852 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010854 }
10855 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010858 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859}
10860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010861PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863\n\
10864Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010865such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866arguments start and end are interpreted as in slice notation.\n\
10867\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010868Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
10870static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872{
Jesus Ceaac451502011-04-20 17:09:23 +020010873 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010874 Py_ssize_t start;
10875 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
Jesus Ceaac451502011-04-20 17:09:23 +020010878 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10879 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 if (PyUnicode_READY(self) == -1)
10883 return NULL;
10884 if (PyUnicode_READY(substring) == -1)
10885 return NULL;
10886
10887 result = any_find_slice(
10888 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10889 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010890 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891
10892 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (result == -2)
10895 return NULL;
10896
Christian Heimes217cfd12007-12-02 14:31:20 +000010897 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898}
10899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010900PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010903Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Jesus Ceaac451502011-04-20 17:09:23 +020010908 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010909 Py_ssize_t start;
10910 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
Jesus Ceaac451502011-04-20 17:09:23 +020010913 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10914 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (PyUnicode_READY(self) == -1)
10918 return NULL;
10919 if (PyUnicode_READY(substring) == -1)
10920 return NULL;
10921
10922 result = any_find_slice(
10923 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10924 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926
10927 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 if (result == -2)
10930 return NULL;
10931
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932 if (result < 0) {
10933 PyErr_SetString(PyExc_ValueError, "substring not found");
10934 return NULL;
10935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936
Christian Heimes217cfd12007-12-02 14:31:20 +000010937 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
10939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010940PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010943Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010944done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946static PyObject *
10947unicode_rjust(PyUnicodeObject *self, PyObject *args)
10948{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010949 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 Py_UCS4 fillchar = ' ';
10951
10952 if (PyUnicode_READY(self) == -1)
10953 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010954
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010955 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
10957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959 Py_INCREF(self);
10960 return (PyObject*) self;
10961 }
10962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964}
10965
Alexander Belopolsky40018472011-02-26 01:02:56 +000010966PyObject *
10967PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968{
10969 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010970
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 s = PyUnicode_FromObject(s);
10972 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010973 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 if (sep != NULL) {
10975 sep = PyUnicode_FromObject(sep);
10976 if (sep == NULL) {
10977 Py_DECREF(s);
10978 return NULL;
10979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 }
10981
10982 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10983
10984 Py_DECREF(s);
10985 Py_XDECREF(sep);
10986 return result;
10987}
10988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010989PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991\n\
10992Return a list of the words in S, using sep as the\n\
10993delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010994splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010995whitespace string is a separator and empty strings are\n\
10996removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997
10998static PyObject*
10999unicode_split(PyUnicodeObject *self, PyObject *args)
11000{
11001 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011002 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Martin v. Löwis18e16552006-02-15 17:27:45 +000011004 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 return NULL;
11006
11007 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013}
11014
Thomas Wouters477c8d52006-05-27 19:21:47 +000011015PyObject *
11016PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11017{
11018 PyObject* str_obj;
11019 PyObject* sep_obj;
11020 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 int kind1, kind2, kind;
11022 void *buf1 = NULL, *buf2 = NULL;
11023 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024
11025 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011030 Py_DECREF(str_obj);
11031 return NULL;
11032 }
11033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 kind1 = PyUnicode_KIND(str_in);
11035 kind2 = PyUnicode_KIND(sep_obj);
11036 kind = kind1 > kind2 ? kind1 : kind2;
11037 buf1 = PyUnicode_DATA(str_in);
11038 if (kind1 != kind)
11039 buf1 = _PyUnicode_AsKind(str_in, kind);
11040 if (!buf1)
11041 goto onError;
11042 buf2 = PyUnicode_DATA(sep_obj);
11043 if (kind2 != kind)
11044 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11045 if (!buf2)
11046 goto onError;
11047 len1 = PyUnicode_GET_LENGTH(str_obj);
11048 len2 = PyUnicode_GET_LENGTH(sep_obj);
11049
11050 switch(PyUnicode_KIND(str_in)) {
11051 case PyUnicode_1BYTE_KIND:
11052 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11053 break;
11054 case PyUnicode_2BYTE_KIND:
11055 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11056 break;
11057 case PyUnicode_4BYTE_KIND:
11058 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11059 break;
11060 default:
11061 assert(0);
11062 out = 0;
11063 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011064
11065 Py_DECREF(sep_obj);
11066 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 if (kind1 != kind)
11068 PyMem_Free(buf1);
11069 if (kind2 != kind)
11070 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011071
11072 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 onError:
11074 Py_DECREF(sep_obj);
11075 Py_DECREF(str_obj);
11076 if (kind1 != kind && buf1)
11077 PyMem_Free(buf1);
11078 if (kind2 != kind && buf2)
11079 PyMem_Free(buf2);
11080 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081}
11082
11083
11084PyObject *
11085PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11086{
11087 PyObject* str_obj;
11088 PyObject* sep_obj;
11089 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 int kind1, kind2, kind;
11091 void *buf1 = NULL, *buf2 = NULL;
11092 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011093
11094 str_obj = PyUnicode_FromObject(str_in);
11095 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011096 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097 sep_obj = PyUnicode_FromObject(sep_in);
11098 if (!sep_obj) {
11099 Py_DECREF(str_obj);
11100 return NULL;
11101 }
11102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 kind1 = PyUnicode_KIND(str_in);
11104 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011105 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 buf1 = PyUnicode_DATA(str_in);
11107 if (kind1 != kind)
11108 buf1 = _PyUnicode_AsKind(str_in, kind);
11109 if (!buf1)
11110 goto onError;
11111 buf2 = PyUnicode_DATA(sep_obj);
11112 if (kind2 != kind)
11113 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11114 if (!buf2)
11115 goto onError;
11116 len1 = PyUnicode_GET_LENGTH(str_obj);
11117 len2 = PyUnicode_GET_LENGTH(sep_obj);
11118
11119 switch(PyUnicode_KIND(str_in)) {
11120 case PyUnicode_1BYTE_KIND:
11121 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11122 break;
11123 case PyUnicode_2BYTE_KIND:
11124 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11125 break;
11126 case PyUnicode_4BYTE_KIND:
11127 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11128 break;
11129 default:
11130 assert(0);
11131 out = 0;
11132 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011133
11134 Py_DECREF(sep_obj);
11135 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (kind1 != kind)
11137 PyMem_Free(buf1);
11138 if (kind2 != kind)
11139 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011140
11141 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 onError:
11143 Py_DECREF(sep_obj);
11144 Py_DECREF(str_obj);
11145 if (kind1 != kind && buf1)
11146 PyMem_Free(buf1);
11147 if (kind2 != kind && buf2)
11148 PyMem_Free(buf2);
11149 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150}
11151
11152PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011154\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011155Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011156the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011157found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
11159static PyObject*
11160unicode_partition(PyUnicodeObject *self, PyObject *separator)
11161{
11162 return PyUnicode_Partition((PyObject *)self, separator);
11163}
11164
11165PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011166 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011168Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011170separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
11172static PyObject*
11173unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11174{
11175 return PyUnicode_RPartition((PyObject *)self, separator);
11176}
11177
Alexander Belopolsky40018472011-02-26 01:02:56 +000011178PyObject *
11179PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011180{
11181 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011182
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011183 s = PyUnicode_FromObject(s);
11184 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011185 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 if (sep != NULL) {
11187 sep = PyUnicode_FromObject(sep);
11188 if (sep == NULL) {
11189 Py_DECREF(s);
11190 return NULL;
11191 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011192 }
11193
11194 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11195
11196 Py_DECREF(s);
11197 Py_XDECREF(sep);
11198 return result;
11199}
11200
11201PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011203\n\
11204Return a list of the words in S, using sep as the\n\
11205delimiter string, starting at the end of the string and\n\
11206working to the front. If maxsplit is given, at most maxsplit\n\
11207splits are done. If sep is not specified, any whitespace string\n\
11208is a separator.");
11209
11210static PyObject*
11211unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11212{
11213 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011214 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011215
Martin v. Löwis18e16552006-02-15 17:27:45 +000011216 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011217 return NULL;
11218
11219 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011221 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011223 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
11230Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011231Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011232is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233
11234static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011235unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011237 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011238 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011240 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11241 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 return NULL;
11243
Guido van Rossum86662912000-04-11 15:38:46 +000011244 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245}
11246
11247static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011248PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Walter Dörwald346737f2007-05-31 10:44:43 +000011250 if (PyUnicode_CheckExact(self)) {
11251 Py_INCREF(self);
11252 return self;
11253 } else
11254 /* Subtype -- return genuine unicode string with the same value. */
11255 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11256 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257}
11258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011259PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261\n\
11262Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011263and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264
11265static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011266unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 return fixup(self, fixswapcase);
11269}
11270
Georg Brandlceee0772007-11-27 23:48:05 +000011271PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011273\n\
11274Return a translation table usable for str.translate().\n\
11275If there is only one argument, it must be a dictionary mapping Unicode\n\
11276ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011277Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011278If there are two arguments, they must be strings of equal length, and\n\
11279in the resulting dictionary, each character in x will be mapped to the\n\
11280character at the same position in y. If there is a third argument, it\n\
11281must be a string, whose characters will be mapped to None in the result.");
11282
11283static PyObject*
11284unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11285{
11286 PyObject *x, *y = NULL, *z = NULL;
11287 PyObject *new = NULL, *key, *value;
11288 Py_ssize_t i = 0;
11289 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011290
Georg Brandlceee0772007-11-27 23:48:05 +000011291 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11292 return NULL;
11293 new = PyDict_New();
11294 if (!new)
11295 return NULL;
11296 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 int x_kind, y_kind, z_kind;
11298 void *x_data, *y_data, *z_data;
11299
Georg Brandlceee0772007-11-27 23:48:05 +000011300 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011301 if (!PyUnicode_Check(x)) {
11302 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11303 "be a string if there is a second argument");
11304 goto err;
11305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011307 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11308 "arguments must have equal length");
11309 goto err;
11310 }
11311 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 x_kind = PyUnicode_KIND(x);
11313 y_kind = PyUnicode_KIND(y);
11314 x_data = PyUnicode_DATA(x);
11315 y_data = PyUnicode_DATA(y);
11316 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11317 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11318 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011319 if (!key || !value)
11320 goto err;
11321 res = PyDict_SetItem(new, key, value);
11322 Py_DECREF(key);
11323 Py_DECREF(value);
11324 if (res < 0)
11325 goto err;
11326 }
11327 /* create entries for deleting chars in z */
11328 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 z_kind = PyUnicode_KIND(z);
11330 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011331 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011333 if (!key)
11334 goto err;
11335 res = PyDict_SetItem(new, key, Py_None);
11336 Py_DECREF(key);
11337 if (res < 0)
11338 goto err;
11339 }
11340 }
11341 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 int kind;
11343 void *data;
11344
Georg Brandlceee0772007-11-27 23:48:05 +000011345 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011346 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011347 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11348 "to maketrans it must be a dict");
11349 goto err;
11350 }
11351 /* copy entries into the new dict, converting string keys to int keys */
11352 while (PyDict_Next(x, &i, &key, &value)) {
11353 if (PyUnicode_Check(key)) {
11354 /* convert string keys to integer keys */
11355 PyObject *newkey;
11356 if (PyUnicode_GET_SIZE(key) != 1) {
11357 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11358 "table must be of length 1");
11359 goto err;
11360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 kind = PyUnicode_KIND(key);
11362 data = PyUnicode_DATA(key);
11363 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011364 if (!newkey)
11365 goto err;
11366 res = PyDict_SetItem(new, newkey, value);
11367 Py_DECREF(newkey);
11368 if (res < 0)
11369 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011370 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011371 /* just keep integer keys */
11372 if (PyDict_SetItem(new, key, value) < 0)
11373 goto err;
11374 } else {
11375 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11376 "be strings or integers");
11377 goto err;
11378 }
11379 }
11380 }
11381 return new;
11382 err:
11383 Py_DECREF(new);
11384 return NULL;
11385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389\n\
11390Return a copy of the string S, where all characters have been mapped\n\
11391through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011392Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011393Unmapped characters are left untouched. Characters mapped to None\n\
11394are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
11407static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011408unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 return fixup(self, fixupper);
11411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011416Pad a numeric string S with zeros on the left, to fill a field\n\
11417of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject *
11420unicode_zfill(PyUnicodeObject *self, PyObject *args)
11421{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011424 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 int kind;
11426 void *data;
11427 Py_UCS4 chr;
11428
11429 if (PyUnicode_READY(self) == -1)
11430 return NULL;
11431
Martin v. Löwis18e16552006-02-15 17:27:45 +000011432 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 return NULL;
11434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011436 if (PyUnicode_CheckExact(self)) {
11437 Py_INCREF(self);
11438 return (PyObject*) self;
11439 }
11440 else
11441 return PyUnicode_FromUnicode(
11442 PyUnicode_AS_UNICODE(self),
11443 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 }
11446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449 u = pad(self, fill, 0, '0');
11450
Walter Dörwald068325e2002-04-15 13:36:47 +000011451 if (u == NULL)
11452 return NULL;
11453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 kind = PyUnicode_KIND(u);
11455 data = PyUnicode_DATA(u);
11456 chr = PyUnicode_READ(kind, data, fill);
11457
11458 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 PyUnicode_WRITE(kind, data, 0, chr);
11461 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 }
11463
11464 return (PyObject*) u;
11465}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper that forwards
   to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011475PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011478Return True if S starts with the specified prefix, False otherwise.\n\
11479With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011480With optional end, stop comparing S at that position.\n\
11481prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
11483static PyObject *
11484unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011487 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011489 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011490 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011491 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Jesus Ceaac451502011-04-20 17:09:23 +020011493 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011495 if (PyTuple_Check(subobj)) {
11496 Py_ssize_t i;
11497 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11498 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500 if (substring == NULL)
11501 return NULL;
11502 result = tailmatch(self, substring, start, end, -1);
11503 Py_DECREF(substring);
11504 if (result) {
11505 Py_RETURN_TRUE;
11506 }
11507 }
11508 /* nothing matched */
11509 Py_RETURN_FALSE;
11510 }
11511 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011512 if (substring == NULL) {
11513 if (PyErr_ExceptionMatches(PyExc_TypeError))
11514 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11515 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011517 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011518 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011520 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521}
11522
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011527Return True if S ends with the specified suffix, False otherwise.\n\
11528With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011529With optional end, stop comparing S at that position.\n\
11530suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
11532static PyObject *
11533unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011536 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011538 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011539 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011540 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Jesus Ceaac451502011-04-20 17:09:23 +020011542 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011544 if (PyTuple_Check(subobj)) {
11545 Py_ssize_t i;
11546 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11547 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011551 result = tailmatch(self, substring, start, end, +1);
11552 Py_DECREF(substring);
11553 if (result) {
11554 Py_RETURN_TRUE;
11555 }
11556 }
11557 Py_RETURN_FALSE;
11558 }
11559 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011560 if (substring == NULL) {
11561 if (PyErr_ExceptionMatches(PyExc_TypeError))
11562 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11563 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011565 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011566 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011568 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569}
11570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011572
11573PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011575\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011576Return a formatted version of S, using substitutions from args and kwargs.\n\
11577The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011578
Eric Smith27bbca62010-11-04 17:06:58 +000011579PyDoc_STRVAR(format_map__doc__,
11580 "S.format_map(mapping) -> str\n\
11581\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011582Return a formatted version of S, using substitutions from mapping.\n\
11583The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011584
Eric Smith4a7d76d2008-05-30 18:10:19 +000011585static PyObject *
11586unicode__format__(PyObject* self, PyObject* args)
11587{
11588 PyObject *format_spec;
11589
11590 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11591 return NULL;
11592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11594 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011595}
11596
Eric Smith8c663262007-08-25 02:26:07 +000011597PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011599\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011600Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011601
11602static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011603unicode__sizeof__(PyUnicodeObject *v)
11604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 Py_ssize_t size;
11606
11607 /* If it's a compact object, account for base structure +
11608 character data. */
11609 if (PyUnicode_IS_COMPACT_ASCII(v))
11610 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11611 else if (PyUnicode_IS_COMPACT(v))
11612 size = sizeof(PyCompactUnicodeObject) +
11613 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11614 else {
11615 /* If it is a two-block object, account for base object, and
11616 for character block if present. */
11617 size = sizeof(PyUnicodeObject);
11618 if (v->data.any)
11619 size += (PyUnicode_GET_LENGTH(v) + 1) *
11620 PyUnicode_CHARACTER_SIZE(v);
11621 }
11622 /* If the wstr pointer is present, account for it unless it is shared
11623 with the data pointer. Since PyUnicode_DATA will crash if the object
11624 is not ready, check whether it's either not ready (in which case the
11625 data is entirely in wstr) or if the data is not shared. */
11626 if (_PyUnicode_WSTR(v) &&
11627 (!PyUnicode_IS_READY(v) ||
11628 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11629 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11630 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11631 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11632
11633 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011634}
11635
11636PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011638
11639static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011640unicode_getnewargs(PyUnicodeObject *v)
11641{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 PyObject *copy;
11643 unsigned char *data;
11644 int kind;
11645 if (PyUnicode_READY(v) == -1)
11646 return NULL;
11647 kind = PyUnicode_KIND(v);
11648 data = PyUnicode_1BYTE_DATA(v);
11649 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11650 if (!copy)
11651 return NULL;
11652 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011653}
11654
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655static PyMethodDef unicode_methods[] = {
11656
11657 /* Order is according to common usage: often used methods should
11658 appear first, since lookup is done sequentially. */
11659
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011660 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011661 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11662 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011663 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011664 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11665 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11666 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11667 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11668 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11669 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11670 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011671 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011672 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11673 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11674 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11677 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11678 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011680 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011681 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011682 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011683 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11684 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11685 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11686 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11687 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11688 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11689 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11690 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11691 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11692 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11693 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11694 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11695 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11696 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011697 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011698 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011699 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011700 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011701 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011702 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011703 {"maketrans", (PyCFunction) unicode_maketrans,
11704 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011705 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011706#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011707 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708#endif
11709
11710#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011711 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011712 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713#endif
11714
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 {NULL, NULL}
11717};
11718
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011719static PyObject *
11720unicode_mod(PyObject *v, PyObject *w)
11721{
Brian Curtindfc80e32011-08-10 20:28:54 -050011722 if (!PyUnicode_Check(v))
11723 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011725}
11726
11727static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 0, /*nb_add*/
11729 0, /*nb_subtract*/
11730 0, /*nb_multiply*/
11731 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011732};
11733
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011735 (lenfunc) unicode_length, /* sq_length */
11736 PyUnicode_Concat, /* sq_concat */
11737 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11738 (ssizeargfunc) unicode_getitem, /* sq_item */
11739 0, /* sq_slice */
11740 0, /* sq_ass_item */
11741 0, /* sq_ass_slice */
11742 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743};
11744
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011745static PyObject*
11746unicode_subscript(PyUnicodeObject* self, PyObject* item)
11747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (PyUnicode_READY(self) == -1)
11749 return NULL;
11750
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011751 if (PyIndex_Check(item)) {
11752 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011753 if (i == -1 && PyErr_Occurred())
11754 return NULL;
11755 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011757 return unicode_getitem(self, i);
11758 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011759 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011761 Py_UNICODE* result_buf;
11762 PyObject* result;
11763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011766 return NULL;
11767 }
11768
11769 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 return PyUnicode_New(0, 0);
11771 } else if (start == 0 && step == 1 &&
11772 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011773 PyUnicode_CheckExact(self)) {
11774 Py_INCREF(self);
11775 return (PyObject *)self;
11776 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011778 } else {
11779 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011780 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11781 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011782
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 if (result_buf == NULL)
11784 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011785
11786 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11787 result_buf[i] = source_buf[cur];
11788 }
Tim Petersced69f82003-09-16 20:30:58 +000011789
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011790 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011791 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011792 return result;
11793 }
11794 } else {
11795 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11796 return NULL;
11797 }
11798}
11799
11800static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011801 (lenfunc)unicode_length, /* mp_length */
11802 (binaryfunc)unicode_subscript, /* mp_subscript */
11803 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011804};
11805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807/* Helpers for PyUnicode_Format() */
11808
11809static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011810getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011812 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 (*p_argidx)++;
11815 if (arglen < 0)
11816 return args;
11817 else
11818 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 }
11820 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 return NULL;
11823}
11824
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011825/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011827static PyObject *
11828formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011830 char *p;
11831 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011833
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 x = PyFloat_AsDouble(v);
11835 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011836 return NULL;
11837
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011840
Eric Smith0923d1d2009-04-16 20:16:10 +000011841 p = PyOS_double_to_string(x, type, prec,
11842 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011843 if (p == NULL)
11844 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011846 PyMem_Free(p);
11847 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848}
11849
Tim Peters38fd5b62000-09-21 05:43:11 +000011850static PyObject*
11851formatlong(PyObject *val, int flags, int prec, int type)
11852{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011853 char *buf;
11854 int len;
11855 PyObject *str; /* temporary string object. */
11856 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011857
Benjamin Peterson14339b62009-01-31 16:36:08 +000011858 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11859 if (!str)
11860 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011862 Py_DECREF(str);
11863 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011864}
11865
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011868 size_t buflen,
11869 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011871 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011872 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (PyUnicode_GET_LENGTH(v) == 1) {
11874 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 buf[1] = '\0';
11876 return 1;
11877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 goto onError;
11879 }
11880 else {
11881 /* Integer input truncated to a character */
11882 long x;
11883 x = PyLong_AsLong(v);
11884 if (x == -1 && PyErr_Occurred())
11885 goto onError;
11886
11887 if (x < 0 || x > 0x10ffff) {
11888 PyErr_SetString(PyExc_OverflowError,
11889 "%c arg not in range(0x110000)");
11890 return -1;
11891 }
11892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011894 buf[1] = '\0';
11895 return 1;
11896 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011897
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011899 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011901 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902}
11903
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011904/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011905 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011906*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011907#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011908
Alexander Belopolsky40018472011-02-26 01:02:56 +000011909PyObject *
11910PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 void *fmt;
11913 int fmtkind;
11914 PyObject *result;
11915 Py_UCS4 *res, *res0;
11916 Py_UCS4 max;
11917 int kind;
11918 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011922
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 PyErr_BadInternalCall();
11925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11928 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 fmt = PyUnicode_DATA(uformat);
11931 fmtkind = PyUnicode_KIND(uformat);
11932 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11933 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11937 if (res0 == NULL) {
11938 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
11942 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 arglen = PyTuple_Size(args);
11944 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945 }
11946 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 arglen = -1;
11948 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011950 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011951 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
11954 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 if (--rescnt < 0) {
11957 rescnt = fmtcnt + 100;
11958 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11960 if (res0 == NULL){
11961 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 }
11964 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 }
11969 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 /* Got a format specifier */
11971 int flags = 0;
11972 Py_ssize_t width = -1;
11973 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 Py_UCS4 c = '\0';
11975 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 int isnumok;
11977 PyObject *v = NULL;
11978 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 void *pbuf;
11980 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 Py_ssize_t len, len1;
11983 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 fmtpos++;
11986 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11987 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 Py_ssize_t keylen;
11989 PyObject *key;
11990 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011991
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 if (dict == NULL) {
11993 PyErr_SetString(PyExc_TypeError,
11994 "format requires a mapping");
11995 goto onError;
11996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 /* Skip over balanced parentheses */
12001 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 if (fmtcnt < 0 || pcount > 0) {
12010 PyErr_SetString(PyExc_ValueError,
12011 "incomplete format key");
12012 goto onError;
12013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 if (key == NULL)
12016 goto onError;
12017 if (args_owned) {
12018 Py_DECREF(args);
12019 args_owned = 0;
12020 }
12021 args = PyObject_GetItem(dict, key);
12022 Py_DECREF(key);
12023 if (args == NULL) {
12024 goto onError;
12025 }
12026 args_owned = 1;
12027 arglen = -1;
12028 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 case '-': flags |= F_LJUST; continue;
12033 case '+': flags |= F_SIGN; continue;
12034 case ' ': flags |= F_BLANK; continue;
12035 case '#': flags |= F_ALT; continue;
12036 case '0': flags |= F_ZERO; continue;
12037 }
12038 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012039 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 if (c == '*') {
12041 v = getnextarg(args, arglen, &argidx);
12042 if (v == NULL)
12043 goto onError;
12044 if (!PyLong_Check(v)) {
12045 PyErr_SetString(PyExc_TypeError,
12046 "* wants int");
12047 goto onError;
12048 }
12049 width = PyLong_AsLong(v);
12050 if (width == -1 && PyErr_Occurred())
12051 goto onError;
12052 if (width < 0) {
12053 flags |= F_LJUST;
12054 width = -width;
12055 }
12056 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 }
12059 else if (c >= '0' && c <= '9') {
12060 width = c - '0';
12061 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 if (c < '0' || c > '9')
12064 break;
12065 if ((width*10) / 10 != width) {
12066 PyErr_SetString(PyExc_ValueError,
12067 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 }
12070 width = width*10 + (c - '0');
12071 }
12072 }
12073 if (c == '.') {
12074 prec = 0;
12075 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 if (c == '*') {
12078 v = getnextarg(args, arglen, &argidx);
12079 if (v == NULL)
12080 goto onError;
12081 if (!PyLong_Check(v)) {
12082 PyErr_SetString(PyExc_TypeError,
12083 "* wants int");
12084 goto onError;
12085 }
12086 prec = PyLong_AsLong(v);
12087 if (prec == -1 && PyErr_Occurred())
12088 goto onError;
12089 if (prec < 0)
12090 prec = 0;
12091 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 }
12094 else if (c >= '0' && c <= '9') {
12095 prec = c - '0';
12096 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 if (c < '0' || c > '9')
12099 break;
12100 if ((prec*10) / 10 != prec) {
12101 PyErr_SetString(PyExc_ValueError,
12102 "prec too big");
12103 goto onError;
12104 }
12105 prec = prec*10 + (c - '0');
12106 }
12107 }
12108 } /* prec */
12109 if (fmtcnt >= 0) {
12110 if (c == 'h' || c == 'l' || c == 'L') {
12111 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 }
12114 }
12115 if (fmtcnt < 0) {
12116 PyErr_SetString(PyExc_ValueError,
12117 "incomplete format");
12118 goto onError;
12119 }
12120 if (c != '%') {
12121 v = getnextarg(args, arglen, &argidx);
12122 if (v == NULL)
12123 goto onError;
12124 }
12125 sign = 0;
12126 fill = ' ';
12127 switch (c) {
12128
12129 case '%':
12130 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 len = 1;
12135 break;
12136
12137 case 's':
12138 case 'r':
12139 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012140 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 temp = v;
12142 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 }
12144 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 if (c == 's')
12146 temp = PyObject_Str(v);
12147 else if (c == 'r')
12148 temp = PyObject_Repr(v);
12149 else
12150 temp = PyObject_ASCII(v);
12151 if (temp == NULL)
12152 goto onError;
12153 if (PyUnicode_Check(temp))
12154 /* nothing to do */;
12155 else {
12156 Py_DECREF(temp);
12157 PyErr_SetString(PyExc_TypeError,
12158 "%s argument has non-string str()");
12159 goto onError;
12160 }
12161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (PyUnicode_READY(temp) == -1) {
12163 Py_CLEAR(temp);
12164 goto onError;
12165 }
12166 pbuf = PyUnicode_DATA(temp);
12167 kind = PyUnicode_KIND(temp);
12168 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 if (prec >= 0 && len > prec)
12170 len = prec;
12171 break;
12172
12173 case 'i':
12174 case 'd':
12175 case 'u':
12176 case 'o':
12177 case 'x':
12178 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012179 isnumok = 0;
12180 if (PyNumber_Check(v)) {
12181 PyObject *iobj=NULL;
12182
12183 if (PyLong_Check(v)) {
12184 iobj = v;
12185 Py_INCREF(iobj);
12186 }
12187 else {
12188 iobj = PyNumber_Long(v);
12189 }
12190 if (iobj!=NULL) {
12191 if (PyLong_Check(iobj)) {
12192 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012193 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 Py_DECREF(iobj);
12195 if (!temp)
12196 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (PyUnicode_READY(temp) == -1) {
12198 Py_CLEAR(temp);
12199 goto onError;
12200 }
12201 pbuf = PyUnicode_DATA(temp);
12202 kind = PyUnicode_KIND(temp);
12203 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 sign = 1;
12205 }
12206 else {
12207 Py_DECREF(iobj);
12208 }
12209 }
12210 }
12211 if (!isnumok) {
12212 PyErr_Format(PyExc_TypeError,
12213 "%%%c format: a number is required, "
12214 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12215 goto onError;
12216 }
12217 if (flags & F_ZERO)
12218 fill = '0';
12219 break;
12220
12221 case 'e':
12222 case 'E':
12223 case 'f':
12224 case 'F':
12225 case 'g':
12226 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012227 temp = formatfloat(v, flags, prec, c);
12228 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (PyUnicode_READY(temp) == -1) {
12231 Py_CLEAR(temp);
12232 goto onError;
12233 }
12234 pbuf = PyUnicode_DATA(temp);
12235 kind = PyUnicode_KIND(temp);
12236 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 sign = 1;
12238 if (flags & F_ZERO)
12239 fill = '0';
12240 break;
12241
12242 case 'c':
12243 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12246 if (len < 0)
12247 goto onError;
12248 break;
12249
12250 default:
12251 PyErr_Format(PyExc_ValueError,
12252 "unsupported format character '%c' (0x%x) "
12253 "at index %zd",
12254 (31<=c && c<=126) ? (char)c : '?',
12255 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 goto onError;
12258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 /* pbuf is initialized here. */
12260 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12263 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12264 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 len--;
12266 }
12267 else if (flags & F_SIGN)
12268 sign = '+';
12269 else if (flags & F_BLANK)
12270 sign = ' ';
12271 else
12272 sign = 0;
12273 }
12274 if (width < len)
12275 width = len;
12276 if (rescnt - (sign != 0) < width) {
12277 reslen -= rescnt;
12278 rescnt = width + fmtcnt + 100;
12279 reslen += rescnt;
12280 if (reslen < 0) {
12281 Py_XDECREF(temp);
12282 PyErr_NoMemory();
12283 goto onError;
12284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12286 if (res0 == 0) {
12287 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_XDECREF(temp);
12289 goto onError;
12290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 }
12293 if (sign) {
12294 if (fill != ' ')
12295 *res++ = sign;
12296 rescnt--;
12297 if (width > len)
12298 width--;
12299 }
12300 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12302 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12305 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 }
12307 rescnt -= 2;
12308 width -= 2;
12309 if (width < 0)
12310 width = 0;
12311 len -= 2;
12312 }
12313 if (width > len && !(flags & F_LJUST)) {
12314 do {
12315 --rescnt;
12316 *res++ = fill;
12317 } while (--width > len);
12318 }
12319 if (fill == ' ') {
12320 if (sign)
12321 *res++ = sign;
12322 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12324 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12325 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12326 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 }
12328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 /* Copy all characters, preserving len */
12330 len1 = len;
12331 while (len1--) {
12332 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12333 rescnt--;
12334 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 while (--width >= len) {
12336 --rescnt;
12337 *res++ = ' ';
12338 }
12339 if (dict && (argidx < arglen) && c != '%') {
12340 PyErr_SetString(PyExc_TypeError,
12341 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012342 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 goto onError;
12344 }
12345 Py_XDECREF(temp);
12346 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347 } /* until end */
12348 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 PyErr_SetString(PyExc_TypeError,
12350 "not all arguments converted during string formatting");
12351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352 }
12353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354
12355 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12356 if (*res > max)
12357 max = *res;
12358 result = PyUnicode_New(reslen - rescnt, max);
12359 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 kind = PyUnicode_KIND(result);
12362 for (res = res0; res < res0+reslen-rescnt; res++)
12363 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12364 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 }
12368 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 return (PyObject *)result;
12370
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 Py_DECREF(uformat);
12374 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 }
12377 return NULL;
12378}
12379
Jeremy Hylton938ace62002-07-17 16:30:39 +000012380static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012381unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12382
Tim Peters6d6c1a32001-08-02 04:15:00 +000012383static PyObject *
12384unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12385{
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 static char *kwlist[] = {"object", "encoding", "errors", 0};
12388 char *encoding = NULL;
12389 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012390
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 if (type != &PyUnicode_Type)
12392 return unicode_subtype_new(type, args, kwds);
12393 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 return NULL;
12396 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 if (encoding == NULL && errors == NULL)
12399 return PyObject_Str(x);
12400 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012402}
12403
Guido van Rossume023fe02001-08-30 03:12:59 +000012404static PyObject *
12405unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12406{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 PyUnicodeObject *tmp, *pnew;
12408 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012410
Benjamin Peterson14339b62009-01-31 16:36:08 +000012411 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12412 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12413 if (tmp == NULL)
12414 return NULL;
12415 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12417 // it seems kind of strange that tp_alloc gets passed the size
12418 // of the unicode string because there will follow another
12419 // malloc.
12420 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12421 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 if (pnew == NULL) {
12423 Py_DECREF(tmp);
12424 return NULL;
12425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12427 if (_PyUnicode_WSTR(pnew) == NULL) {
12428 err = PyErr_NoMemory();
12429 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12432 _PyUnicode_WSTR_LENGTH(pnew) = n;
12433 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12434 _PyUnicode_STATE(pnew).interned = 0;
12435 _PyUnicode_STATE(pnew).kind = 0;
12436 _PyUnicode_STATE(pnew).compact = 0;
12437 _PyUnicode_STATE(pnew).ready = 0;
12438 _PyUnicode_STATE(pnew).ascii = 0;
12439 pnew->data.any = NULL;
12440 _PyUnicode_LENGTH(pnew) = 0;
12441 pnew->_base.utf8 = NULL;
12442 pnew->_base.utf8_length = 0;
12443
12444 if (PyUnicode_READY(pnew) == -1) {
12445 PyObject_FREE(_PyUnicode_WSTR(pnew));
12446 goto onError;
12447 }
12448
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449 Py_DECREF(tmp);
12450 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451
12452 onError:
12453 _Py_ForgetReference((PyObject *)pnew);
12454 PyObject_Del(pnew);
12455 Py_DECREF(tmp);
12456 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012457}
12458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012459PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012461\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012462Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012463encoding defaults to the current default string encoding.\n\
12464errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012465
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012466static PyObject *unicode_iter(PyObject *seq);
12467
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012469 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 "str", /* tp_name */
12471 sizeof(PyUnicodeObject), /* tp_size */
12472 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 (destructor)unicode_dealloc, /* tp_dealloc */
12475 0, /* tp_print */
12476 0, /* tp_getattr */
12477 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012478 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012479 unicode_repr, /* tp_repr */
12480 &unicode_as_number, /* tp_as_number */
12481 &unicode_as_sequence, /* tp_as_sequence */
12482 &unicode_as_mapping, /* tp_as_mapping */
12483 (hashfunc) unicode_hash, /* tp_hash*/
12484 0, /* tp_call*/
12485 (reprfunc) unicode_str, /* tp_str */
12486 PyObject_GenericGetAttr, /* tp_getattro */
12487 0, /* tp_setattro */
12488 0, /* tp_as_buffer */
12489 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 unicode_doc, /* tp_doc */
12492 0, /* tp_traverse */
12493 0, /* tp_clear */
12494 PyUnicode_RichCompare, /* tp_richcompare */
12495 0, /* tp_weaklistoffset */
12496 unicode_iter, /* tp_iter */
12497 0, /* tp_iternext */
12498 unicode_methods, /* tp_methods */
12499 0, /* tp_members */
12500 0, /* tp_getset */
12501 &PyBaseObject_Type, /* tp_base */
12502 0, /* tp_dict */
12503 0, /* tp_descr_get */
12504 0, /* tp_descr_set */
12505 0, /* tp_dictoffset */
12506 0, /* tp_init */
12507 0, /* tp_alloc */
12508 unicode_new, /* tp_new */
12509 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510};
12511
12512/* Initialize the Unicode implementation */
12513
Thomas Wouters78890102000-07-22 19:25:51 +000012514void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012516 int i;
12517
Thomas Wouters477c8d52006-05-27 19:21:47 +000012518 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520 0x000A, /* LINE FEED */
12521 0x000D, /* CARRIAGE RETURN */
12522 0x001C, /* FILE SEPARATOR */
12523 0x001D, /* GROUP SEPARATOR */
12524 0x001E, /* RECORD SEPARATOR */
12525 0x0085, /* NEXT LINE */
12526 0x2028, /* LINE SEPARATOR */
12527 0x2029, /* PARAGRAPH SEPARATOR */
12528 };
12529
Fred Drakee4315f52000-05-09 19:53:39 +000012530 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012532 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012534
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012535 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012537 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539
12540 /* initialize the linebreak bloom filter */
12541 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 PyUnicode_2BYTE_KIND, linebreak,
12543 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012544
12545 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
12548/* Finalize the Unicode implementation */
12549
/* Free-list clearing hook.  This implementation does no work and always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12555
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556void
Thomas Wouters78890102000-07-22 19:25:51 +000012557_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012559 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012561 Py_XDECREF(unicode_empty);
12562 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012563
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012564 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 if (unicode_latin1[i]) {
12566 Py_DECREF(unicode_latin1[i]);
12567 unicode_latin1[i] = NULL;
12568 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012569 }
Christian Heimesa156e092008-02-16 07:38:31 +000012570 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012572
Walter Dörwald16807132007-05-25 13:52:07 +000012573void
12574PyUnicode_InternInPlace(PyObject **p)
12575{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012576 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12577 PyObject *t;
12578 if (s == NULL || !PyUnicode_Check(s))
12579 Py_FatalError(
12580 "PyUnicode_InternInPlace: unicode strings only please!");
12581 /* If it's a subclass, we don't really know what putting
12582 it in the interned dict might do. */
12583 if (!PyUnicode_CheckExact(s))
12584 return;
12585 if (PyUnicode_CHECK_INTERNED(s))
12586 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (PyUnicode_READY(s) == -1) {
12588 assert(0 && "ready fail in intern...");
12589 return;
12590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 if (interned == NULL) {
12592 interned = PyDict_New();
12593 if (interned == NULL) {
12594 PyErr_Clear(); /* Don't leave an exception */
12595 return;
12596 }
12597 }
12598 /* It might be that the GetItem call fails even
12599 though the key is present in the dictionary,
12600 namely when this happens during a stack overflow. */
12601 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012604
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 if (t) {
12606 Py_INCREF(t);
12607 Py_DECREF(*p);
12608 *p = t;
12609 return;
12610 }
Walter Dörwald16807132007-05-25 13:52:07 +000012611
Benjamin Peterson14339b62009-01-31 16:36:08 +000012612 PyThreadState_GET()->recursion_critical = 1;
12613 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12614 PyErr_Clear();
12615 PyThreadState_GET()->recursion_critical = 0;
12616 return;
12617 }
12618 PyThreadState_GET()->recursion_critical = 0;
12619 /* The two references in interned are not counted by refcnt.
12620 The deallocator will take care of this */
12621 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012623}
12624
12625void
12626PyUnicode_InternImmortal(PyObject **p)
12627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12629
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 PyUnicode_InternInPlace(p);
12631 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012633 Py_INCREF(*p);
12634 }
Walter Dörwald16807132007-05-25 13:52:07 +000012635}
12636
12637PyObject *
12638PyUnicode_InternFromString(const char *cp)
12639{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012640 PyObject *s = PyUnicode_FromString(cp);
12641 if (s == NULL)
12642 return NULL;
12643 PyUnicode_InternInPlace(&s);
12644 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012645}
12646
Alexander Belopolsky40018472011-02-26 01:02:56 +000012647void
12648_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012649{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 PyObject *keys;
12651 PyUnicodeObject *s;
12652 Py_ssize_t i, n;
12653 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012654
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 if (interned == NULL || !PyDict_Check(interned))
12656 return;
12657 keys = PyDict_Keys(interned);
12658 if (keys == NULL || !PyList_Check(keys)) {
12659 PyErr_Clear();
12660 return;
12661 }
Walter Dörwald16807132007-05-25 13:52:07 +000012662
Benjamin Peterson14339b62009-01-31 16:36:08 +000012663 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12664 detector, interned unicode strings are not forcibly deallocated;
12665 rather, we give them their stolen references back, and then clear
12666 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012667
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 n = PyList_GET_SIZE(keys);
12669 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 for (i = 0; i < n; i++) {
12672 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 if (PyUnicode_READY(s) == -1)
12674 fprintf(stderr, "could not ready string\n");
12675 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 case SSTATE_NOT_INTERNED:
12677 /* XXX Shouldn't happen */
12678 break;
12679 case SSTATE_INTERNED_IMMORTAL:
12680 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682 break;
12683 case SSTATE_INTERNED_MORTAL:
12684 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 break;
12687 default:
12688 Py_FatalError("Inconsistent interned string state.");
12689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 }
12692 fprintf(stderr, "total size of all interned strings: "
12693 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12694 "mortal/immortal\n", mortal_size, immortal_size);
12695 Py_DECREF(keys);
12696 PyDict_Clear(interned);
12697 Py_DECREF(interned);
12698 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012699}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012700
12701
12702/********************* Unicode Iterator **************************/
12703
12704typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 PyObject_HEAD
12706 Py_ssize_t it_index;
12707 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012708} unicodeiterobject;
12709
12710static void
12711unicodeiter_dealloc(unicodeiterobject *it)
12712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012713 _PyObject_GC_UNTRACK(it);
12714 Py_XDECREF(it->it_seq);
12715 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012716}
12717
12718static int
12719unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 Py_VISIT(it->it_seq);
12722 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723}
12724
12725static PyObject *
12726unicodeiter_next(unicodeiterobject *it)
12727{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 PyUnicodeObject *seq;
12729 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012730
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 assert(it != NULL);
12732 seq = it->it_seq;
12733 if (seq == NULL)
12734 return NULL;
12735 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12738 int kind = PyUnicode_KIND(seq);
12739 void *data = PyUnicode_DATA(seq);
12740 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12741 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 if (item != NULL)
12743 ++it->it_index;
12744 return item;
12745 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012746
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 Py_DECREF(seq);
12748 it->it_seq = NULL;
12749 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012750}
12751
12752static PyObject *
12753unicodeiter_len(unicodeiterobject *it)
12754{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 Py_ssize_t len = 0;
12756 if (it->it_seq)
12757 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12758 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012759}
12760
12761PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12762
12763static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012767};
12768
12769PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12771 "str_iterator", /* tp_name */
12772 sizeof(unicodeiterobject), /* tp_basicsize */
12773 0, /* tp_itemsize */
12774 /* methods */
12775 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12776 0, /* tp_print */
12777 0, /* tp_getattr */
12778 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012779 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 0, /* tp_repr */
12781 0, /* tp_as_number */
12782 0, /* tp_as_sequence */
12783 0, /* tp_as_mapping */
12784 0, /* tp_hash */
12785 0, /* tp_call */
12786 0, /* tp_str */
12787 PyObject_GenericGetAttr, /* tp_getattro */
12788 0, /* tp_setattro */
12789 0, /* tp_as_buffer */
12790 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12791 0, /* tp_doc */
12792 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12793 0, /* tp_clear */
12794 0, /* tp_richcompare */
12795 0, /* tp_weaklistoffset */
12796 PyObject_SelfIter, /* tp_iter */
12797 (iternextfunc)unicodeiter_next, /* tp_iternext */
12798 unicodeiter_methods, /* tp_methods */
12799 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012800};
12801
12802static PyObject *
12803unicode_iter(PyObject *seq)
12804{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012806
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 if (!PyUnicode_Check(seq)) {
12808 PyErr_BadInternalCall();
12809 return NULL;
12810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 if (PyUnicode_READY(seq) == -1)
12812 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12814 if (it == NULL)
12815 return NULL;
12816 it->it_index = 0;
12817 Py_INCREF(seq);
12818 it->it_seq = (PyUnicodeObject *)seq;
12819 _PyObject_GC_TRACK(it);
12820 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012821}
12822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823#define UNIOP(x) Py_UNICODE_##x
12824#define UNIOP_t Py_UNICODE
12825#include "uniops.h"
12826#undef UNIOP
12827#undef UNIOP_t
12828#define UNIOP(x) Py_UCS4_##x
12829#define UNIOP_t Py_UCS4
12830#include "uniops.h"
12831#undef UNIOP
12832#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012833
Victor Stinner71133ff2010-09-01 23:43:53 +000012834Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012835PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012836{
12837 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12838 Py_UNICODE *copy;
12839 Py_ssize_t size;
12840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841 if (!PyUnicode_Check(unicode)) {
12842 PyErr_BadArgument();
12843 return NULL;
12844 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012845 /* Ensure we won't overflow the size. */
12846 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12847 PyErr_NoMemory();
12848 return NULL;
12849 }
12850 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12851 size *= sizeof(Py_UNICODE);
12852 copy = PyMem_Malloc(size);
12853 if (copy == NULL) {
12854 PyErr_NoMemory();
12855 return NULL;
12856 }
12857 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12858 return copy;
12859}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012860
Georg Brandl66c221e2010-10-14 07:04:07 +000012861/* A _string module, to export formatter_parser and formatter_field_name_split
12862 to the string.Formatter class implemented in Python. */
12863
12864static PyMethodDef _string_methods[] = {
12865 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12866 METH_O, PyDoc_STR("split the argument as a field name")},
12867 {"formatter_parser", (PyCFunction) formatter_parser,
12868 METH_O, PyDoc_STR("parse the argument as a format string")},
12869 {NULL, NULL}
12870};
12871
12872static struct PyModuleDef _string_module = {
12873 PyModuleDef_HEAD_INIT,
12874 "_string",
12875 PyDoc_STR("string helper module"),
12876 0,
12877 _string_methods,
12878 NULL,
12879 NULL,
12880 NULL,
12881 NULL
12882};
12883
12884PyMODINIT_FUNC
12885PyInit__string(void)
12886{
12887 return PyModule_Create(&_string_module);
12888}
12889
12890
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012891#ifdef __cplusplus
12892}
12893#endif