blob: b61f0581b3204e173b79150528eacb5687d8d0b3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "stay alive" optimization.

   Objects parked on the free list keep their allocated Unicode memory
   intact as long as their size is below this limit, which reduces
   malloc() overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian is selected unless
   WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic character-width conversion helper.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters; to
   points to the output buffer and is treated as "to_type *".  Every
   source character is cast to to_type and stored, in order.  Note that
   the end argument is evaluated once per iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                              \
        const from_type *iter_ = (begin);                             \
        to_type *to_ = (to_type *)(to);                               \
        while (iter_ < (end)) {                                       \
            *to_ = (to_type)*iter_;                                   \
            ++iter_;                                                  \
            ++to_;                                                    \
        }                                                             \
    } while (0)
107
/* Raw field accessors.  These expand to plain member expressions on the
   casted object and perform no type check of their own. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors: assert that op passes PyUnicode_Check() before
   reading the kind / length field. */
#define _PyUnicode_KIND(op)                     \
    (assert(PyUnicode_Check(op)),               \
     _PyUnicode_STATE(op).kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(PyUnicode_Check(op)),               \
     _PyUnicode_LENGTH(op))
119
120
Walter Dörwald16807132007-05-25 13:52:07 +0000121/* This dictionary holds all interned unicode strings. Note that references
122 to strings in this dictionary are *not* counted in the string's ob_refcnt.
123 When the interned string reaches a refcnt of 0 the string deallocation
124 function will delete the reference from this dictionary.
125
126 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000127 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000128*/
129static PyObject *interned;
130
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000131/* The empty Unicode object is shared to improve performance. */
132static PyUnicodeObject *unicode_empty;
133
134/* Single character Unicode strings in the Latin-1 range are being
135 shared as well. */
136static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point; covers U+0000..U+007F (128 entries). */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F:
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point; covers U+0000..U+007F (128 entries).
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
225/* stuff to implement simple "bloom filters" for Unicode characters.
226 to keep things simple, we use a single bitmask, using the least 5
227 bits from each unicode characters as the bit index. */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241#define BLOOM_MASK unsigned long
242
243static BLOOM_MASK bloom_linebreak;
244
Antoine Pitrouf068f942010-01-13 14:19:12 +0000245#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
246#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247
Benjamin Peterson29060642009-01-31 22:14:21 +0000248#define BLOOM_LINEBREAK(ch) \
249 ((ch) < 128U ? ascii_linebreak[(ch)] : \
250 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
/* Membership test: the bloom bit is checked first, and only when it is
   set is the exact forward search with PyUnicode_FindChar() done. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) != 0                                              \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
363/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000364 Ux0000 terminated; some code (e.g. new_identifier)
365 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372#ifdef Py_DEBUG
373int unicode_old_new_calls = 0;
374#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Function wrappers around accessor macros, so that they can be
   called from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact / compact-ascii flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *
_PyUnicode_data(void *unicode)
{
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that
   is directly followed by a low surrogate is decoded into one code
   point; every other unit (lone surrogates included) is copied as is.
   The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            /* ordinary unit or lone surrogate */
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 unsigned int from_kind;
615 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200625 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200626 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627 PyErr_Format(PyExc_ValueError,
628 "Cannot write %zi characters at %zi "
629 "in a string of %zi characters",
630 how_many, to_start, PyUnicode_GET_LENGTH(to));
631 return -1;
632 }
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 from_kind = PyUnicode_KIND(from);
635 to_kind = PyUnicode_KIND(to);
636
637 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 /* fast path */
639 Py_MEMCPY((char*)PyUnicode_DATA(to)
640 + PyUnicode_KIND_SIZE(to_kind, to_start),
641 (char*)PyUnicode_DATA(from)
642 + PyUnicode_KIND_SIZE(from_kind, from_start),
643 PyUnicode_KIND_SIZE(to_kind, how_many));
644 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200646
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200647 if (from_kind > to_kind) {
648 /* slow path to check for character overflow */
649 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650 void *from_data = PyUnicode_DATA(from);
651 void *to_data = PyUnicode_DATA(to);
652 Py_UCS4 ch, maxchar;
653 Py_ssize_t i;
654 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 maxchar = 0;
Victor Stinner73f01c62011-09-28 22:28:04 +0200657 overflow = 0;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200658 for (i=0; i < how_many; i++) {
659 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
660 if (ch > maxchar) {
661 maxchar = ch;
662 if (maxchar > to_maxchar) {
663 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200666 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
668 }
669 if (!overflow)
670 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
673 {
674 _PyUnicode_CONVERT_BYTES(
675 Py_UCS1, Py_UCS2,
676 PyUnicode_1BYTE_DATA(from) + from_start,
677 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
678 PyUnicode_2BYTE_DATA(to) + to_start
679 );
680 return how_many;
681 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200682 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200683 && to_kind == PyUnicode_4BYTE_KIND)
684 {
685 _PyUnicode_CONVERT_BYTES(
686 Py_UCS1, Py_UCS4,
687 PyUnicode_1BYTE_DATA(from) + from_start,
688 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
689 PyUnicode_4BYTE_DATA(to) + to_start
690 );
691 return how_many;
692 }
693 else if (from_kind == PyUnicode_2BYTE_KIND
694 && to_kind == PyUnicode_4BYTE_KIND)
695 {
696 _PyUnicode_CONVERT_BYTES(
697 Py_UCS2, Py_UCS4,
698 PyUnicode_2BYTE_DATA(from) + from_start,
699 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
700 PyUnicode_4BYTE_DATA(to) + to_start
701 );
702 return how_many;
703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200705 "Cannot copy UCS%u characters "
706 "into a string of UCS%u characters",
Victor Stinner157f83f2011-09-28 21:41:31 +0200707 1 << (from_kind - 1),
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200708 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 return -1;
710}
711
Victor Stinner17222162011-09-28 22:15:37 +0200712/* Find the maximum code point and count the number of surrogate pairs so a
713 correct string length can be computed before converting a string to UCS4.
714 This function counts single surrogates as a character and not as a pair.
715
716 Return 0 on success, or -1 on error. */
717static int
718find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
719 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720{
721 const wchar_t *iter;
722
723 if (num_surrogates == NULL || maxchar == NULL) {
724 PyErr_SetString(PyExc_SystemError,
725 "unexpected NULL arguments to "
726 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
727 return -1;
728 }
729
730 *num_surrogates = 0;
731 *maxchar = 0;
732
733 for (iter = begin; iter < end; ) {
734 if (*iter > *maxchar)
735 *maxchar = *iter;
736#if SIZEOF_WCHAR_T == 2
737 if (*iter >= 0xD800 && *iter <= 0xDBFF
738 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
739 {
740 Py_UCS4 surrogate_val;
741 surrogate_val = (((iter[0] & 0x3FF)<<10)
742 | (iter[1] & 0x3FF)) + 0x10000;
743 ++(*num_surrogates);
744 if (surrogate_val > *maxchar)
745 *maxchar = surrogate_val;
746 iter += 2;
747 }
748 else
749 iter++;
750#else
751 iter++;
752#endif
753 }
754 return 0;
755}
756
757#ifdef Py_DEBUG
758int unicode_ready_calls = 0;
759#endif
760
761int
762_PyUnicode_Ready(PyUnicodeObject *unicode)
763{
764 wchar_t *end;
765 Py_UCS4 maxchar = 0;
766 Py_ssize_t num_surrogates;
767#if SIZEOF_WCHAR_T == 2
768 Py_ssize_t length_wo_surrogates;
769#endif
770
771 assert(PyUnicode_Check(unicode));
772
773 if (unicode->data.any != NULL) {
774 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
775 return 0;
776 }
777
778 /* _PyUnicode_Ready() is only intented for old-style API usage where
779 * strings were created using _PyObject_New() and where no canonical
780 * representation (the str field) has been set yet aka strings
781 * which are not yet ready.
782 */
783 assert(_PyUnicode_WSTR(unicode) != NULL);
784 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
785 assert(!PyUnicode_IS_COMPACT(unicode));
786 assert(!PyUnicode_IS_READY(unicode));
787 /* Actually, it should neither be interned nor be anything else: */
788 assert(_PyUnicode_STATE(unicode).interned == 0);
789 assert(unicode->_base.utf8 == NULL);
790
791#ifdef Py_DEBUG
792 ++unicode_ready_calls;
793#endif
794
795 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200796 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200797 &maxchar,
798 &num_surrogates) == -1) {
799 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
800 return -1;
801 }
802
803 if (maxchar < 256) {
804 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
805 if (!unicode->data.any) {
806 PyErr_NoMemory();
807 return -1;
808 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200809 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 _PyUnicode_WSTR(unicode), end,
811 PyUnicode_1BYTE_DATA(unicode));
812 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
813 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
814 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
815 if (maxchar < 128) {
816 unicode->_base.utf8 = unicode->data.any;
817 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
818 }
819 else {
820 unicode->_base.utf8 = NULL;
821 unicode->_base.utf8_length = 0;
822 }
823 PyObject_FREE(_PyUnicode_WSTR(unicode));
824 _PyUnicode_WSTR(unicode) = NULL;
825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
826 }
827 /* In this case we might have to convert down from 4-byte native
828 wchar_t to 2-byte unicode. */
829 else if (maxchar < 65536) {
830 assert(num_surrogates == 0 &&
831 "FindMaxCharAndNumSurrogatePairs() messed up");
832
Victor Stinner506f5922011-09-28 22:34:18 +0200833#if SIZEOF_WCHAR_T == 2
834 /* We can share representations and are done. */
835 unicode->data.any = _PyUnicode_WSTR(unicode);
836 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
837 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
838 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
839 unicode->_base.utf8 = NULL;
840 unicode->_base.utf8_length = 0;
841#else
842 /* sizeof(wchar_t) == 4 */
843 unicode->data.any = PyObject_MALLOC(
844 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
845 if (!unicode->data.any) {
846 PyErr_NoMemory();
847 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 }
Victor Stinner506f5922011-09-28 22:34:18 +0200849 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
850 _PyUnicode_WSTR(unicode), end,
851 PyUnicode_2BYTE_DATA(unicode));
852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
855 unicode->_base.utf8 = NULL;
856 unicode->_base.utf8_length = 0;
857 PyObject_FREE(_PyUnicode_WSTR(unicode));
858 _PyUnicode_WSTR(unicode) = NULL;
859 _PyUnicode_WSTR_LENGTH(unicode) = 0;
860#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
863 else {
864#if SIZEOF_WCHAR_T == 2
865 /* in case the native representation is 2-bytes, we need to allocate a
866 new normalized 4-byte version. */
867 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
868 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
869 if (!unicode->data.any) {
870 PyErr_NoMemory();
871 return -1;
872 }
873 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
874 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
875 unicode->_base.utf8 = NULL;
876 unicode->_base.utf8_length = 0;
877 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
878 unicode) < 0) {
879 assert(0 && "ConvertWideCharToUCS4 failed");
880 return -1;
881 }
882 PyObject_FREE(_PyUnicode_WSTR(unicode));
883 _PyUnicode_WSTR(unicode) = NULL;
884 _PyUnicode_WSTR_LENGTH(unicode) = 0;
885#else
886 assert(num_surrogates == 0);
887
888 unicode->data.any = _PyUnicode_WSTR(unicode);
889 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
890 unicode->_base.utf8 = NULL;
891 unicode->_base.utf8_length = 0;
892 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
893#endif
894 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
895 }
896 _PyUnicode_STATE(unicode).ready = 1;
897 return 0;
898}
899
Alexander Belopolsky40018472011-02-26 01:02:56 +0000900static void
901unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000902{
Walter Dörwald16807132007-05-25 13:52:07 +0000903 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000904 case SSTATE_NOT_INTERNED:
905 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000906
Benjamin Peterson29060642009-01-31 22:14:21 +0000907 case SSTATE_INTERNED_MORTAL:
908 /* revive dead object temporarily for DelItem */
909 Py_REFCNT(unicode) = 3;
910 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
911 Py_FatalError(
912 "deletion of interned string failed");
913 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000914
Benjamin Peterson29060642009-01-31 22:14:21 +0000915 case SSTATE_INTERNED_IMMORTAL:
916 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000917
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 default:
919 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000920 }
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 if (_PyUnicode_WSTR(unicode) &&
923 (!PyUnicode_IS_READY(unicode) ||
924 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
925 PyObject_DEL(_PyUnicode_WSTR(unicode));
926 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
927 PyObject_DEL(unicode->_base.utf8);
928
929 if (PyUnicode_IS_COMPACT(unicode)) {
930 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931 }
932 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200933 if (unicode->data.any)
934 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000935 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 }
937}
938
Alexander Belopolsky40018472011-02-26 01:02:56 +0000939static int
940_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000941{
942 register PyUnicodeObject *v;
943
944 /* Argument checks */
945 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000946 PyErr_BadInternalCall();
947 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000948 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000949 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
951 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 PyErr_BadInternalCall();
953 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000954 }
955
956 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 possible since these are being shared.
958 The same goes for new-representation unicode objects or objects which
959 have already been readied.
960 For these, we simply return a fresh copy with the same Unicode content.
961 */
962 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
963 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
964 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000965 PyUnicodeObject *w = _PyUnicode_New(length);
966 if (w == NULL)
967 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
969 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000970 Py_DECREF(*unicode);
971 *unicode = w;
972 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000973 }
974
975 /* Note that we don't have to modify *unicode for unshared Unicode
976 objects, since we can modify them in-place. */
977 return unicode_resize(v, length);
978}
979
Alexander Belopolsky40018472011-02-26 01:02:56 +0000980int
981PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000982{
983 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
984}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986static PyObject*
987get_latin1_char(unsigned char ch)
988{
989 PyUnicodeObject *unicode = unicode_latin1[ch];
990 if (!unicode) {
991 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
992 if (!unicode)
993 return NULL;
994 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
995 unicode_latin1[ch] = unicode;
996 }
997 Py_INCREF(unicode);
998 return (PyObject *)unicode;
999}
1000
Alexander Belopolsky40018472011-02-26 01:02:56 +00001001PyObject *
1002PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001003{
1004 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 Py_UCS4 maxchar = 0;
1006 Py_ssize_t num_surrogates;
1007
1008 if (u == NULL)
1009 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001011 /* If the Unicode data is known at construction time, we can apply
1012 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001014 /* Optimization for empty strings */
1015 if (size == 0 && unicode_empty != NULL) {
1016 Py_INCREF(unicode_empty);
1017 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001018 }
Tim Petersced69f82003-09-16 20:30:58 +00001019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020 /* Single character Unicode objects in the Latin-1 range are
1021 shared when using this constructor */
1022 if (size == 1 && *u < 256)
1023 return get_latin1_char((unsigned char)*u);
1024
1025 /* If not empty and not single character, copy the Unicode data
1026 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001027 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028 &num_surrogates) == -1)
1029 return NULL;
1030
1031 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1032 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 if (!unicode)
1034 return NULL;
1035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 switch (PyUnicode_KIND(unicode)) {
1037 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001038 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1040 break;
1041 case PyUnicode_2BYTE_KIND:
1042#if Py_UNICODE_SIZE == 2
1043 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1044#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001045 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1047#endif
1048 break;
1049 case PyUnicode_4BYTE_KIND:
1050#if SIZEOF_WCHAR_T == 2
1051 /* This is the only case which has to process surrogates, thus
1052 a simple copy loop is not enough and we need a function. */
1053 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1054 Py_DECREF(unicode);
1055 return NULL;
1056 }
1057#else
1058 assert(num_surrogates == 0);
1059 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1060#endif
1061 break;
1062 default:
1063 assert(0 && "Impossible state");
1064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001065
1066 return (PyObject *)unicode;
1067}
1068
Alexander Belopolsky40018472011-02-26 01:02:56 +00001069PyObject *
1070PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001071{
1072 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001073
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 if (size < 0) {
1075 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001076 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001077 return NULL;
1078 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001079
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001080 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001081 some optimizations which share commonly used objects.
1082 Also, this means the input must be UTF-8, so fall back to the
1083 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001084 if (u != NULL) {
1085
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 /* Optimization for empty strings */
1087 if (size == 0 && unicode_empty != NULL) {
1088 Py_INCREF(unicode_empty);
1089 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001091
1092 /* Single characters are shared when using this constructor.
1093 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 if (size == 1 && Py_CHARMASK(*u) < 128)
1095 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001096
1097 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001098 }
1099
Walter Dörwald55507312007-05-18 13:12:10 +00001100 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001101 if (!unicode)
1102 return NULL;
1103
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001104 return (PyObject *)unicode;
1105}
1106
Alexander Belopolsky40018472011-02-26 01:02:56 +00001107PyObject *
1108PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001109{
1110 size_t size = strlen(u);
1111 if (size > PY_SSIZE_T_MAX) {
1112 PyErr_SetString(PyExc_OverflowError, "input too long");
1113 return NULL;
1114 }
1115
1116 return PyUnicode_FromStringAndSize(u, size);
1117}
1118
Victor Stinnere57b1c02011-09-28 22:20:48 +02001119static PyObject*
1120_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 PyObject *res;
1123 unsigned char max = 127;
1124 Py_ssize_t i;
1125 for (i = 0; i < size; i++) {
1126 if (u[i] & 0x80) {
1127 max = 255;
1128 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001129 }
1130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 res = PyUnicode_New(size, max);
1132 if (!res)
1133 return NULL;
1134 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1135 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001136}
1137
Victor Stinnere57b1c02011-09-28 22:20:48 +02001138static PyObject*
1139_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140{
1141 PyObject *res;
1142 Py_UCS2 max = 0;
1143 Py_ssize_t i;
1144 for (i = 0; i < size; i++)
1145 if (u[i] > max)
1146 max = u[i];
1147 res = PyUnicode_New(size, max);
1148 if (!res)
1149 return NULL;
1150 if (max >= 256)
1151 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1152 else
1153 for (i = 0; i < size; i++)
1154 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1155 return res;
1156}
1157
Victor Stinnere57b1c02011-09-28 22:20:48 +02001158static PyObject*
1159_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160{
1161 PyObject *res;
1162 Py_UCS4 max = 0;
1163 Py_ssize_t i;
1164 for (i = 0; i < size; i++)
1165 if (u[i] > max)
1166 max = u[i];
1167 res = PyUnicode_New(size, max);
1168 if (!res)
1169 return NULL;
1170 if (max >= 0x10000)
1171 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1172 else {
1173 int kind = PyUnicode_KIND(res);
1174 void *data = PyUnicode_DATA(res);
1175 for (i = 0; i < size; i++)
1176 PyUnicode_WRITE(kind, data, i, u[i]);
1177 }
1178 return res;
1179}
1180
1181PyObject*
1182PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1183{
1184 switch(kind) {
1185 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001186 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001187 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001188 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001190 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 }
1192 assert(0);
1193 return NULL;
1194}
1195
1196
1197/* Widen Unicode objects to larger buffers.
1198 Return NULL if the string is too wide already. */
1199
1200void*
1201_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1202{
1203 Py_ssize_t i;
1204 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1205 void *d = PyUnicode_DATA(s);
1206 unsigned int skind = PyUnicode_KIND(s);
1207 if (PyUnicode_KIND(s) >= kind) {
1208 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1209 return NULL;
1210 }
1211 switch(kind) {
1212 case PyUnicode_2BYTE_KIND: {
1213 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1214 if (!result) {
1215 PyErr_NoMemory();
1216 return 0;
1217 }
1218 for (i = 0; i < len; i++)
1219 result[i] = ((Py_UCS1*)d)[i];
1220 return result;
1221 }
1222 case PyUnicode_4BYTE_KIND: {
1223 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1224 if (!result) {
1225 PyErr_NoMemory();
1226 return 0;
1227 }
1228 for (i = 0; i < len; i++)
1229 result[i] = PyUnicode_READ(skind, d, i);
1230 return result;
1231 }
1232 }
1233 Py_FatalError("invalid kind");
1234 return NULL;
1235}
1236
1237static Py_UCS4*
1238as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1239 int copy_null)
1240{
1241 int kind;
1242 void *data;
1243 Py_ssize_t len, targetlen;
1244 if (PyUnicode_READY(string) == -1)
1245 return NULL;
1246 kind = PyUnicode_KIND(string);
1247 data = PyUnicode_DATA(string);
1248 len = PyUnicode_GET_LENGTH(string);
1249 targetlen = len;
1250 if (copy_null)
1251 targetlen++;
1252 if (!target) {
1253 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1254 PyErr_NoMemory();
1255 return NULL;
1256 }
1257 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1258 if (!target) {
1259 PyErr_NoMemory();
1260 return NULL;
1261 }
1262 }
1263 else {
1264 if (targetsize < targetlen) {
1265 PyErr_Format(PyExc_SystemError,
1266 "string is longer than the buffer");
1267 if (copy_null && 0 < targetsize)
1268 target[0] = 0;
1269 return NULL;
1270 }
1271 }
1272 if (kind != PyUnicode_4BYTE_KIND) {
1273 Py_ssize_t i;
1274 for (i = 0; i < len; i++)
1275 target[i] = PyUnicode_READ(kind, data, i);
1276 }
1277 else
1278 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1279 if (copy_null)
1280 target[len] = 0;
1281 return target;
1282}
1283
1284Py_UCS4*
1285PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1286 int copy_null)
1287{
1288 if (target == NULL || targetsize < 1) {
1289 PyErr_BadInternalCall();
1290 return NULL;
1291 }
1292 return as_ucs4(string, target, targetsize, copy_null);
1293}
1294
1295Py_UCS4*
1296PyUnicode_AsUCS4Copy(PyObject *string)
1297{
1298 return as_ucs4(string, NULL, 0, 1);
1299}
1300
1301#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001302
Alexander Belopolsky40018472011-02-26 01:02:56 +00001303PyObject *
1304PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001307 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001309 PyErr_BadInternalCall();
1310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 }
1312
Martin v. Löwis790465f2008-04-05 20:41:37 +00001313 if (size == -1) {
1314 size = wcslen(w);
1315 }
1316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318}
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001321
Walter Dörwald346737f2007-05-31 10:44:43 +00001322static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001323makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1324 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001325{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001326 *fmt++ = '%';
1327 if (width) {
1328 if (zeropad)
1329 *fmt++ = '0';
1330 fmt += sprintf(fmt, "%d", width);
1331 }
1332 if (precision)
1333 fmt += sprintf(fmt, ".%d", precision);
1334 if (longflag)
1335 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001336 else if (longlongflag) {
1337 /* longlongflag should only ever be nonzero on machines with
1338 HAVE_LONG_LONG defined */
1339#ifdef HAVE_LONG_LONG
1340 char *f = PY_FORMAT_LONG_LONG;
1341 while (*f)
1342 *fmt++ = *f++;
1343#else
1344 /* we shouldn't ever get here */
1345 assert(0);
1346 *fmt++ = 'l';
1347#endif
1348 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001349 else if (size_tflag) {
1350 char *f = PY_FORMAT_SIZE_T;
1351 while (*f)
1352 *fmt++ = *f++;
1353 }
1354 *fmt++ = c;
1355 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001356}
1357
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' that starts a conversion specification.  The
   optional width, the optional ".precision" and the optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") are parsed;
   a length modifier is only consumed when it is directly followed by 'd',
   'u' or 'i'.  Each result is stored through the matching output pointer
   unless that pointer is NULL.

   Returns a pointer to the first character that was not consumed. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li and, with long long support, %lld, %llu
           and %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1422
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1430
Walter Dörwaldd2034312007-05-18 16:29:38 +00001431PyObject *
1432PyUnicode_FromFormatV(const char *format, va_list vargs)
1433{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001434 va_list count;
1435 Py_ssize_t callcount = 0;
1436 PyObject **callresults = NULL;
1437 PyObject **callresult = NULL;
1438 Py_ssize_t n = 0;
1439 int width = 0;
1440 int precision = 0;
1441 int zeropad;
1442 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001444 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001445 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1447 Py_UCS4 argmaxchar;
1448 Py_ssize_t numbersize = 0;
1449 char *numberresults = NULL;
1450 char *numberresult = NULL;
1451 Py_ssize_t i;
1452 int kind;
1453 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001454
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001455 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001456 /* step 1: count the number of %S/%R/%A/%s format specifications
1457 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1458 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 * result in an array)
1460 * also esimate a upper bound for all the number formats in the string,
1461 * numbers will be formated in step 3 and be keept in a '\0'-separated
1462 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001463 for (f = format; *f; f++) {
1464 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001465 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1467 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1468 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1469 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001472#ifdef HAVE_LONG_LONG
1473 if (longlongflag) {
1474 if (width < MAX_LONG_LONG_CHARS)
1475 width = MAX_LONG_LONG_CHARS;
1476 }
1477 else
1478#endif
1479 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1480 including sign. Decimal takes the most space. This
1481 isn't enough for octal. If a width is specified we
1482 need more (which we allocate later). */
1483 if (width < MAX_LONG_CHARS)
1484 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485
1486 /* account for the size + '\0' to separate numbers
1487 inside of the numberresults buffer */
1488 numbersize += (width + 1);
1489 }
1490 }
1491 else if ((unsigned char)*f > 127) {
1492 PyErr_Format(PyExc_ValueError,
1493 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1494 "string, got a non-ASCII byte: 0x%02x",
1495 (unsigned char)*f);
1496 return NULL;
1497 }
1498 }
1499 /* step 2: allocate memory for the results of
1500 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1501 if (callcount) {
1502 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1503 if (!callresults) {
1504 PyErr_NoMemory();
1505 return NULL;
1506 }
1507 callresult = callresults;
1508 }
1509 /* step 2.5: allocate memory for the results of formatting numbers */
1510 if (numbersize) {
1511 numberresults = PyObject_Malloc(numbersize);
1512 if (!numberresults) {
1513 PyErr_NoMemory();
1514 goto fail;
1515 }
1516 numberresult = numberresults;
1517 }
1518
1519 /* step 3: format numbers and figure out how large a buffer we need */
1520 for (f = format; *f; f++) {
1521 if (*f == '%') {
1522 const char* p;
1523 int longflag;
1524 int longlongflag;
1525 int size_tflag;
1526 int numprinted;
1527
1528 p = f;
1529 zeropad = (f[1] == '0');
1530 f = parse_format_flags(f, &width, &precision,
1531 &longflag, &longlongflag, &size_tflag);
1532 switch (*f) {
1533 case 'c':
1534 {
1535 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001536 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 n++;
1538 break;
1539 }
1540 case '%':
1541 n++;
1542 break;
1543 case 'i':
1544 case 'd':
1545 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1546 width, precision, *f);
1547 if (longflag)
1548 numprinted = sprintf(numberresult, fmt,
1549 va_arg(count, long));
1550#ifdef HAVE_LONG_LONG
1551 else if (longlongflag)
1552 numprinted = sprintf(numberresult, fmt,
1553 va_arg(count, PY_LONG_LONG));
1554#endif
1555 else if (size_tflag)
1556 numprinted = sprintf(numberresult, fmt,
1557 va_arg(count, Py_ssize_t));
1558 else
1559 numprinted = sprintf(numberresult, fmt,
1560 va_arg(count, int));
1561 n += numprinted;
1562 /* advance by +1 to skip over the '\0' */
1563 numberresult += (numprinted + 1);
1564 assert(*(numberresult - 1) == '\0');
1565 assert(*(numberresult - 2) != '\0');
1566 assert(numprinted >= 0);
1567 assert(numberresult <= numberresults + numbersize);
1568 break;
1569 case 'u':
1570 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1571 width, precision, 'u');
1572 if (longflag)
1573 numprinted = sprintf(numberresult, fmt,
1574 va_arg(count, unsigned long));
1575#ifdef HAVE_LONG_LONG
1576 else if (longlongflag)
1577 numprinted = sprintf(numberresult, fmt,
1578 va_arg(count, unsigned PY_LONG_LONG));
1579#endif
1580 else if (size_tflag)
1581 numprinted = sprintf(numberresult, fmt,
1582 va_arg(count, size_t));
1583 else
1584 numprinted = sprintf(numberresult, fmt,
1585 va_arg(count, unsigned int));
1586 n += numprinted;
1587 numberresult += (numprinted + 1);
1588 assert(*(numberresult - 1) == '\0');
1589 assert(*(numberresult - 2) != '\0');
1590 assert(numprinted >= 0);
1591 assert(numberresult <= numberresults + numbersize);
1592 break;
1593 case 'x':
1594 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1595 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1596 n += numprinted;
1597 numberresult += (numprinted + 1);
1598 assert(*(numberresult - 1) == '\0');
1599 assert(*(numberresult - 2) != '\0');
1600 assert(numprinted >= 0);
1601 assert(numberresult <= numberresults + numbersize);
1602 break;
1603 case 'p':
1604 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1605 /* %p is ill-defined: ensure leading 0x. */
1606 if (numberresult[1] == 'X')
1607 numberresult[1] = 'x';
1608 else if (numberresult[1] != 'x') {
1609 memmove(numberresult + 2, numberresult,
1610 strlen(numberresult) + 1);
1611 numberresult[0] = '0';
1612 numberresult[1] = 'x';
1613 numprinted += 2;
1614 }
1615 n += numprinted;
1616 numberresult += (numprinted + 1);
1617 assert(*(numberresult - 1) == '\0');
1618 assert(*(numberresult - 2) != '\0');
1619 assert(numprinted >= 0);
1620 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001621 break;
1622 case 's':
1623 {
1624 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001625 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001626 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1627 if (!str)
1628 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 /* since PyUnicode_DecodeUTF8 returns already flexible
1630 unicode objects, there is no need to call ready on them */
1631 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001632 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001634 /* Remember the str and switch to the next slot */
1635 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001636 break;
1637 }
1638 case 'U':
1639 {
1640 PyObject *obj = va_arg(count, PyObject *);
1641 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (PyUnicode_READY(obj) == -1)
1643 goto fail;
1644 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001645 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001647 break;
1648 }
1649 case 'V':
1650 {
1651 PyObject *obj = va_arg(count, PyObject *);
1652 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001653 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001654 assert(obj || str);
1655 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001656 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 if (PyUnicode_READY(obj) == -1)
1658 goto fail;
1659 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001660 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001662 *callresult++ = NULL;
1663 }
1664 else {
1665 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1666 if (!str_obj)
1667 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001669 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001670 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001671 *callresult++ = str_obj;
1672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001673 break;
1674 }
1675 case 'S':
1676 {
1677 PyObject *obj = va_arg(count, PyObject *);
1678 PyObject *str;
1679 assert(obj);
1680 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001682 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001684 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001686 /* Remember the str and switch to the next slot */
1687 *callresult++ = str;
1688 break;
1689 }
1690 case 'R':
1691 {
1692 PyObject *obj = va_arg(count, PyObject *);
1693 PyObject *repr;
1694 assert(obj);
1695 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001697 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001699 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001701 /* Remember the repr and switch to the next slot */
1702 *callresult++ = repr;
1703 break;
1704 }
1705 case 'A':
1706 {
1707 PyObject *obj = va_arg(count, PyObject *);
1708 PyObject *ascii;
1709 assert(obj);
1710 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001712 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001714 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 n += PyUnicode_GET_LENGTH(ascii);
                /* Remember the ascii() result and switch to the next slot */
1717 *callresult++ = ascii;
1718 break;
1719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001720 default:
1721 /* if we stumble upon an unknown
1722 formatting code, copy the rest of
1723 the format string to the output
1724 string. (we cannot just skip the
1725 code, since there's no way to know
1726 what's in the argument list) */
1727 n += strlen(p);
1728 goto expand;
1729 }
1730 } else
1731 n++;
1732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001733 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 we don't have to resize the string.
1737 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001739 if (!string)
1740 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 kind = PyUnicode_KIND(string);
1742 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001743 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001747 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001748 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001749
1750 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
            /* i may legitimately equal the length here (hence <= rather
               than <): if the last argument expands to an empty string, i
               already points at the end; the exact length is asserted after
               the loop */
1755 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001756
Benjamin Peterson14339b62009-01-31 16:36:08 +00001757 switch (*f) {
1758 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001759 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 const int ordinal = va_arg(vargs, int);
1761 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001762 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001763 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001764 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001765 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001766 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 case 'p':
1769 /* unused, since we already have the result */
1770 if (*f == 'p')
1771 (void) va_arg(vargs, void *);
1772 else
1773 (void) va_arg(vargs, int);
1774 /* extract the result from numberresults and append. */
1775 for (; *numberresult; ++i, ++numberresult)
1776 PyUnicode_WRITE(kind, data, i, *numberresult);
1777 /* skip over the separating '\0' */
1778 assert(*numberresult == '\0');
1779 numberresult++;
1780 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 break;
1782 case 's':
1783 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001784 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001786 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 size = PyUnicode_GET_LENGTH(*callresult);
1788 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001789 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1790 *callresult, 0,
1791 size) < 0)
1792 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001794 /* We're done with the unicode()/repr() => forget it */
1795 Py_DECREF(*callresult);
1796 /* switch to next unicode()/repr() result */
1797 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 break;
1799 }
1800 case 'U':
1801 {
1802 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 Py_ssize_t size;
1804 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1805 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001806 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1807 obj, 0,
1808 size) < 0)
1809 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001811 break;
1812 }
1813 case 'V':
1814 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001816 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001817 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 size = PyUnicode_GET_LENGTH(obj);
1820 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001821 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1822 obj, 0,
1823 size) < 0)
1824 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 size = PyUnicode_GET_LENGTH(*callresult);
1828 assert(PyUnicode_KIND(*callresult) <=
1829 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001830 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1831 *callresult,
1832 0, size) < 0)
1833 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001835 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001837 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001838 break;
1839 }
1840 case 'S':
1841 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001842 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001843 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001844 /* unused, since we already have the result */
1845 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001847 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1848 *callresult, 0,
1849 PyUnicode_GET_LENGTH(*callresult)) < 0)
1850 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001852 /* We're done with the unicode()/repr() => forget it */
1853 Py_DECREF(*callresult);
1854 /* switch to next unicode()/repr() result */
1855 ++callresult;
1856 break;
1857 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001860 break;
1861 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 for (; *p; ++p, ++i)
1863 PyUnicode_WRITE(kind, data, i, *p);
1864 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 goto end;
1866 }
Victor Stinner1205f272010-09-11 00:54:47 +00001867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 else {
1869 assert(i < PyUnicode_GET_LENGTH(string));
1870 PyUnicode_WRITE(kind, data, i++, *f);
1871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874
Benjamin Peterson29060642009-01-31 22:14:21 +00001875 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001876 if (callresults)
1877 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 if (numberresults)
1879 PyObject_Free(numberresults);
1880 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001882 if (callresults) {
1883 PyObject **callresult2 = callresults;
1884 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001885 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 ++callresult2;
1887 }
1888 PyObject_Free(callresults);
1889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 if (numberresults)
1891 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001892 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001893}
1894
Walter Dörwaldd2034312007-05-18 16:29:38 +00001895PyObject *
1896PyUnicode_FromFormat(const char *format, ...)
1897{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001898 PyObject* ret;
1899 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001900
1901#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001903#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001905#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 ret = PyUnicode_FromFormatV(format, vargs);
1907 va_end(vargs);
1908 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909}
1910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911#ifdef HAVE_WCHAR_H
1912
Victor Stinner5593d8a2010-10-02 11:11:27 +00001913/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1914 convert a Unicode object to a wide character string.
1915
Victor Stinnerd88d9832011-09-06 02:00:05 +02001916 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001917 character) required to convert the unicode object. Ignore size argument.
1918
Victor Stinnerd88d9832011-09-06 02:00:05 +02001919 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001920 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001921 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001922static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001923unicode_aswidechar(PyUnicodeObject *unicode,
1924 wchar_t *w,
1925 Py_ssize_t size)
1926{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001927 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928 const wchar_t *wstr;
1929
1930 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1931 if (wstr == NULL)
1932 return -1;
1933
Victor Stinner5593d8a2010-10-02 11:11:27 +00001934 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001935 if (size > res)
1936 size = res + 1;
1937 else
1938 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001940 return res;
1941 }
1942 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001944}
1945
1946Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001947PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001948 wchar_t *w,
1949 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950{
1951 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001952 PyErr_BadInternalCall();
1953 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001955 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956}
1957
Victor Stinner137c34c2010-09-29 10:25:54 +00001958wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001959PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001960 Py_ssize_t *size)
1961{
1962 wchar_t* buffer;
1963 Py_ssize_t buflen;
1964
1965 if (unicode == NULL) {
1966 PyErr_BadInternalCall();
1967 return NULL;
1968 }
1969
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001970 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 if (buflen == -1)
1972 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001973 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001974 PyErr_NoMemory();
1975 return NULL;
1976 }
1977
Victor Stinner137c34c2010-09-29 10:25:54 +00001978 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1979 if (buffer == NULL) {
1980 PyErr_NoMemory();
1981 return NULL;
1982 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001983 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 if (buflen == -1)
1985 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001986 if (size != NULL)
1987 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001988 return buffer;
1989}
1990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
Alexander Belopolsky40018472011-02-26 01:02:56 +00001993PyObject *
1994PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001997 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001998 PyErr_SetString(PyExc_ValueError,
1999 "chr() arg not in range(0x110000)");
2000 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002001 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 if (ordinal < 256)
2004 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 v = PyUnicode_New(1, ordinal);
2007 if (v == NULL)
2008 return NULL;
2009 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2010 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002011}
2012
Alexander Belopolsky40018472011-02-26 01:02:56 +00002013PyObject *
2014PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002016 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002017 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002018 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 Py_INCREF(obj);
2020 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002021 }
2022 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002023 /* For a Unicode subtype that's not a Unicode object,
2024 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (PyUnicode_READY(obj) == -1)
2026 return NULL;
2027 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002028 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002029 PyErr_Format(PyExc_TypeError,
2030 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002031 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002032 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002033}
2034
Alexander Belopolsky40018472011-02-26 01:02:56 +00002035PyObject *
2036PyUnicode_FromEncodedObject(register PyObject *obj,
2037 const char *encoding,
2038 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002039{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002040 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002041 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002042
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002044 PyErr_BadInternalCall();
2045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002047
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002048 /* Decoding bytes objects is the most common case and should be fast */
2049 if (PyBytes_Check(obj)) {
2050 if (PyBytes_GET_SIZE(obj) == 0) {
2051 Py_INCREF(unicode_empty);
2052 v = (PyObject *) unicode_empty;
2053 }
2054 else {
2055 v = PyUnicode_Decode(
2056 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2057 encoding, errors);
2058 }
2059 return v;
2060 }
2061
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002062 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 PyErr_SetString(PyExc_TypeError,
2064 "decoding str is not supported");
2065 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002066 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002067
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002068 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2069 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2070 PyErr_Format(PyExc_TypeError,
2071 "coercing to str: need bytes, bytearray "
2072 "or buffer-like object, %.80s found",
2073 Py_TYPE(obj)->tp_name);
2074 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002075 }
Tim Petersced69f82003-09-16 20:30:58 +00002076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002077 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002079 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 }
Tim Petersced69f82003-09-16 20:30:58 +00002081 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002082 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002083
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002084 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002085 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086}
2087
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   are lower-cased and '_' becomes '-', so that e.g. UTF_8 is recognized.
   `lower_len` is the capacity of `lower` including the terminating null.

   Return 1 on success, 0 if `encoding` does not fit (it is longer than
   lower_len-1 characters); in that case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the output buffer, reserved for the terminating null */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2120
Alexander Belopolsky40018472011-02-26 01:02:56 +00002121PyObject *
2122PyUnicode_Decode(const char *s,
2123 Py_ssize_t size,
2124 const char *encoding,
2125 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002126{
2127 PyObject *buffer = NULL, *unicode;
2128 Py_buffer info;
2129 char lower[11]; /* Enough for any encoding shortcut */
2130
2131 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002132 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002133
2134 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002135 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002136 if ((strcmp(lower, "utf-8") == 0) ||
2137 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002138 return PyUnicode_DecodeUTF8(s, size, errors);
2139 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002140 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002141 (strcmp(lower, "iso-8859-1") == 0))
2142 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002143#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002144 else if (strcmp(lower, "mbcs") == 0)
2145 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002146#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002147 else if (strcmp(lower, "ascii") == 0)
2148 return PyUnicode_DecodeASCII(s, size, errors);
2149 else if (strcmp(lower, "utf-16") == 0)
2150 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2151 else if (strcmp(lower, "utf-32") == 0)
2152 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002156 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002157 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002158 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002159 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 if (buffer == NULL)
2161 goto onError;
2162 unicode = PyCodec_Decode(buffer, encoding, errors);
2163 if (unicode == NULL)
2164 goto onError;
2165 if (!PyUnicode_Check(unicode)) {
2166 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002167 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002168 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002169 Py_DECREF(unicode);
2170 goto onError;
2171 }
2172 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (PyUnicode_READY(unicode)) {
2174 Py_DECREF(unicode);
2175 return NULL;
2176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002178
Benjamin Peterson29060642009-01-31 22:14:21 +00002179 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 Py_XDECREF(buffer);
2181 return NULL;
2182}
2183
Alexander Belopolsky40018472011-02-26 01:02:56 +00002184PyObject *
2185PyUnicode_AsDecodedObject(PyObject *unicode,
2186 const char *encoding,
2187 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002188{
2189 PyObject *v;
2190
2191 if (!PyUnicode_Check(unicode)) {
2192 PyErr_BadArgument();
2193 goto onError;
2194 }
2195
2196 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002197 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002198
2199 /* Decode via the codec registry */
2200 v = PyCodec_Decode(unicode, encoding, errors);
2201 if (v == NULL)
2202 goto onError;
2203 return v;
2204
Benjamin Peterson29060642009-01-31 22:14:21 +00002205 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002206 return NULL;
2207}
2208
Alexander Belopolsky40018472011-02-26 01:02:56 +00002209PyObject *
2210PyUnicode_AsDecodedUnicode(PyObject *unicode,
2211 const char *encoding,
2212 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002213{
2214 PyObject *v;
2215
2216 if (!PyUnicode_Check(unicode)) {
2217 PyErr_BadArgument();
2218 goto onError;
2219 }
2220
2221 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002222 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002223
2224 /* Decode via the codec registry */
2225 v = PyCodec_Decode(unicode, encoding, errors);
2226 if (v == NULL)
2227 goto onError;
2228 if (!PyUnicode_Check(v)) {
2229 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002230 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002231 Py_TYPE(v)->tp_name);
2232 Py_DECREF(v);
2233 goto onError;
2234 }
2235 return v;
2236
Benjamin Peterson29060642009-01-31 22:14:21 +00002237 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002238 return NULL;
2239}
2240
Alexander Belopolsky40018472011-02-26 01:02:56 +00002241PyObject *
2242PyUnicode_Encode(const Py_UNICODE *s,
2243 Py_ssize_t size,
2244 const char *encoding,
2245 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246{
2247 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002248
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 unicode = PyUnicode_FromUnicode(s, size);
2250 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2253 Py_DECREF(unicode);
2254 return v;
2255}
2256
Alexander Belopolsky40018472011-02-26 01:02:56 +00002257PyObject *
2258PyUnicode_AsEncodedObject(PyObject *unicode,
2259 const char *encoding,
2260 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002261{
2262 PyObject *v;
2263
2264 if (!PyUnicode_Check(unicode)) {
2265 PyErr_BadArgument();
2266 goto onError;
2267 }
2268
2269 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002270 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002271
2272 /* Encode via the codec registry */
2273 v = PyCodec_Encode(unicode, encoding, errors);
2274 if (v == NULL)
2275 goto onError;
2276 return v;
2277
Benjamin Peterson29060642009-01-31 22:14:21 +00002278 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002279 return NULL;
2280}
2281
Victor Stinnerad158722010-10-27 00:25:46 +00002282PyObject *
2283PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002284{
Victor Stinner99b95382011-07-04 14:23:54 +02002285#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002286 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2287 PyUnicode_GET_SIZE(unicode),
2288 NULL);
2289#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002291#else
Victor Stinner793b5312011-04-27 00:24:21 +02002292 PyInterpreterState *interp = PyThreadState_GET()->interp;
2293 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2294 cannot use it to encode and decode filenames before it is loaded. Load
2295 the Python codec requires to encode at least its own filename. Use the C
2296 version of the locale codec until the codec registry is initialized and
2297 the Python codec is loaded.
2298
2299 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2300 cannot only rely on it: check also interp->fscodec_initialized for
2301 subinterpreters. */
2302 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002303 return PyUnicode_AsEncodedString(unicode,
2304 Py_FileSystemDefaultEncoding,
2305 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002306 }
2307 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002308 /* locale encoding with surrogateescape */
2309 wchar_t *wchar;
2310 char *bytes;
2311 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002312 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002313
2314 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2315 if (wchar == NULL)
2316 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002317 bytes = _Py_wchar2char(wchar, &error_pos);
2318 if (bytes == NULL) {
2319 if (error_pos != (size_t)-1) {
2320 char *errmsg = strerror(errno);
2321 PyObject *exc = NULL;
2322 if (errmsg == NULL)
2323 errmsg = "Py_wchar2char() failed";
2324 raise_encode_exception(&exc,
2325 "filesystemencoding",
2326 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2327 error_pos, error_pos+1,
2328 errmsg);
2329 Py_XDECREF(exc);
2330 }
2331 else
2332 PyErr_NoMemory();
2333 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002334 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002335 }
2336 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002337
2338 bytes_obj = PyBytes_FromString(bytes);
2339 PyMem_Free(bytes);
2340 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002341 }
Victor Stinnerad158722010-10-27 00:25:46 +00002342#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002343}
2344
Alexander Belopolsky40018472011-02-26 01:02:56 +00002345PyObject *
2346PyUnicode_AsEncodedString(PyObject *unicode,
2347 const char *encoding,
2348 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349{
2350 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002351 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002352
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 if (!PyUnicode_Check(unicode)) {
2354 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 }
Fred Drakee4315f52000-05-09 19:53:39 +00002357
Victor Stinner2f283c22011-03-02 01:21:46 +00002358 if (encoding == NULL) {
2359 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002361 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002363 }
Fred Drakee4315f52000-05-09 19:53:39 +00002364
2365 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002366 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002367 if ((strcmp(lower, "utf-8") == 0) ||
2368 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002369 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002370 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002372 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002374 }
Victor Stinner37296e82010-06-10 13:36:23 +00002375 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002376 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002377 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002379#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002380 else if (strcmp(lower, "mbcs") == 0)
2381 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2382 PyUnicode_GET_SIZE(unicode),
2383 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002384#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002385 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388
2389 /* Encode via the codec registry */
2390 v = PyCodec_Encode(unicode, encoding, errors);
2391 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002392 return NULL;
2393
2394 /* The normal path */
2395 if (PyBytes_Check(v))
2396 return v;
2397
2398 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002399 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002400 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002401 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002402
2403 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2404 "encoder %s returned bytearray instead of bytes",
2405 encoding);
2406 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002407 Py_DECREF(v);
2408 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002409 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002410
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002411 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2412 Py_DECREF(v);
2413 return b;
2414 }
2415
2416 PyErr_Format(PyExc_TypeError,
2417 "encoder did not return a bytes object (type=%.400s)",
2418 Py_TYPE(v)->tp_name);
2419 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002420 return NULL;
2421}
2422
Alexander Belopolsky40018472011-02-26 01:02:56 +00002423PyObject *
2424PyUnicode_AsEncodedUnicode(PyObject *unicode,
2425 const char *encoding,
2426 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002427{
2428 PyObject *v;
2429
2430 if (!PyUnicode_Check(unicode)) {
2431 PyErr_BadArgument();
2432 goto onError;
2433 }
2434
2435 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002437
2438 /* Encode via the codec registry */
2439 v = PyCodec_Encode(unicode, encoding, errors);
2440 if (v == NULL)
2441 goto onError;
2442 if (!PyUnicode_Check(v)) {
2443 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002444 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002445 Py_TYPE(v)->tp_name);
2446 Py_DECREF(v);
2447 goto onError;
2448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Benjamin Peterson29060642009-01-31 22:14:21 +00002451 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 return NULL;
2453}
2454
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002455PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002456PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002457 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002458 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2459}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002460
Christian Heimes5894ba72007-11-04 11:43:14 +00002461PyObject*
2462PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2463{
Victor Stinner99b95382011-07-04 14:23:54 +02002464#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002465 return PyUnicode_DecodeMBCS(s, size, NULL);
2466#elif defined(__APPLE__)
2467 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2468#else
Victor Stinner793b5312011-04-27 00:24:21 +02002469 PyInterpreterState *interp = PyThreadState_GET()->interp;
2470 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2471 cannot use it to encode and decode filenames before it is loaded. Load
2472 the Python codec requires to encode at least its own filename. Use the C
2473 version of the locale codec until the codec registry is initialized and
2474 the Python codec is loaded.
2475
2476 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2477 cannot only rely on it: check also interp->fscodec_initialized for
2478 subinterpreters. */
2479 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002480 return PyUnicode_Decode(s, size,
2481 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002482 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002483 }
2484 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002485 /* locale encoding with surrogateescape */
2486 wchar_t *wchar;
2487 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002488 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002489
2490 if (s[size] != '\0' || size != strlen(s)) {
2491 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2492 return NULL;
2493 }
2494
Victor Stinner168e1172010-10-16 23:16:16 +00002495 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002496 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002497 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002498
Victor Stinner168e1172010-10-16 23:16:16 +00002499 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002500 PyMem_Free(wchar);
2501 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002502 }
Victor Stinnerad158722010-10-27 00:25:46 +00002503#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002504}
2505
Martin v. Löwis011e8422009-05-05 04:43:17 +00002506
2507int
2508PyUnicode_FSConverter(PyObject* arg, void* addr)
2509{
2510 PyObject *output = NULL;
2511 Py_ssize_t size;
2512 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002513 if (arg == NULL) {
2514 Py_DECREF(*(PyObject**)addr);
2515 return 1;
2516 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002517 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002518 output = arg;
2519 Py_INCREF(output);
2520 }
2521 else {
2522 arg = PyUnicode_FromObject(arg);
2523 if (!arg)
2524 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002525 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002526 Py_DECREF(arg);
2527 if (!output)
2528 return 0;
2529 if (!PyBytes_Check(output)) {
2530 Py_DECREF(output);
2531 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2532 return 0;
2533 }
2534 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002535 size = PyBytes_GET_SIZE(output);
2536 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002537 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002538 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002539 Py_DECREF(output);
2540 return 0;
2541 }
2542 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002543 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002544}
2545
2546
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002547int
2548PyUnicode_FSDecoder(PyObject* arg, void* addr)
2549{
2550 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002551 if (arg == NULL) {
2552 Py_DECREF(*(PyObject**)addr);
2553 return 1;
2554 }
2555 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 if (PyUnicode_READY(arg))
2557 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002558 output = arg;
2559 Py_INCREF(output);
2560 }
2561 else {
2562 arg = PyBytes_FromObject(arg);
2563 if (!arg)
2564 return 0;
2565 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2566 PyBytes_GET_SIZE(arg));
2567 Py_DECREF(arg);
2568 if (!output)
2569 return 0;
2570 if (!PyUnicode_Check(output)) {
2571 Py_DECREF(output);
2572 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2573 return 0;
2574 }
2575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2577 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002578 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2579 Py_DECREF(output);
2580 return 0;
2581 }
2582 *(PyObject**)addr = output;
2583 return Py_CLEANUP_SUPPORTED;
2584}
2585
2586
Martin v. Löwis5b222132007-06-10 09:51:05 +00002587char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002589{
Christian Heimesf3863112007-11-22 07:46:41 +00002590 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2592
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002593 if (!PyUnicode_Check(unicode)) {
2594 PyErr_BadArgument();
2595 return NULL;
2596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002598 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599
2600 if (_PyUnicode_UTF8(unicode) == NULL) {
2601 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2602 if (bytes == NULL)
2603 return NULL;
2604 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2605 if (u->_base.utf8 == NULL) {
2606 Py_DECREF(bytes);
2607 return NULL;
2608 }
2609 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2610 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2611 Py_DECREF(bytes);
2612 }
2613
2614 if (psize)
2615 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2616 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002617}
2618
2619char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2623}
2624
2625#ifdef Py_DEBUG
2626int unicode_as_unicode_calls = 0;
2627#endif
2628
2629
2630Py_UNICODE *
2631PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2632{
2633 PyUnicodeObject *u;
2634 const unsigned char *one_byte;
2635#if SIZEOF_WCHAR_T == 4
2636 const Py_UCS2 *two_bytes;
2637#else
2638 const Py_UCS4 *four_bytes;
2639 const Py_UCS4 *ucs4_end;
2640 Py_ssize_t num_surrogates;
2641#endif
2642 wchar_t *w;
2643 wchar_t *wchar_end;
2644
2645 if (!PyUnicode_Check(unicode)) {
2646 PyErr_BadArgument();
2647 return NULL;
2648 }
2649 u = (PyUnicodeObject*)unicode;
2650 if (_PyUnicode_WSTR(u) == NULL) {
2651 /* Non-ASCII compact unicode object */
2652 assert(_PyUnicode_KIND(u) != 0);
2653 assert(PyUnicode_IS_READY(u));
2654
2655#ifdef Py_DEBUG
2656 ++unicode_as_unicode_calls;
2657#endif
2658
2659 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2660#if SIZEOF_WCHAR_T == 2
2661 four_bytes = PyUnicode_4BYTE_DATA(u);
2662 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2663 num_surrogates = 0;
2664
2665 for (; four_bytes < ucs4_end; ++four_bytes) {
2666 if (*four_bytes > 0xFFFF)
2667 ++num_surrogates;
2668 }
2669
2670 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2671 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2672 if (!_PyUnicode_WSTR(u)) {
2673 PyErr_NoMemory();
2674 return NULL;
2675 }
2676 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2677
2678 w = _PyUnicode_WSTR(u);
2679 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2680 four_bytes = PyUnicode_4BYTE_DATA(u);
2681 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2682 if (*four_bytes > 0xFFFF) {
2683 /* encode surrogate pair in this case */
2684 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2685 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2686 }
2687 else
2688 *w = *four_bytes;
2689
2690 if (w > wchar_end) {
2691 assert(0 && "Miscalculated string end");
2692 }
2693 }
2694 *w = 0;
2695#else
2696 /* sizeof(wchar_t) == 4 */
2697 Py_FatalError("Impossible unicode object state, wstr and str "
2698 "should share memory already.");
2699 return NULL;
2700#endif
2701 }
2702 else {
2703 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2704 (_PyUnicode_LENGTH(u) + 1));
2705 if (!_PyUnicode_WSTR(u)) {
2706 PyErr_NoMemory();
2707 return NULL;
2708 }
2709 if (!PyUnicode_IS_COMPACT_ASCII(u))
2710 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2711 w = _PyUnicode_WSTR(u);
2712 wchar_end = w + _PyUnicode_LENGTH(u);
2713
2714 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2715 one_byte = PyUnicode_1BYTE_DATA(u);
2716 for (; w < wchar_end; ++one_byte, ++w)
2717 *w = *one_byte;
2718 /* null-terminate the wstr */
2719 *w = 0;
2720 }
2721 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2722#if SIZEOF_WCHAR_T == 4
2723 two_bytes = PyUnicode_2BYTE_DATA(u);
2724 for (; w < wchar_end; ++two_bytes, ++w)
2725 *w = *two_bytes;
2726 /* null-terminate the wstr */
2727 *w = 0;
2728#else
2729 /* sizeof(wchar_t) == 2 */
2730 PyObject_FREE(_PyUnicode_WSTR(u));
2731 _PyUnicode_WSTR(u) = NULL;
2732 Py_FatalError("Impossible unicode object state, wstr "
2733 "and str should share memory already.");
2734 return NULL;
2735#endif
2736 }
2737 else {
2738 assert(0 && "This should never happen.");
2739 }
2740 }
2741 }
2742 if (size != NULL)
2743 *size = PyUnicode_WSTR_LENGTH(u);
2744 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002745}
2746
Alexander Belopolsky40018472011-02-26 01:02:56 +00002747Py_UNICODE *
2748PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751}
2752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753
Alexander Belopolsky40018472011-02-26 01:02:56 +00002754Py_ssize_t
2755PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756{
2757 if (!PyUnicode_Check(unicode)) {
2758 PyErr_BadArgument();
2759 goto onError;
2760 }
2761 return PyUnicode_GET_SIZE(unicode);
2762
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 return -1;
2765}
2766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767Py_ssize_t
2768PyUnicode_GetLength(PyObject *unicode)
2769{
2770 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2771 PyErr_BadArgument();
2772 return -1;
2773 }
2774
2775 return PyUnicode_GET_LENGTH(unicode);
2776}
2777
2778Py_UCS4
2779PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2780{
2781 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2782 return PyErr_BadArgument();
2783 return (Py_UCS4)-1;
2784 }
2785 return PyUnicode_READ_CHAR(unicode, index);
2786}
2787
2788int
2789PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2790{
2791 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2792 return PyErr_BadArgument();
2793 return -1;
2794 }
2795
2796 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2797 index, ch);
2798 return 0;
2799}
2800
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2806
Victor Stinner554f3f02010-06-16 23:33:54 +00002807/* create or adjust a UnicodeDecodeError */
2808static void
2809make_decode_exception(PyObject **exceptionObject,
2810 const char *encoding,
2811 const char *input, Py_ssize_t length,
2812 Py_ssize_t startpos, Py_ssize_t endpos,
2813 const char *reason)
2814{
2815 if (*exceptionObject == NULL) {
2816 *exceptionObject = PyUnicodeDecodeError_Create(
2817 encoding, input, length, startpos, endpos, reason);
2818 }
2819 else {
2820 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2821 goto onError;
2822 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2823 goto onError;
2824 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2825 goto onError;
2826 }
2827 return;
2828
2829onError:
2830 Py_DECREF(*exceptionObject);
2831 *exceptionObject = NULL;
2832}
2833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834/* error handling callback helper:
2835 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002836 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002837 and adjust various state variables.
2838 return 0 on success, -1 on error
2839*/
2840
Alexander Belopolsky40018472011-02-26 01:02:56 +00002841static int
2842unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2843 const char *encoding, const char *reason,
2844 const char **input, const char **inend, Py_ssize_t *startinpos,
2845 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2846 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002848 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002849
2850 PyObject *restuple = NULL;
2851 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002853 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002854 Py_ssize_t requiredsize;
2855 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002857 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 int res = -1;
2860
2861 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 *errorHandler = PyCodec_LookupError(errors);
2863 if (*errorHandler == NULL)
2864 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 }
2866
Victor Stinner554f3f02010-06-16 23:33:54 +00002867 make_decode_exception(exceptionObject,
2868 encoding,
2869 *input, *inend - *input,
2870 *startinpos, *endinpos,
2871 reason);
2872 if (*exceptionObject == NULL)
2873 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874
2875 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2876 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002879 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002880 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 }
2882 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002884
2885 /* Copy back the bytes variables, which might have been modified by the
2886 callback */
2887 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2888 if (!inputobj)
2889 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002890 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002892 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002893 *input = PyBytes_AS_STRING(inputobj);
2894 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002895 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002896 /* we can DECREF safely, as the exception has another reference,
2897 so the object won't go away. */
2898 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002899
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002902 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2904 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906
2907 /* need more space? (at least enough for what we
2908 have+the replacement+the rest of the string (starting
2909 at the new input position), so we won't have to check space
2910 when there are no errors in the rest of the string) */
2911 repptr = PyUnicode_AS_UNICODE(repunicode);
2912 repsize = PyUnicode_GET_SIZE(repunicode);
2913 requiredsize = *outpos + repsize + insize-newpos;
2914 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 if (requiredsize<2*outsize)
2916 requiredsize = 2*outsize;
2917 if (_PyUnicode_Resize(output, requiredsize) < 0)
2918 goto onError;
2919 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 }
2921 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002922 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002923 Py_UNICODE_COPY(*outptr, repptr, repsize);
2924 *outptr += repsize;
2925 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002926
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927 /* we made it! */
2928 res = 0;
2929
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 Py_XDECREF(restuple);
2932 return res;
2933}
2934
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  They may evaluate their argument
   more than once, so pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written,
 * hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first, so utf7_category is only indexed
 * with values 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_DecodeUTF7(const char *s,
3018 Py_ssize_t size,
3019 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003020{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003021 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3022}
3023
Antoine Pitrou244651a2009-05-04 18:56:13 +00003024/* The decoder. The only state we preserve is our read position,
3025 * i.e. how many characters we have consumed. So if we end in the
3026 * middle of a shift sequence we have to back off the read position
3027 * and the output to the beginning of the sequence, otherwise we lose
3028 * all the shift state (seen bits, number of bits seen, high
3029 * surrogate). */
3030
Alexander Belopolsky40018472011-02-26 01:02:56 +00003031PyObject *
3032PyUnicode_DecodeUTF7Stateful(const char *s,
3033 Py_ssize_t size,
3034 const char *errors,
3035 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003036{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003038 Py_ssize_t startinpos;
3039 Py_ssize_t endinpos;
3040 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003041 const char *e;
3042 PyUnicodeObject *unicode;
3043 Py_UNICODE *p;
3044 const char *errmsg = "";
3045 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003046 Py_UNICODE *shiftOutStart;
3047 unsigned int base64bits = 0;
3048 unsigned long base64buffer = 0;
3049 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 PyObject *errorHandler = NULL;
3051 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003052
3053 unicode = _PyUnicode_New(size);
3054 if (!unicode)
3055 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003056 if (size == 0) {
3057 if (consumed)
3058 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003059 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003063 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003064 e = s + size;
3065
3066 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003069 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003070
Antoine Pitrou244651a2009-05-04 18:56:13 +00003071 if (inShift) { /* in a base-64 section */
3072 if (IS_BASE64(ch)) { /* consume a base-64 character */
3073 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3074 base64bits += 6;
3075 s++;
3076 if (base64bits >= 16) {
3077 /* we have enough bits for a UTF-16 value */
3078 Py_UNICODE outCh = (Py_UNICODE)
3079 (base64buffer >> (base64bits-16));
3080 base64bits -= 16;
3081 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3082 if (surrogate) {
3083 /* expecting a second surrogate */
3084 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3085#ifdef Py_UNICODE_WIDE
3086 *p++ = (((surrogate & 0x3FF)<<10)
3087 | (outCh & 0x3FF)) + 0x10000;
3088#else
3089 *p++ = surrogate;
3090 *p++ = outCh;
3091#endif
3092 surrogate = 0;
3093 }
3094 else {
3095 surrogate = 0;
3096 errmsg = "second surrogate missing";
3097 goto utf7Error;
3098 }
3099 }
3100 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3101 /* first surrogate */
3102 surrogate = outCh;
3103 }
3104 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3105 errmsg = "unexpected second surrogate";
3106 goto utf7Error;
3107 }
3108 else {
3109 *p++ = outCh;
3110 }
3111 }
3112 }
3113 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003114 inShift = 0;
3115 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003116 if (surrogate) {
3117 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003118 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003119 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003120 if (base64bits > 0) { /* left-over bits */
3121 if (base64bits >= 6) {
3122 /* We've seen at least one base-64 character */
3123 errmsg = "partial character in shift sequence";
3124 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003125 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003126 else {
3127 /* Some bits remain; they should be zero */
3128 if (base64buffer != 0) {
3129 errmsg = "non-zero padding bits in shift sequence";
3130 goto utf7Error;
3131 }
3132 }
3133 }
3134 if (ch != '-') {
3135 /* '-' is absorbed; other terminating
3136 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003137 *p++ = ch;
3138 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 }
3140 }
3141 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003143 s++; /* consume '+' */
3144 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003145 s++;
3146 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003147 }
3148 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003149 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003150 shiftOutStart = p;
3151 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003152 }
3153 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003154 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003155 *p++ = ch;
3156 s++;
3157 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003158 else {
3159 startinpos = s-starts;
3160 s++;
3161 errmsg = "unexpected special character";
3162 goto utf7Error;
3163 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003164 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003165utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 outpos = p-PyUnicode_AS_UNICODE(unicode);
3167 endinpos = s-starts;
3168 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 errors, &errorHandler,
3170 "utf7", errmsg,
3171 &starts, &e, &startinpos, &endinpos, &exc, &s,
3172 &unicode, &outpos, &p))
3173 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003174 }
3175
Antoine Pitrou244651a2009-05-04 18:56:13 +00003176 /* end of string */
3177
3178 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3179 /* if we're in an inconsistent state, that's an error */
3180 if (surrogate ||
3181 (base64bits >= 6) ||
3182 (base64bits > 0 && base64buffer != 0)) {
3183 outpos = p-PyUnicode_AS_UNICODE(unicode);
3184 endinpos = size;
3185 if (unicode_decode_call_errorhandler(
3186 errors, &errorHandler,
3187 "utf7", "unterminated shift sequence",
3188 &starts, &e, &startinpos, &endinpos, &exc, &s,
3189 &unicode, &outpos, &p))
3190 goto onError;
3191 if (s < e)
3192 goto restart;
3193 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003194 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003195
3196 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003197 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003198 if (inShift) {
3199 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003200 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003201 }
3202 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003203 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003204 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003205 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003206
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003207 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003208 goto onError;
3209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 Py_XDECREF(errorHandler);
3211 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212 if (PyUnicode_READY(unicode) == -1) {
3213 Py_DECREF(unicode);
3214 return NULL;
3215 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003216 return (PyObject *)unicode;
3217
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 Py_XDECREF(errorHandler);
3220 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003221 Py_DECREF(unicode);
3222 return NULL;
3223}
3224
3225
Alexander Belopolsky40018472011-02-26 01:02:56 +00003226PyObject *
3227PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3228 Py_ssize_t size,
3229 int base64SetO,
3230 int base64WhiteSpace,
3231 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003233 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003235 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003236 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003237 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003238 unsigned int base64bits = 0;
3239 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003240 char * out;
3241 char * start;
3242
3243 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003245
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003246 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003247 return PyErr_NoMemory();
3248
Antoine Pitrou244651a2009-05-04 18:56:13 +00003249 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003250 if (v == NULL)
3251 return NULL;
3252
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003253 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003254 for (;i < size; ++i) {
3255 Py_UNICODE ch = s[i];
3256
Antoine Pitrou244651a2009-05-04 18:56:13 +00003257 if (inShift) {
3258 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3259 /* shifting out */
3260 if (base64bits) { /* output remaining bits */
3261 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3262 base64buffer = 0;
3263 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003264 }
3265 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003266 /* Characters not in the BASE64 set implicitly unshift the sequence
3267 so no '-' is required, except if the character is itself a '-' */
3268 if (IS_BASE64(ch) || ch == '-') {
3269 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003270 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003271 *out++ = (char) ch;
3272 }
3273 else {
3274 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003275 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003276 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003277 else { /* not in a shift sequence */
3278 if (ch == '+') {
3279 *out++ = '+';
3280 *out++ = '-';
3281 }
3282 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3283 *out++ = (char) ch;
3284 }
3285 else {
3286 *out++ = '+';
3287 inShift = 1;
3288 goto encode_char;
3289 }
3290 }
3291 continue;
3292encode_char:
3293#ifdef Py_UNICODE_WIDE
3294 if (ch >= 0x10000) {
3295 /* code first surrogate */
3296 base64bits += 16;
3297 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3298 while (base64bits >= 6) {
3299 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3300 base64bits -= 6;
3301 }
3302 /* prepare second surrogate */
3303 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3304 }
3305#endif
3306 base64bits += 16;
3307 base64buffer = (base64buffer << 16) | ch;
3308 while (base64bits >= 6) {
3309 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3310 base64bits -= 6;
3311 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003312 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003313 if (base64bits)
3314 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3315 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003316 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003317 if (_PyBytes_Resize(&v, out - start) < 0)
3318 return NULL;
3319 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320}
3321
Antoine Pitrou244651a2009-05-04 18:56:13 +00003322#undef IS_BASE64
3323#undef FROM_BASE64
3324#undef TO_BASE64
3325#undef DECODE_DIRECT
3326#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003327
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328/* --- UTF-8 Codec -------------------------------------------------------- */
3329
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong-only leads C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3351
Alexander Belopolsky40018472011-02-26 01:02:56 +00003352PyObject *
3353PyUnicode_DecodeUTF8(const char *s,
3354 Py_ssize_t size,
3355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356{
Walter Dörwald69652032004-09-07 20:24:22 +00003357 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3358}
3359
Antoine Pitrouab868312009-01-10 15:40:25 +00003360/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3361#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3362
3363/* Mask to quickly check whether a C 'long' contains a
3364 non-ASCII, UTF8-encoded char. */
3365#if (SIZEOF_LONG == 8)
3366# define ASCII_CHAR_MASK 0x8080808080808080L
3367#elif (SIZEOF_LONG == 4)
3368# define ASCII_CHAR_MASK 0x80808080L
3369#else
3370# error C 'long' size should be either 4 or 8!
3371#endif
3372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373/* Scans a UTF-8 string and returns the maximum character to be expected,
3374 the size of the decoded unicode string and if any major errors were
3375 encountered.
3376
3377 This function does check basic UTF-8 sanity, it does however NOT CHECK
3378 if the string contains surrogates, and if all continuation bytes are
3379 within the correct ranges, these checks are performed in
3380 PyUnicode_DecodeUTF8Stateful.
3381
3382 If it sets has_errors to 1, it means the value of unicode_size and max_char
3383 will be bogus and you should not rely on useful information in them.
3384 */
3385static Py_UCS4
3386utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3387 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3388 int *has_errors)
3389{
3390 Py_ssize_t n;
3391 Py_ssize_t char_count = 0;
3392 Py_UCS4 max_char = 127, new_max;
3393 Py_UCS4 upper_bound;
3394 const unsigned char *p = (const unsigned char *)s;
3395 const unsigned char *end = p + string_size;
3396 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3397 int err = 0;
3398
3399 for (; p < end && !err; ++p, ++char_count) {
3400 /* Only check value if it's not a ASCII char... */
3401 if (*p < 0x80) {
3402 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3403 an explanation. */
3404 if (!((size_t) p & LONG_PTR_MASK)) {
3405 /* Help register allocation */
3406 register const unsigned char *_p = p;
3407 while (_p < aligned_end) {
3408 unsigned long value = *(unsigned long *) _p;
3409 if (value & ASCII_CHAR_MASK)
3410 break;
3411 _p += SIZEOF_LONG;
3412 char_count += SIZEOF_LONG;
3413 }
3414 p = _p;
3415 if (p == end)
3416 break;
3417 }
3418 }
3419 if (*p >= 0x80) {
3420 n = utf8_code_length[*p];
3421 new_max = max_char;
3422 switch (n) {
3423 /* invalid start byte */
3424 case 0:
3425 err = 1;
3426 break;
3427 case 2:
3428 /* Code points between 0x00FF and 0x07FF inclusive.
3429 Approximate the upper bound of the code point,
3430 if this flips over 255 we can be sure it will be more
3431 than 255 and the string will need 2 bytes per code coint,
3432 if it stays under or equal to 255, we can be sure 1 byte
3433 is enough.
3434 ((*p & 0b00011111) << 6) | 0b00111111 */
3435 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3436 if (max_char < upper_bound)
3437 new_max = upper_bound;
3438 /* Ensure we track at least that we left ASCII space. */
3439 if (new_max < 128)
3440 new_max = 128;
3441 break;
3442 case 3:
3443 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3444 always > 255 and <= 65535 and will always need 2 bytes. */
3445 if (max_char < 65535)
3446 new_max = 65535;
3447 break;
3448 case 4:
3449 /* Code point will be above 0xFFFF for sure in this case. */
3450 new_max = 65537;
3451 break;
3452 /* Internal error, this should be caught by the first if */
3453 case 1:
3454 default:
3455 assert(0 && "Impossible case in utf8_max_char_and_size");
3456 err = 1;
3457 }
3458 /* Instead of number of overall bytes for this code point,
3459 n containts the number of following bytes: */
3460 --n;
3461 /* Check if the follow up chars are all valid continuation bytes */
3462 if (n >= 1) {
3463 const unsigned char *cont;
3464 if ((p + n) >= end) {
3465 if (consumed == 0)
3466 /* incomplete data, non-incremental decoding */
3467 err = 1;
3468 break;
3469 }
3470 for (cont = p + 1; cont < (p + n); ++cont) {
3471 if ((*cont & 0xc0) != 0x80) {
3472 err = 1;
3473 break;
3474 }
3475 }
3476 p += n;
3477 }
3478 else
3479 err = 1;
3480 max_char = new_max;
3481 }
3482 }
3483
3484 if (unicode_size)
3485 *unicode_size = char_count;
3486 if (has_errors)
3487 *has_errors = err;
3488 return max_char;
3489}
3490
/* Store `value` at position `index` of the character buffer `buf`,
   whose element type is selected by `kind`.  Works like PyUnicode_WRITE
   but additionally understands PyUnicode_WCHAR_KIND, i.e. the wstr
   buffer of the legacy unicode representation.  Any kind other than
   the three named ones is stored as Py_UCS4.  Exactly one store is
   performed, so `index` and `value` are evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((int)(kind)) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3505
Alexander Belopolsky40018472011-02-26 01:02:56 +00003506PyObject *
3507PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508 Py_ssize_t size,
3509 const char *errors,
3510 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003514 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003515 Py_ssize_t startinpos;
3516 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003517 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003519 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 PyObject *errorHandler = NULL;
3521 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003522 Py_UCS4 maxchar = 0;
3523 Py_ssize_t unicode_size;
3524 Py_ssize_t i;
3525 int kind;
3526 void *data;
3527 int has_errors;
3528 Py_UNICODE *error_outptr;
3529#if SIZEOF_WCHAR_T == 2
3530 Py_ssize_t wchar_offset = 0;
3531#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532
Walter Dörwald69652032004-09-07 20:24:22 +00003533 if (size == 0) {
3534 if (consumed)
3535 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3539 consumed, &has_errors);
3540 if (has_errors) {
3541 unicode = _PyUnicode_New(size);
3542 if (!unicode)
3543 return NULL;
3544 kind = PyUnicode_WCHAR_KIND;
3545 data = PyUnicode_AS_UNICODE(unicode);
3546 assert(data != NULL);
3547 }
3548 else {
3549 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3550 if (!unicode)
3551 return NULL;
3552 /* When the string is ASCII only, just use memcpy and return.
3553 unicode_size may be != size if there is an incomplete UTF-8
3554 sequence at the end of the ASCII block. */
3555 if (maxchar < 128 && size == unicode_size) {
3556 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3557 return (PyObject *)unicode;
3558 }
3559 kind = PyUnicode_KIND(unicode);
3560 data = PyUnicode_DATA(unicode);
3561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003565 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566
3567 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003568 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569
3570 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003571 /* Fast path for runs of ASCII characters. Given that common UTF-8
3572 input will consist of an overwhelming majority of ASCII
3573 characters, we try to optimize for this case by checking
3574 as many characters as a C 'long' can contain.
3575 First, check if we can do an aligned read, as most CPUs have
3576 a penalty for unaligned reads.
3577 */
3578 if (!((size_t) s & LONG_PTR_MASK)) {
3579 /* Help register allocation */
3580 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003581 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003582 while (_s < aligned_end) {
3583 /* Read a whole long at a time (either 4 or 8 bytes),
3584 and do a fast unrolled copy if it only contains ASCII
3585 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003586 unsigned long value = *(unsigned long *) _s;
3587 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003588 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3590 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3591 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3592 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003593#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003594 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3595 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3596 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3597 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003598#endif
3599 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003600 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003601 }
3602 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003604 if (s == e)
3605 break;
3606 ch = (unsigned char)*s;
3607 }
3608 }
3609
3610 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 s++;
3613 continue;
3614 }
3615
3616 n = utf8_code_length[ch];
3617
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003618 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003619 if (consumed)
3620 break;
3621 else {
3622 errmsg = "unexpected end of data";
3623 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003624 endinpos = startinpos+1;
3625 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3626 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 goto utf8Error;
3628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630
3631 switch (n) {
3632
3633 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003634 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 startinpos = s-starts;
3636 endinpos = startinpos+1;
3637 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638
3639 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003640 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 startinpos = s-starts;
3642 endinpos = startinpos+1;
3643 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003646 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003647 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003649 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 goto utf8Error;
3651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003653 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003654 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 break;
3656
3657 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003658 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3659 will result in surrogates in range d800-dfff. Surrogates are
3660 not valid UTF-8 so they are rejected.
3661 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3662 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003663 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003664 (s[2] & 0xc0) != 0x80 ||
3665 ((unsigned char)s[0] == 0xE0 &&
3666 (unsigned char)s[1] < 0xA0) ||
3667 ((unsigned char)s[0] == 0xED &&
3668 (unsigned char)s[1] > 0x9F)) {
3669 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003671 endinpos = startinpos + 1;
3672
3673 /* if s[1] first two bits are 1 and 0, then the invalid
3674 continuation byte is s[2], so increment endinpos by 1,
3675 if not, s[1] is invalid and endinpos doesn't need to
3676 be incremented. */
3677 if ((s[1] & 0xC0) == 0x80)
3678 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 goto utf8Error;
3680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003682 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003684 break;
3685
3686 case 4:
3687 if ((s[1] & 0xc0) != 0x80 ||
3688 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003689 (s[3] & 0xc0) != 0x80 ||
3690 ((unsigned char)s[0] == 0xF0 &&
3691 (unsigned char)s[1] < 0x90) ||
3692 ((unsigned char)s[0] == 0xF4 &&
3693 (unsigned char)s[1] > 0x8F)) {
3694 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003696 endinpos = startinpos + 1;
3697 if ((s[1] & 0xC0) == 0x80) {
3698 endinpos++;
3699 if ((s[2] & 0xC0) == 0x80)
3700 endinpos++;
3701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 goto utf8Error;
3703 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003704 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003705 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3706 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708 /* If the string is flexible or we have native UCS-4, write
3709 directly.. */
3710 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3711 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 else {
3714 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716 /* translate from 10000..10FFFF to 0..FFFF */
3717 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 /* high surrogate = top 10 bits added to D800 */
3720 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3721 (Py_UNICODE)(0xD800 + (ch >> 10)));
3722
3723 /* low surrogate = bottom 10 bits added to DC00 */
3724 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3725 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3726 }
3727#if SIZEOF_WCHAR_T == 2
3728 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003729#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 }
3732 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003734
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 /* If this is not yet a resizable string, make it one.. */
3737 if (kind != PyUnicode_WCHAR_KIND) {
3738 const Py_UNICODE *u;
3739 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3740 if (!new_unicode)
3741 goto onError;
3742 u = PyUnicode_AsUnicode((PyObject *)unicode);
3743 if (!u)
3744 goto onError;
3745#if SIZEOF_WCHAR_T == 2
3746 i += wchar_offset;
3747#endif
3748 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3749 Py_DECREF(unicode);
3750 unicode = new_unicode;
3751 kind = 0;
3752 data = PyUnicode_AS_UNICODE(new_unicode);
3753 assert(data != NULL);
3754 }
3755 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 if (unicode_decode_call_errorhandler(
3757 errors, &errorHandler,
3758 "utf8", errmsg,
3759 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 /* Update data because unicode_decode_call_errorhandler might have
3763 re-created or resized the unicode object. */
3764 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 /* Ensure the unicode_size calculation above was correct: */
3768 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3769
Walter Dörwald69652032004-09-07 20:24:22 +00003770 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003773 /* Adjust length and ready string when it contained errors and
3774 is of the old resizable kind. */
3775 if (kind == PyUnicode_WCHAR_KIND) {
3776 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3777 PyUnicode_READY(unicode) == -1)
3778 goto onError;
3779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 if (PyUnicode_READY(unicode) == -1) {
3784 Py_DECREF(unicode);
3785 return NULL;
3786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 return (PyObject *)unicode;
3788
Benjamin Peterson29060642009-01-31 22:14:21 +00003789 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 Py_XDECREF(errorHandler);
3791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 Py_DECREF(unicode);
3793 return NULL;
3794}
3795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003797
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003798#ifdef __APPLE__
3799
3800/* Simplified UTF-8 decoder using surrogateescape error handler,
3801 used to decode the command line arguments on Mac OS X. */
3802
3803wchar_t*
3804_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3805{
3806 int n;
3807 const char *e;
3808 wchar_t *unicode, *p;
3809
3810 /* Note: size will always be longer than the resulting Unicode
3811 character count */
3812 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3813 PyErr_NoMemory();
3814 return NULL;
3815 }
3816 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3817 if (!unicode)
3818 return NULL;
3819
3820 /* Unpack UTF-8 encoded data */
3821 p = unicode;
3822 e = s + size;
3823 while (s < e) {
3824 Py_UCS4 ch = (unsigned char)*s;
3825
3826 if (ch < 0x80) {
3827 *p++ = (wchar_t)ch;
3828 s++;
3829 continue;
3830 }
3831
3832 n = utf8_code_length[ch];
3833 if (s + n > e) {
3834 goto surrogateescape;
3835 }
3836
3837 switch (n) {
3838 case 0:
3839 case 1:
3840 goto surrogateescape;
3841
3842 case 2:
3843 if ((s[1] & 0xc0) != 0x80)
3844 goto surrogateescape;
3845 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3846 assert ((ch > 0x007F) && (ch <= 0x07FF));
3847 *p++ = (wchar_t)ch;
3848 break;
3849
3850 case 3:
3851 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3852 will result in surrogates in range d800-dfff. Surrogates are
3853 not valid UTF-8 so they are rejected.
3854 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3855 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3856 if ((s[1] & 0xc0) != 0x80 ||
3857 (s[2] & 0xc0) != 0x80 ||
3858 ((unsigned char)s[0] == 0xE0 &&
3859 (unsigned char)s[1] < 0xA0) ||
3860 ((unsigned char)s[0] == 0xED &&
3861 (unsigned char)s[1] > 0x9F)) {
3862
3863 goto surrogateescape;
3864 }
3865 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3866 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003868 break;
3869
3870 case 4:
3871 if ((s[1] & 0xc0) != 0x80 ||
3872 (s[2] & 0xc0) != 0x80 ||
3873 (s[3] & 0xc0) != 0x80 ||
3874 ((unsigned char)s[0] == 0xF0 &&
3875 (unsigned char)s[1] < 0x90) ||
3876 ((unsigned char)s[0] == 0xF4 &&
3877 (unsigned char)s[1] > 0x8F)) {
3878 goto surrogateescape;
3879 }
3880 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3881 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3882 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3883
3884#if SIZEOF_WCHAR_T == 4
3885 *p++ = (wchar_t)ch;
3886#else
3887 /* compute and append the two surrogates: */
3888
3889 /* translate from 10000..10FFFF to 0..FFFF */
3890 ch -= 0x10000;
3891
3892 /* high surrogate = top 10 bits added to D800 */
3893 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3894
3895 /* low surrogate = bottom 10 bits added to DC00 */
3896 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3897#endif
3898 break;
3899 }
3900 s += n;
3901 continue;
3902
3903 surrogateescape:
3904 *p++ = 0xDC00 + ch;
3905 s++;
3906 }
3907 *p = L'\0';
3908 return unicode;
3909}
3910
3911#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913/* Primary internal function which creates utf8 encoded bytes objects.
3914
3915 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003916 and allocate exactly as much space needed at the end. Else allocate the
3917 maximum possible needed (4 result bytes per Unicode character), and return
3918 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003919*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003920PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922{
Tim Peters602f7402002-04-27 18:03:26 +00003923#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003924
Guido van Rossum98297ee2007-11-06 21:34:58 +00003925 Py_ssize_t i; /* index into s of next input byte */
3926 PyObject *result; /* result string object */
3927 char *p; /* next free byte in output buffer */
3928 Py_ssize_t nallocated; /* number of result bytes allocated */
3929 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003930 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003931 PyObject *errorHandler = NULL;
3932 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 int kind;
3934 void *data;
3935 Py_ssize_t size;
3936 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3937#if SIZEOF_WCHAR_T == 2
3938 Py_ssize_t wchar_offset = 0;
3939#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 if (!PyUnicode_Check(unicode)) {
3942 PyErr_BadArgument();
3943 return NULL;
3944 }
3945
3946 if (PyUnicode_READY(unicode) == -1)
3947 return NULL;
3948
3949 if (_PyUnicode_UTF8(unicode))
3950 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3951 _PyUnicode_UTF8_LENGTH(unicode));
3952
3953 kind = PyUnicode_KIND(unicode);
3954 data = PyUnicode_DATA(unicode);
3955 size = PyUnicode_GET_LENGTH(unicode);
3956
Tim Peters602f7402002-04-27 18:03:26 +00003957 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958
Tim Peters602f7402002-04-27 18:03:26 +00003959 if (size <= MAX_SHORT_UNICHARS) {
3960 /* Write into the stack buffer; nallocated can't overflow.
3961 * At the end, we'll allocate exactly as much heap space as it
3962 * turns out we need.
3963 */
3964 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003965 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003966 p = stackbuf;
3967 }
3968 else {
3969 /* Overallocate on the heap, and give the excess back at the end. */
3970 nallocated = size * 4;
3971 if (nallocated / 4 != size) /* overflow! */
3972 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003973 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003974 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003975 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003976 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003977 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003978
Tim Peters602f7402002-04-27 18:03:26 +00003979 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003981
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003982 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003983 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003985
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003987 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003988 *p++ = (char)(0xc0 | (ch >> 6));
3989 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003990 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 Py_ssize_t newpos;
3992 PyObject *rep;
3993 Py_ssize_t repsize, k, startpos;
3994 startpos = i-1;
3995#if SIZEOF_WCHAR_T == 2
3996 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003997#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 rep = unicode_encode_call_errorhandler(
3999 errors, &errorHandler, "utf-8", "surrogates not allowed",
4000 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4001 &exc, startpos, startpos+1, &newpos);
4002 if (!rep)
4003 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 if (PyBytes_Check(rep))
4006 repsize = PyBytes_GET_SIZE(rep);
4007 else
4008 repsize = PyUnicode_GET_SIZE(rep);
4009
4010 if (repsize > 4) {
4011 Py_ssize_t offset;
4012
4013 if (result == NULL)
4014 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004015 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4019 /* integer overflow */
4020 PyErr_NoMemory();
4021 goto error;
4022 }
4023 nallocated += repsize - 4;
4024 if (result != NULL) {
4025 if (_PyBytes_Resize(&result, nallocated) < 0)
4026 goto error;
4027 } else {
4028 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004029 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 goto error;
4031 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4032 }
4033 p = PyBytes_AS_STRING(result) + offset;
4034 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 if (PyBytes_Check(rep)) {
4037 char *prep = PyBytes_AS_STRING(rep);
4038 for(k = repsize; k > 0; k--)
4039 *p++ = *prep++;
4040 } else /* rep is unicode */ {
4041 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4042 Py_UNICODE c;
4043
4044 for(k=0; k<repsize; k++) {
4045 c = prep[k];
4046 if (0x80 <= c) {
4047 raise_encode_exception(&exc, "utf-8",
4048 PyUnicode_AS_UNICODE(unicode),
4049 size, i-1, i,
4050 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004051 goto error;
4052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004054 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004057 } else if (ch < 0x10000) {
4058 *p++ = (char)(0xe0 | (ch >> 12));
4059 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4060 *p++ = (char)(0x80 | (ch & 0x3f));
4061 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004062 /* Encode UCS4 Unicode ordinals */
4063 *p++ = (char)(0xf0 | (ch >> 18));
4064 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4065 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4066 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067#if SIZEOF_WCHAR_T == 2
4068 wchar_offset++;
4069#endif
Tim Peters602f7402002-04-27 18:03:26 +00004070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004072
Guido van Rossum98297ee2007-11-06 21:34:58 +00004073 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004074 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004075 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004076 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004077 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004078 }
4079 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004080 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004081 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004082 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004083 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004086 Py_XDECREF(errorHandler);
4087 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004088 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004089 error:
4090 Py_XDECREF(errorHandler);
4091 Py_XDECREF(exc);
4092 Py_XDECREF(result);
4093 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004094
Tim Peters602f7402002-04-27 18:03:26 +00004095#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096}
4097
Alexander Belopolsky40018472011-02-26 01:02:56 +00004098PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4100 Py_ssize_t size,
4101 const char *errors)
4102{
4103 PyObject *v, *unicode;
4104
4105 unicode = PyUnicode_FromUnicode(s, size);
4106 if (unicode == NULL)
4107 return NULL;
4108 v = _PyUnicode_AsUTF8String(unicode, errors);
4109 Py_DECREF(unicode);
4110 return v;
4111}
4112
4113PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004114PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117}
4118
Walter Dörwald41980ca2007-08-16 21:55:45 +00004119/* --- UTF-32 Codec ------------------------------------------------------- */
4120
4121PyObject *
4122PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 Py_ssize_t size,
4124 const char *errors,
4125 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004126{
4127 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4128}
4129
4130PyObject *
4131PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 Py_ssize_t size,
4133 const char *errors,
4134 int *byteorder,
4135 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004136{
4137 const char *starts = s;
4138 Py_ssize_t startinpos;
4139 Py_ssize_t endinpos;
4140 Py_ssize_t outpos;
4141 PyUnicodeObject *unicode;
4142 Py_UNICODE *p;
4143#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004144 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004145 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004146#else
4147 const int pairs = 0;
4148#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004149 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004150 int bo = 0; /* assume native ordering by default */
4151 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004152 /* Offsets from q for retrieving bytes in the right order. */
4153#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4154 int iorder[] = {0, 1, 2, 3};
4155#else
4156 int iorder[] = {3, 2, 1, 0};
4157#endif
4158 PyObject *errorHandler = NULL;
4159 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004160
Walter Dörwald41980ca2007-08-16 21:55:45 +00004161 q = (unsigned char *)s;
4162 e = q + size;
4163
4164 if (byteorder)
4165 bo = *byteorder;
4166
4167 /* Check for BOM marks (U+FEFF) in the input and adjust current
4168 byte order setting accordingly. In native mode, the leading BOM
4169 mark is skipped, in all other modes, it is copied to the output
4170 stream as-is (giving a ZWNBSP character). */
4171 if (bo == 0) {
4172 if (size >= 4) {
4173 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004175#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 if (bom == 0x0000FEFF) {
4177 q += 4;
4178 bo = -1;
4179 }
4180 else if (bom == 0xFFFE0000) {
4181 q += 4;
4182 bo = 1;
4183 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004184#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 if (bom == 0x0000FEFF) {
4186 q += 4;
4187 bo = 1;
4188 }
4189 else if (bom == 0xFFFE0000) {
4190 q += 4;
4191 bo = -1;
4192 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004193#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004195 }
4196
4197 if (bo == -1) {
4198 /* force LE */
4199 iorder[0] = 0;
4200 iorder[1] = 1;
4201 iorder[2] = 2;
4202 iorder[3] = 3;
4203 }
4204 else if (bo == 1) {
4205 /* force BE */
4206 iorder[0] = 3;
4207 iorder[1] = 2;
4208 iorder[2] = 1;
4209 iorder[3] = 0;
4210 }
4211
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004212 /* On narrow builds we split characters outside the BMP into two
4213 codepoints => count how much extra space we need. */
4214#ifndef Py_UNICODE_WIDE
4215 for (qq = q; qq < e; qq += 4)
4216 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4217 pairs++;
4218#endif
4219
4220 /* This might be one to much, because of a BOM */
4221 unicode = _PyUnicode_New((size+3)/4+pairs);
4222 if (!unicode)
4223 return NULL;
4224 if (size == 0)
4225 return (PyObject *)unicode;
4226
4227 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004229
Walter Dörwald41980ca2007-08-16 21:55:45 +00004230 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 Py_UCS4 ch;
4232 /* remaining bytes at the end? (size should be divisible by 4) */
4233 if (e-q<4) {
4234 if (consumed)
4235 break;
4236 errmsg = "truncated data";
4237 startinpos = ((const char *)q)-starts;
4238 endinpos = ((const char *)e)-starts;
4239 goto utf32Error;
4240 /* The remaining input chars are ignored if the callback
4241 chooses to skip the input */
4242 }
4243 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4244 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004245
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 if (ch >= 0x110000)
4247 {
4248 errmsg = "codepoint not in range(0x110000)";
4249 startinpos = ((const char *)q)-starts;
4250 endinpos = startinpos+4;
4251 goto utf32Error;
4252 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004253#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 if (ch >= 0x10000)
4255 {
4256 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4257 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4258 }
4259 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004260#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 *p++ = ch;
4262 q += 4;
4263 continue;
4264 utf32Error:
4265 outpos = p-PyUnicode_AS_UNICODE(unicode);
4266 if (unicode_decode_call_errorhandler(
4267 errors, &errorHandler,
4268 "utf32", errmsg,
4269 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4270 &unicode, &outpos, &p))
4271 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004272 }
4273
4274 if (byteorder)
4275 *byteorder = bo;
4276
4277 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004279
4280 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004282 goto onError;
4283
4284 Py_XDECREF(errorHandler);
4285 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 if (PyUnicode_READY(unicode) == -1) {
4287 Py_DECREF(unicode);
4288 return NULL;
4289 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004290 return (PyObject *)unicode;
4291
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004293 Py_DECREF(unicode);
4294 Py_XDECREF(errorHandler);
4295 Py_XDECREF(exc);
4296 return NULL;
4297}
4298
4299PyObject *
4300PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 Py_ssize_t size,
4302 const char *errors,
4303 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004305 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004307 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004308#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004309 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310#else
4311 const int pairs = 0;
4312#endif
4313 /* Offsets from p for storing byte pairs in the right order. */
4314#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4315 int iorder[] = {0, 1, 2, 3};
4316#else
4317 int iorder[] = {3, 2, 1, 0};
4318#endif
4319
Benjamin Peterson29060642009-01-31 22:14:21 +00004320#define STORECHAR(CH) \
4321 do { \
4322 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4323 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4324 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4325 p[iorder[0]] = (CH) & 0xff; \
4326 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004327 } while(0)
4328
4329 /* In narrow builds we can output surrogate pairs as one codepoint,
4330 so we need less space. */
4331#ifndef Py_UNICODE_WIDE
4332 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4334 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4335 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004336#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004337 nsize = (size - pairs + (byteorder == 0));
4338 bytesize = nsize * 4;
4339 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004341 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004342 if (v == NULL)
4343 return NULL;
4344
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004345 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004346 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004348 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004349 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004350
4351 if (byteorder == -1) {
4352 /* force LE */
4353 iorder[0] = 0;
4354 iorder[1] = 1;
4355 iorder[2] = 2;
4356 iorder[3] = 3;
4357 }
4358 else if (byteorder == 1) {
4359 /* force BE */
4360 iorder[0] = 3;
4361 iorder[1] = 2;
4362 iorder[2] = 1;
4363 iorder[3] = 0;
4364 }
4365
4366 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004368#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4370 Py_UCS4 ch2 = *s;
4371 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4372 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4373 s++;
4374 size--;
4375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004376 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004377#endif
4378 STORECHAR(ch);
4379 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004380
4381 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004382 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004383#undef STORECHAR
4384}
4385
Alexander Belopolsky40018472011-02-26 01:02:56 +00004386PyObject *
4387PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004388{
4389 if (!PyUnicode_Check(unicode)) {
4390 PyErr_BadArgument();
4391 return NULL;
4392 }
4393 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 PyUnicode_GET_SIZE(unicode),
4395 NULL,
4396 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004397}
4398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399/* --- UTF-16 Codec ------------------------------------------------------- */
4400
Tim Peters772747b2001-08-09 22:21:55 +00004401PyObject *
4402PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 Py_ssize_t size,
4404 const char *errors,
4405 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406{
Walter Dörwald69652032004-09-07 20:24:22 +00004407 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4408}
4409
Antoine Pitrouab868312009-01-10 15:40:25 +00004410/* Two masks for fast checking of whether a C 'long' may contain
4411 UTF16-encoded surrogate characters. This is an efficient heuristic,
4412 assuming that non-surrogate characters with a code point >= 0x8000 are
4413 rare in most input.
4414 FAST_CHAR_MASK is used when the input is in native byte ordering,
4415 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004416*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004417#if (SIZEOF_LONG == 8)
4418# define FAST_CHAR_MASK 0x8000800080008000L
4419# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4420#elif (SIZEOF_LONG == 4)
4421# define FAST_CHAR_MASK 0x80008000L
4422# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4423#else
4424# error C 'long' size should be either 4 or 8!
4425#endif
4426
Walter Dörwald69652032004-09-07 20:24:22 +00004427PyObject *
4428PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 Py_ssize_t size,
4430 const char *errors,
4431 int *byteorder,
4432 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004435 Py_ssize_t startinpos;
4436 Py_ssize_t endinpos;
4437 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 PyUnicodeObject *unicode;
4439 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004440 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004441 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004442 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004443 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004444 /* Offsets from q for retrieving byte pairs in the right order. */
4445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4446 int ihi = 1, ilo = 0;
4447#else
4448 int ihi = 0, ilo = 1;
4449#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 PyObject *errorHandler = NULL;
4451 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
4453 /* Note: size will always be longer than the resulting Unicode
4454 character count */
4455 unicode = _PyUnicode_New(size);
4456 if (!unicode)
4457 return NULL;
4458 if (size == 0)
4459 return (PyObject *)unicode;
4460
4461 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004463 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465
4466 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004467 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004469 /* Check for BOM marks (U+FEFF) in the input and adjust current
4470 byte order setting accordingly. In native mode, the leading BOM
4471 mark is skipped, in all other modes, it is copied to the output
4472 stream as-is (giving a ZWNBSP character). */
4473 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004474 if (size >= 2) {
4475 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 if (bom == 0xFEFF) {
4478 q += 2;
4479 bo = -1;
4480 }
4481 else if (bom == 0xFFFE) {
4482 q += 2;
4483 bo = 1;
4484 }
Tim Petersced69f82003-09-16 20:30:58 +00004485#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 if (bom == 0xFEFF) {
4487 q += 2;
4488 bo = 1;
4489 }
4490 else if (bom == 0xFFFE) {
4491 q += 2;
4492 bo = -1;
4493 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004494#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497
Tim Peters772747b2001-08-09 22:21:55 +00004498 if (bo == -1) {
4499 /* force LE */
4500 ihi = 1;
4501 ilo = 0;
4502 }
4503 else if (bo == 1) {
4504 /* force BE */
4505 ihi = 0;
4506 ilo = 1;
4507 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004508#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4509 native_ordering = ilo < ihi;
4510#else
4511 native_ordering = ilo > ihi;
4512#endif
Tim Peters772747b2001-08-09 22:21:55 +00004513
Antoine Pitrouab868312009-01-10 15:40:25 +00004514 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004515 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004517 /* First check for possible aligned read of a C 'long'. Unaligned
4518 reads are more expensive, better to defer to another iteration. */
4519 if (!((size_t) q & LONG_PTR_MASK)) {
4520 /* Fast path for runs of non-surrogate chars. */
4521 register const unsigned char *_q = q;
4522 Py_UNICODE *_p = p;
4523 if (native_ordering) {
4524 /* Native ordering is simple: as long as the input cannot
4525 possibly contain a surrogate char, do an unrolled copy
4526 of several 16-bit code points to the target object.
4527 The non-surrogate check is done on several input bytes
4528 at a time (as many as a C 'long' can contain). */
4529 while (_q < aligned_end) {
4530 unsigned long data = * (unsigned long *) _q;
4531 if (data & FAST_CHAR_MASK)
4532 break;
4533 _p[0] = ((unsigned short *) _q)[0];
4534 _p[1] = ((unsigned short *) _q)[1];
4535#if (SIZEOF_LONG == 8)
4536 _p[2] = ((unsigned short *) _q)[2];
4537 _p[3] = ((unsigned short *) _q)[3];
4538#endif
4539 _q += SIZEOF_LONG;
4540 _p += SIZEOF_LONG / 2;
4541 }
4542 }
4543 else {
4544 /* Byteswapped ordering is similar, but we must decompose
4545 the copy bytewise, and take care of zero'ing out the
4546 upper bytes if the target object is in 32-bit units
4547 (that is, in UCS-4 builds). */
4548 while (_q < aligned_end) {
4549 unsigned long data = * (unsigned long *) _q;
4550 if (data & SWAPPED_FAST_CHAR_MASK)
4551 break;
4552 /* Zero upper bytes in UCS-4 builds */
4553#if (Py_UNICODE_SIZE > 2)
4554 _p[0] = 0;
4555 _p[1] = 0;
4556#if (SIZEOF_LONG == 8)
4557 _p[2] = 0;
4558 _p[3] = 0;
4559#endif
4560#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004561 /* Issue #4916; UCS-4 builds on big endian machines must
4562 fill the two last bytes of each 4-byte unit. */
4563#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4564# define OFF 2
4565#else
4566# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004567#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004568 ((unsigned char *) _p)[OFF + 1] = _q[0];
4569 ((unsigned char *) _p)[OFF + 0] = _q[1];
4570 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4571 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4572#if (SIZEOF_LONG == 8)
4573 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4574 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4575 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4576 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4577#endif
4578#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004579 _q += SIZEOF_LONG;
4580 _p += SIZEOF_LONG / 2;
4581 }
4582 }
4583 p = _p;
4584 q = _q;
4585 if (q >= e)
4586 break;
4587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589
Benjamin Peterson14339b62009-01-31 16:36:08 +00004590 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004591
4592 if (ch < 0xD800 || ch > 0xDFFF) {
4593 *p++ = ch;
4594 continue;
4595 }
4596
4597 /* UTF-16 code pair: */
4598 if (q > e) {
4599 errmsg = "unexpected end of data";
4600 startinpos = (((const char *)q) - 2) - starts;
4601 endinpos = ((const char *)e) + 1 - starts;
4602 goto utf16Error;
4603 }
4604 if (0xD800 <= ch && ch <= 0xDBFF) {
4605 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4606 q += 2;
4607 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004608#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 *p++ = ch;
4610 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004611#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004613#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 continue;
4615 }
4616 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004617 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 startinpos = (((const char *)q)-4)-starts;
4619 endinpos = startinpos+2;
4620 goto utf16Error;
4621 }
4622
Benjamin Peterson14339b62009-01-31 16:36:08 +00004623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 errmsg = "illegal encoding";
4625 startinpos = (((const char *)q)-2)-starts;
4626 endinpos = startinpos+2;
4627 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004628
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 utf16Error:
4630 outpos = p - PyUnicode_AS_UNICODE(unicode);
4631 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004632 errors,
4633 &errorHandler,
4634 "utf16", errmsg,
4635 &starts,
4636 (const char **)&e,
4637 &startinpos,
4638 &endinpos,
4639 &exc,
4640 (const char **)&q,
4641 &unicode,
4642 &outpos,
4643 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004646 /* remaining byte at the end? (size should be even) */
4647 if (e == q) {
4648 if (!consumed) {
4649 errmsg = "truncated data";
4650 startinpos = ((const char *)q) - starts;
4651 endinpos = ((const char *)e) + 1 - starts;
4652 outpos = p - PyUnicode_AS_UNICODE(unicode);
4653 if (unicode_decode_call_errorhandler(
4654 errors,
4655 &errorHandler,
4656 "utf16", errmsg,
4657 &starts,
4658 (const char **)&e,
4659 &startinpos,
4660 &endinpos,
4661 &exc,
4662 (const char **)&q,
4663 &unicode,
4664 &outpos,
4665 &p))
4666 goto onError;
4667 /* The remaining input chars are ignored if the callback
4668 chooses to skip the input */
4669 }
4670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671
4672 if (byteorder)
4673 *byteorder = bo;
4674
Walter Dörwald69652032004-09-07 20:24:22 +00004675 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004679 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 goto onError;
4681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 Py_XDECREF(errorHandler);
4683 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004684 if (PyUnicode_READY(unicode) == -1) {
4685 Py_DECREF(unicode);
4686 return NULL;
4687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 return (PyObject *)unicode;
4689
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 Py_XDECREF(errorHandler);
4693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 return NULL;
4695}
4696
Antoine Pitrouab868312009-01-10 15:40:25 +00004697#undef FAST_CHAR_MASK
4698#undef SWAPPED_FAST_CHAR_MASK
4699
Tim Peters772747b2001-08-09 22:21:55 +00004700PyObject *
4701PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 Py_ssize_t size,
4703 const char *errors,
4704 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004706 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004707 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004708 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004709#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004710 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004711#else
4712 const int pairs = 0;
4713#endif
Tim Peters772747b2001-08-09 22:21:55 +00004714 /* Offsets from p for storing byte pairs in the right order. */
4715#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4716 int ihi = 1, ilo = 0;
4717#else
4718 int ihi = 0, ilo = 1;
4719#endif
4720
Benjamin Peterson29060642009-01-31 22:14:21 +00004721#define STORECHAR(CH) \
4722 do { \
4723 p[ihi] = ((CH) >> 8) & 0xff; \
4724 p[ilo] = (CH) & 0xff; \
4725 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004726 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004728#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004729 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 if (s[i] >= 0x10000)
4731 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004732#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004733 /* 2 * (size + pairs + (byteorder == 0)) */
4734 if (size > PY_SSIZE_T_MAX ||
4735 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004737 nsize = size + pairs + (byteorder == 0);
4738 bytesize = nsize * 2;
4739 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004741 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 if (v == NULL)
4743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004745 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004748 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004749 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004750
4751 if (byteorder == -1) {
4752 /* force LE */
4753 ihi = 1;
4754 ilo = 0;
4755 }
4756 else if (byteorder == 1) {
4757 /* force BE */
4758 ihi = 0;
4759 ilo = 1;
4760 }
4761
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004762 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 Py_UNICODE ch = *s++;
4764 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004765#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 if (ch >= 0x10000) {
4767 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4768 ch = 0xD800 | ((ch-0x10000) >> 10);
4769 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004770#endif
Tim Peters772747b2001-08-09 22:21:55 +00004771 STORECHAR(ch);
4772 if (ch2)
4773 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004774 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004775
4776 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004777 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004778#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779}
4780
Alexander Belopolsky40018472011-02-26 01:02:56 +00004781PyObject *
4782PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
4784 if (!PyUnicode_Check(unicode)) {
4785 PyErr_BadArgument();
4786 return NULL;
4787 }
4788 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 PyUnicode_GET_SIZE(unicode),
4790 NULL,
4791 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792}
4793
4794/* --- Unicode Escape Codec ----------------------------------------------- */
4795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4797 if all the escapes in the string make it still a valid ASCII string.
4798 Returns -1 if any escapes were found which cause the string to
4799 pop out of ASCII range. Otherwise returns the length of the
4800 required buffer to hold the string.
4801 */
4802Py_ssize_t
4803length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4804{
4805 const unsigned char *p = (const unsigned char *)s;
4806 const unsigned char *end = p + size;
4807 Py_ssize_t length = 0;
4808
4809 if (size < 0)
4810 return -1;
4811
4812 for (; p < end; ++p) {
4813 if (*p > 127) {
4814 /* Non-ASCII */
4815 return -1;
4816 }
4817 else if (*p != '\\') {
4818 /* Normal character */
4819 ++length;
4820 }
4821 else {
4822 /* Backslash-escape, check next char */
4823 ++p;
4824 /* Escape sequence reaches till end of string or
4825 non-ASCII follow-up. */
4826 if (p >= end || *p > 127)
4827 return -1;
4828 switch (*p) {
4829 case '\n':
4830 /* backslash + \n result in zero characters */
4831 break;
4832 case '\\': case '\'': case '\"':
4833 case 'b': case 'f': case 't':
4834 case 'n': case 'r': case 'v': case 'a':
4835 ++length;
4836 break;
4837 case '0': case '1': case '2': case '3':
4838 case '4': case '5': case '6': case '7':
4839 case 'x': case 'u': case 'U': case 'N':
4840 /* these do not guarantee ASCII characters */
4841 return -1;
4842 default:
4843 /* count the backslash + the other character */
4844 length += 2;
4845 }
4846 }
4847 }
4848 return length;
4849}
4850
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When kind is PyUnicode_WCHAR_KIND, buf is treated as a Py_UNICODE
   array; for any other kind it is treated as an array of bytes and
   value is truncated to unsigned char.  index is evaluated exactly
   once, so passing i++ is safe. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value into the Py_UNICODE array buf.  Relies on a variable
   named `kind` being in scope at the expansion site and asserts that it
   is PyUnicode_WCHAR_KIND.  Wrapped in do/while(0) like its sibling so
   it always expands to exactly one statement. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
4864
4865
Fredrik Lundh06d12682001-01-24 07:59:11 +00004866static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004867
Alexander Belopolsky40018472011-02-26 01:02:56 +00004868PyObject *
4869PyUnicode_DecodeUnicodeEscape(const char *s,
4870 Py_ssize_t size,
4871 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004873 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004874 Py_ssize_t startinpos;
4875 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004878 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004880 char* message;
4881 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004882 PyObject *errorHandler = NULL;
4883 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884 Py_ssize_t ascii_length;
4885 Py_ssize_t i;
4886 int kind;
4887 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004889 ascii_length = length_of_escaped_ascii_string(s, size);
4890
4891 /* After length_of_escaped_ascii_string() there are two alternatives,
4892 either the string is pure ASCII with named escapes like \n, etc.
4893 and we determined it's exact size (common case)
4894 or it contains \x, \u, ... escape sequences. then we create a
4895 legacy wchar string and resize it at the end of this function. */
4896 if (ascii_length >= 0) {
4897 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4898 if (!v)
4899 goto onError;
4900 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4901 kind = PyUnicode_1BYTE_KIND;
4902 data = PyUnicode_DATA(v);
4903 }
4904 else {
4905 /* Escaped strings will always be longer than the resulting
4906 Unicode string, so we start with size here and then reduce the
4907 length after conversion to the true value.
4908 (but if the error callback returns a long replacement string
4909 we'll have to allocate more space) */
4910 v = _PyUnicode_New(size);
4911 if (!v)
4912 goto onError;
4913 kind = PyUnicode_WCHAR_KIND;
4914 data = PyUnicode_AS_UNICODE(v);
4915 }
4916
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 if (size == 0)
4918 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004919 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004921
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 while (s < end) {
4923 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004924 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 if (kind == PyUnicode_WCHAR_KIND) {
4928 assert(i < _PyUnicode_WSTR_LENGTH(v));
4929 }
4930 else {
4931 /* The only case in which i == ascii_length is a backslash
4932 followed by a newline. */
4933 assert(i <= ascii_length);
4934 }
4935
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 /* Non-escape characters are interpreted as Unicode ordinals */
4937 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004938 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 continue;
4940 }
4941
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 /* \ - Escapes */
4944 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004945 c = *s++;
4946 if (s > end)
4947 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004948
4949 if (kind == PyUnicode_WCHAR_KIND) {
4950 assert(i < _PyUnicode_WSTR_LENGTH(v));
4951 }
4952 else {
4953 /* The only case in which i == ascii_length is a backslash
4954 followed by a newline. */
4955 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4956 }
4957
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004958 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004962 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4963 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4964 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4965 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4966 /* FF */
4967 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4968 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4969 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4970 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4971 /* VT */
4972 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4973 /* BEL, not classic C */
4974 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 case '0': case '1': case '2': case '3':
4978 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004979 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004980 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004981 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004982 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004983 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004985 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 break;
4987
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 /* hex escapes */
4989 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004991 digits = 2;
4992 message = "truncated \\xXX escape";
4993 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004997 digits = 4;
4998 message = "truncated \\uXXXX escape";
4999 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005002 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005003 digits = 8;
5004 message = "truncated \\UXXXXXXXX escape";
5005 hexescape:
5006 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 if (s+digits>end) {
5009 endinpos = size;
5010 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 errors, &errorHandler,
5012 "unicodeescape", "end of string in escape sequence",
5013 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005016 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 goto nextByte;
5018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005019 for (j = 0; j < digits; ++j) {
5020 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005021 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005022 endinpos = (s+j+1)-starts;
5023 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 errors, &errorHandler,
5026 "unicodeescape", message,
5027 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005029 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005032 }
5033 chr = (chr<<4) & ~0xF;
5034 if (c >= '0' && c <= '9')
5035 chr += c - '0';
5036 else if (c >= 'a' && c <= 'f')
5037 chr += 10 + c - 'a';
5038 else
5039 chr += 10 + c - 'A';
5040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005041 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005042 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 /* _decoding_error will have already written into the
5044 target buffer. */
5045 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005046 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005047 /* when we get here, chr is a 32-bit unicode character */
5048 if (chr <= 0xffff)
5049 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005051 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005052 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005053 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005054#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005056#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005057 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005058 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5059 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005060#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005061 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005062 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005063 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005064 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 errors, &errorHandler,
5066 "unicodeescape", "illegal Unicode character",
5067 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005068 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005069 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005070 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005071 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005072 break;
5073
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005075 case 'N':
5076 message = "malformed \\N character escape";
5077 if (ucnhash_CAPI == NULL) {
5078 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5080 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005081 if (ucnhash_CAPI == NULL)
5082 goto ucnhashError;
5083 }
5084 if (*s == '{') {
5085 const char *start = s+1;
5086 /* look for the closing brace */
5087 while (*s != '}' && s < end)
5088 s++;
5089 if (s > start && s < end && *s == '}') {
5090 /* found a name. look it up in the unicode database */
5091 message = "unknown Unicode character name";
5092 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005093 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5094 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005095 goto store;
5096 }
5097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005099 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 errors, &errorHandler,
5102 "unicodeescape", message,
5103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005104 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005105 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005106 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005107 break;
5108
5109 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005110 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 message = "\\ at end of string";
5113 s--;
5114 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 errors, &errorHandler,
5118 "unicodeescape", message,
5119 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005121 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005122 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005123 }
5124 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005125 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5126 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005127 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005128 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005133 /* Ensure the length prediction worked in case of ASCII strings */
5134 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5135
5136 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5137 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005139 Py_XDECREF(errorHandler);
5140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005142
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005144 PyErr_SetString(
5145 PyExc_UnicodeError,
5146 "\\N escapes not supported (can't load unicodedata module)"
5147 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005148 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 Py_XDECREF(errorHandler);
5150 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005151 return NULL;
5152
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155 Py_XDECREF(errorHandler);
5156 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return NULL;
5158}
5159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005160#undef WRITE_ASCII_OR_WSTR
5161#undef WRITE_WSTR
5162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163/* Return a Unicode-Escape string version of the Unicode object.
5164
5165 If quotes is true, the string is enclosed in u"" or u'' quotes as
5166 appropriate.
5167
5168*/
5169
Walter Dörwald79e913e2007-05-12 11:08:06 +00005170static const char *hexdigits = "0123456789abcdef";
5171
Alexander Belopolsky40018472011-02-26 01:02:56 +00005172PyObject *
5173PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5174 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005176 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005179#ifdef Py_UNICODE_WIDE
5180 const Py_ssize_t expandsize = 10;
5181#else
5182 const Py_ssize_t expandsize = 6;
5183#endif
5184
Thomas Wouters89f507f2006-12-13 04:49:30 +00005185 /* XXX(nnorwitz): rather than over-allocating, it would be
5186 better to choose a different scheme. Perhaps scan the
5187 first N-chars of the string and allocate based on that size.
5188 */
5189 /* Initial allocation is based on the longest-possible unichr
5190 escape.
5191
5192 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5193 unichr, so in this case it's the longest unichr escape. In
5194 narrow (UTF-16) builds this is five chars per source unichr
5195 since there are two unichrs in the surrogate pair, so in narrow
5196 (UTF-16) builds it's not the longest unichr escape.
5197
5198 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5199 so in the narrow (UTF-16) build case it's the longest unichr
5200 escape.
5201 */
5202
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005203 if (size == 0)
5204 return PyBytes_FromStringAndSize(NULL, 0);
5205
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005206 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005209 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 2
5211 + expandsize*size
5212 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 if (repr == NULL)
5214 return NULL;
5215
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005216 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 while (size-- > 0) {
5219 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005220
Walter Dörwald79e913e2007-05-12 11:08:06 +00005221 /* Escape backslashes */
5222 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 *p++ = '\\';
5224 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005225 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005226 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005227
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005228#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005229 /* Map 21-bit characters to '\U00xxxxxx' */
5230 else if (ch >= 0x10000) {
5231 *p++ = '\\';
5232 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005233 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5234 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5235 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5236 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5237 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5238 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5239 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5240 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005242 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005243#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5245 else if (ch >= 0xD800 && ch < 0xDC00) {
5246 Py_UNICODE ch2;
5247 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005248
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 ch2 = *s++;
5250 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005251 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5253 *p++ = '\\';
5254 *p++ = 'U';
5255 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5256 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5257 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5258 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5259 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5260 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5261 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5262 *p++ = hexdigits[ucs & 0x0000000F];
5263 continue;
5264 }
5265 /* Fall through: isolated surrogates are copied as-is */
5266 s--;
5267 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005269#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005272 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 *p++ = '\\';
5274 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005275 *p++ = hexdigits[(ch >> 12) & 0x000F];
5276 *p++ = hexdigits[(ch >> 8) & 0x000F];
5277 *p++ = hexdigits[(ch >> 4) & 0x000F];
5278 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005280
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005281 /* Map special whitespace to '\t', \n', '\r' */
5282 else if (ch == '\t') {
5283 *p++ = '\\';
5284 *p++ = 't';
5285 }
5286 else if (ch == '\n') {
5287 *p++ = '\\';
5288 *p++ = 'n';
5289 }
5290 else if (ch == '\r') {
5291 *p++ = '\\';
5292 *p++ = 'r';
5293 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005294
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005295 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005296 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005298 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005299 *p++ = hexdigits[(ch >> 4) & 0x000F];
5300 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005301 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005302
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 /* Copy everything else as-is */
5304 else
5305 *p++ = (char) ch;
5306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005308 assert(p - PyBytes_AS_STRING(repr) > 0);
5309 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5310 return NULL;
5311 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312}
5313
Alexander Belopolsky40018472011-02-26 01:02:56 +00005314PyObject *
5315PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005317 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 if (!PyUnicode_Check(unicode)) {
5319 PyErr_BadArgument();
5320 return NULL;
5321 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005322 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5323 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005324 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325}
5326
5327/* --- Raw Unicode Escape Codec ------------------------------------------- */
5328
Alexander Belopolsky40018472011-02-26 01:02:56 +00005329PyObject *
5330PyUnicode_DecodeRawUnicodeEscape(const char *s,
5331 Py_ssize_t size,
5332 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 Py_ssize_t startinpos;
5336 Py_ssize_t endinpos;
5337 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005339 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 const char *end;
5341 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 PyObject *errorHandler = NULL;
5343 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 /* Escaped strings will always be longer than the resulting
5346 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 length after conversion to the true value. (But decoding error
5348 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 v = _PyUnicode_New(size);
5350 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 end = s + size;
5356 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 unsigned char c;
5358 Py_UCS4 x;
5359 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005360 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 /* Non-escape characters are interpreted as Unicode ordinals */
5363 if (*s != '\\') {
5364 *p++ = (unsigned char)*s++;
5365 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 startinpos = s-starts;
5368
5369 /* \u-escapes are only interpreted iff the number of leading
5370 backslashes if odd */
5371 bs = s;
5372 for (;s < end;) {
5373 if (*s != '\\')
5374 break;
5375 *p++ = (unsigned char)*s++;
5376 }
5377 if (((s - bs) & 1) == 0 ||
5378 s >= end ||
5379 (*s != 'u' && *s != 'U')) {
5380 continue;
5381 }
5382 p--;
5383 count = *s=='u' ? 4 : 8;
5384 s++;
5385
5386 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5387 outpos = p-PyUnicode_AS_UNICODE(v);
5388 for (x = 0, i = 0; i < count; ++i, ++s) {
5389 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005390 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 endinpos = s-starts;
5392 if (unicode_decode_call_errorhandler(
5393 errors, &errorHandler,
5394 "rawunicodeescape", "truncated \\uXXXX",
5395 &starts, &end, &startinpos, &endinpos, &exc, &s,
5396 &v, &outpos, &p))
5397 goto onError;
5398 goto nextByte;
5399 }
5400 x = (x<<4) & ~0xF;
5401 if (c >= '0' && c <= '9')
5402 x += c - '0';
5403 else if (c >= 'a' && c <= 'f')
5404 x += 10 + c - 'a';
5405 else
5406 x += 10 + c - 'A';
5407 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005408 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 /* UCS-2 character */
5410 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005411 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 /* UCS-4 character. Either store directly, or as
5413 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005414#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005416#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 x -= 0x10000L;
5418 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5419 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005420#endif
5421 } else {
5422 endinpos = s-starts;
5423 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005424 if (unicode_decode_call_errorhandler(
5425 errors, &errorHandler,
5426 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 &starts, &end, &startinpos, &endinpos, &exc, &s,
5428 &v, &outpos, &p))
5429 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 nextByte:
5432 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005434 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436 Py_XDECREF(errorHandler);
5437 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 if (PyUnicode_READY(v) == -1) {
5439 Py_DECREF(v);
5440 return NULL;
5441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005443
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 return NULL;
5449}
5450
Alexander Belopolsky40018472011-02-26 01:02:56 +00005451PyObject *
5452PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5453 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005455 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 char *p;
5457 char *q;
5458
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005459#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005460 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005461#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005462 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005463#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005464
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005465 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005467
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005468 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 if (repr == NULL)
5470 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005471 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005474 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 while (size-- > 0) {
5476 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005477#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 /* Map 32-bit characters to '\Uxxxxxxxx' */
5479 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005480 *p++ = '\\';
5481 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005482 *p++ = hexdigits[(ch >> 28) & 0xf];
5483 *p++ = hexdigits[(ch >> 24) & 0xf];
5484 *p++ = hexdigits[(ch >> 20) & 0xf];
5485 *p++ = hexdigits[(ch >> 16) & 0xf];
5486 *p++ = hexdigits[(ch >> 12) & 0xf];
5487 *p++ = hexdigits[(ch >> 8) & 0xf];
5488 *p++ = hexdigits[(ch >> 4) & 0xf];
5489 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005490 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005491 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005492#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5494 if (ch >= 0xD800 && ch < 0xDC00) {
5495 Py_UNICODE ch2;
5496 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005497
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 ch2 = *s++;
5499 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005500 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5502 *p++ = '\\';
5503 *p++ = 'U';
5504 *p++ = hexdigits[(ucs >> 28) & 0xf];
5505 *p++ = hexdigits[(ucs >> 24) & 0xf];
5506 *p++ = hexdigits[(ucs >> 20) & 0xf];
5507 *p++ = hexdigits[(ucs >> 16) & 0xf];
5508 *p++ = hexdigits[(ucs >> 12) & 0xf];
5509 *p++ = hexdigits[(ucs >> 8) & 0xf];
5510 *p++ = hexdigits[(ucs >> 4) & 0xf];
5511 *p++ = hexdigits[ucs & 0xf];
5512 continue;
5513 }
5514 /* Fall through: isolated surrogates are copied as-is */
5515 s--;
5516 size++;
5517 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005518#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 /* Map 16-bit characters to '\uxxxx' */
5520 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 *p++ = '\\';
5522 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005523 *p++ = hexdigits[(ch >> 12) & 0xf];
5524 *p++ = hexdigits[(ch >> 8) & 0xf];
5525 *p++ = hexdigits[(ch >> 4) & 0xf];
5526 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 /* Copy everything else as-is */
5529 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 *p++ = (char) ch;
5531 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005532 size = p - q;
5533
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 assert(size > 0);
5535 if (_PyBytes_Resize(&repr, size) < 0)
5536 return NULL;
5537 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538}
5539
Alexander Belopolsky40018472011-02-26 01:02:56 +00005540PyObject *
5541PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005543 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005545 PyErr_BadArgument();
5546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005548 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5549 PyUnicode_GET_SIZE(unicode));
5550
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005551 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552}
5553
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005554/* --- Unicode Internal Codec ------------------------------------------- */
5555
Alexander Belopolsky40018472011-02-26 01:02:56 +00005556PyObject *
5557_PyUnicode_DecodeUnicodeInternal(const char *s,
5558 Py_ssize_t size,
5559 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005560{
5561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t startinpos;
5563 Py_ssize_t endinpos;
5564 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005565 PyUnicodeObject *v;
5566 Py_UNICODE *p;
5567 const char *end;
5568 const char *reason;
5569 PyObject *errorHandler = NULL;
5570 PyObject *exc = NULL;
5571
Neal Norwitzd43069c2006-01-08 01:12:10 +00005572#ifdef Py_UNICODE_WIDE
5573 Py_UNICODE unimax = PyUnicode_GetMax();
5574#endif
5575
Thomas Wouters89f507f2006-12-13 04:49:30 +00005576 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005577 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5578 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5581 as string was created with the old API. */
5582 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005584 p = PyUnicode_AS_UNICODE(v);
5585 end = s + size;
5586
5587 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005588 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005589 /* We have to sanity check the raw data, otherwise doom looms for
5590 some malformed UCS-4 data. */
5591 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005592#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005594#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005595 end-s < Py_UNICODE_SIZE
5596 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005598 startinpos = s - starts;
5599 if (end-s < Py_UNICODE_SIZE) {
5600 endinpos = end-starts;
5601 reason = "truncated input";
5602 }
5603 else {
5604 endinpos = s - starts + Py_UNICODE_SIZE;
5605 reason = "illegal code point (> 0x10FFFF)";
5606 }
5607 outpos = p - PyUnicode_AS_UNICODE(v);
5608 if (unicode_decode_call_errorhandler(
5609 errors, &errorHandler,
5610 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005611 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005612 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005613 goto onError;
5614 }
5615 }
5616 else {
5617 p++;
5618 s += Py_UNICODE_SIZE;
5619 }
5620 }
5621
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005622 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005623 goto onError;
5624 Py_XDECREF(errorHandler);
5625 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 if (PyUnicode_READY(v) == -1) {
5627 Py_DECREF(v);
5628 return NULL;
5629 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005630 return (PyObject *)v;
5631
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005633 Py_XDECREF(v);
5634 Py_XDECREF(errorHandler);
5635 Py_XDECREF(exc);
5636 return NULL;
5637}
5638
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639/* --- Latin-1 Codec ------------------------------------------------------ */
5640
Alexander Belopolsky40018472011-02-26 01:02:56 +00005641PyObject *
5642PyUnicode_DecodeLatin1(const char *s,
5643 Py_ssize_t size,
5644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005647 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648}
5649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005651static void
5652make_encode_exception(PyObject **exceptionObject,
5653 const char *encoding,
5654 const Py_UNICODE *unicode, Py_ssize_t size,
5655 Py_ssize_t startpos, Py_ssize_t endpos,
5656 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 *exceptionObject = PyUnicodeEncodeError_Create(
5660 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 }
5662 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5664 goto onError;
5665 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5666 goto onError;
5667 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5668 goto onError;
5669 return;
5670 onError:
5671 Py_DECREF(*exceptionObject);
5672 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 }
5674}
5675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005677static void
5678raise_encode_exception(PyObject **exceptionObject,
5679 const char *encoding,
5680 const Py_UNICODE *unicode, Py_ssize_t size,
5681 Py_ssize_t startpos, Py_ssize_t endpos,
5682 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683{
5684 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688}
5689
5690/* error handling callback helper:
5691 build arguments, call the callback and check the arguments,
5692 put the result into newpos and return the replacement string, which
5693 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005694static PyObject *
5695unicode_encode_call_errorhandler(const char *errors,
5696 PyObject **errorHandler,
5697 const char *encoding, const char *reason,
5698 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5699 Py_ssize_t startpos, Py_ssize_t endpos,
5700 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005702 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703
5704 PyObject *restuple;
5705 PyObject *resunicode;
5706
5707 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 }
5712
5713 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005717
5718 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005723 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 Py_DECREF(restuple);
5725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005727 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 &resunicode, newpos)) {
5729 Py_DECREF(restuple);
5730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005732 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5733 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5734 Py_DECREF(restuple);
5735 return NULL;
5736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005739 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5741 Py_DECREF(restuple);
5742 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 Py_INCREF(resunicode);
5745 Py_DECREF(restuple);
5746 return resunicode;
5747}
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749static PyObject *
5750unicode_encode_ucs1(const Py_UNICODE *p,
5751 Py_ssize_t size,
5752 const char *errors,
5753 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754{
5755 /* output object */
5756 PyObject *res;
5757 /* pointers to the beginning and end+1 of input */
5758 const Py_UNICODE *startp = p;
5759 const Py_UNICODE *endp = p + size;
5760 /* pointer to the beginning of the unencodable characters */
5761 /* const Py_UNICODE *badp = NULL; */
5762 /* pointer into the output */
5763 char *str;
5764 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005765 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005766 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5767 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 PyObject *errorHandler = NULL;
5769 PyObject *exc = NULL;
5770 /* the following variable is used for caching string comparisons
5771 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5772 int known_errorHandler = -1;
5773
5774 /* allocate enough for a simple encoding without
5775 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005776 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005777 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005778 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005780 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005781 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 ressize = size;
5783
5784 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* can we encode this? */
5788 if (c<limit) {
5789 /* no overflow check, because we know that the space is enough */
5790 *str++ = (char)c;
5791 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 else {
5794 Py_ssize_t unicodepos = p-startp;
5795 Py_ssize_t requiredsize;
5796 PyObject *repunicode;
5797 Py_ssize_t repsize;
5798 Py_ssize_t newpos;
5799 Py_ssize_t respos;
5800 Py_UNICODE *uni2;
5801 /* startpos for collecting unencodable chars */
5802 const Py_UNICODE *collstart = p;
5803 const Py_UNICODE *collend = p;
5804 /* find all unecodable characters */
5805 while ((collend < endp) && ((*collend)>=limit))
5806 ++collend;
5807 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5808 if (known_errorHandler==-1) {
5809 if ((errors==NULL) || (!strcmp(errors, "strict")))
5810 known_errorHandler = 1;
5811 else if (!strcmp(errors, "replace"))
5812 known_errorHandler = 2;
5813 else if (!strcmp(errors, "ignore"))
5814 known_errorHandler = 3;
5815 else if (!strcmp(errors, "xmlcharrefreplace"))
5816 known_errorHandler = 4;
5817 else
5818 known_errorHandler = 0;
5819 }
5820 switch (known_errorHandler) {
5821 case 1: /* strict */
5822 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5823 goto onError;
5824 case 2: /* replace */
5825 while (collstart++<collend)
5826 *str++ = '?'; /* fall through */
5827 case 3: /* ignore */
5828 p = collend;
5829 break;
5830 case 4: /* xmlcharrefreplace */
5831 respos = str - PyBytes_AS_STRING(res);
5832 /* determine replacement size (temporarily (mis)uses p) */
5833 for (p = collstart, repsize = 0; p < collend; ++p) {
5834 if (*p<10)
5835 repsize += 2+1+1;
5836 else if (*p<100)
5837 repsize += 2+2+1;
5838 else if (*p<1000)
5839 repsize += 2+3+1;
5840 else if (*p<10000)
5841 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005842#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 else
5844 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005845#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 else if (*p<100000)
5847 repsize += 2+5+1;
5848 else if (*p<1000000)
5849 repsize += 2+6+1;
5850 else
5851 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005852#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 }
5854 requiredsize = respos+repsize+(endp-collend);
5855 if (requiredsize > ressize) {
5856 if (requiredsize<2*ressize)
5857 requiredsize = 2*ressize;
5858 if (_PyBytes_Resize(&res, requiredsize))
5859 goto onError;
5860 str = PyBytes_AS_STRING(res) + respos;
5861 ressize = requiredsize;
5862 }
5863 /* generate replacement (temporarily (mis)uses p) */
5864 for (p = collstart; p < collend; ++p) {
5865 str += sprintf(str, "&#%d;", (int)*p);
5866 }
5867 p = collend;
5868 break;
5869 default:
5870 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5871 encoding, reason, startp, size, &exc,
5872 collstart-startp, collend-startp, &newpos);
5873 if (repunicode == NULL)
5874 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005875 if (PyBytes_Check(repunicode)) {
5876 /* Directly copy bytes result to output. */
5877 repsize = PyBytes_Size(repunicode);
5878 if (repsize > 1) {
5879 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005880 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005881 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5882 Py_DECREF(repunicode);
5883 goto onError;
5884 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005885 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005886 ressize += repsize-1;
5887 }
5888 memcpy(str, PyBytes_AsString(repunicode), repsize);
5889 str += repsize;
5890 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005891 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005892 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005893 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 /* need more space? (at least enough for what we
5895 have+the replacement+the rest of the string, so
5896 we won't have to check space for encodable characters) */
5897 respos = str - PyBytes_AS_STRING(res);
5898 repsize = PyUnicode_GET_SIZE(repunicode);
5899 requiredsize = respos+repsize+(endp-collend);
5900 if (requiredsize > ressize) {
5901 if (requiredsize<2*ressize)
5902 requiredsize = 2*ressize;
5903 if (_PyBytes_Resize(&res, requiredsize)) {
5904 Py_DECREF(repunicode);
5905 goto onError;
5906 }
5907 str = PyBytes_AS_STRING(res) + respos;
5908 ressize = requiredsize;
5909 }
5910 /* check if there is anything unencodable in the replacement
5911 and copy it to the output */
5912 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5913 c = *uni2;
5914 if (c >= limit) {
5915 raise_encode_exception(&exc, encoding, startp, size,
5916 unicodepos, unicodepos+1, reason);
5917 Py_DECREF(repunicode);
5918 goto onError;
5919 }
5920 *str = (char)c;
5921 }
5922 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005923 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005924 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005925 }
5926 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005927 /* Resize if we allocated to much */
5928 size = str - PyBytes_AS_STRING(res);
5929 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005930 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 if (_PyBytes_Resize(&res, size) < 0)
5932 goto onError;
5933 }
5934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 Py_XDECREF(errorHandler);
5936 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005937 return res;
5938
5939 onError:
5940 Py_XDECREF(res);
5941 Py_XDECREF(errorHandler);
5942 Py_XDECREF(exc);
5943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944}
5945
Alexander Belopolsky40018472011-02-26 01:02:56 +00005946PyObject *
5947PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5948 Py_ssize_t size,
5949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952}
5953
Alexander Belopolsky40018472011-02-26 01:02:56 +00005954PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005955_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956{
5957 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 PyErr_BadArgument();
5959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005961 if (PyUnicode_READY(unicode) == -1)
5962 return NULL;
5963 /* Fast path: if it is a one-byte string, construct
5964 bytes object directly. */
5965 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5966 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5967 PyUnicode_GET_LENGTH(unicode));
5968 /* Non-Latin-1 characters present. Defer to above function to
5969 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005972 errors);
5973}
5974
5975PyObject*
5976PyUnicode_AsLatin1String(PyObject *unicode)
5977{
5978 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979}
5980
5981/* --- 7-bit ASCII Codec -------------------------------------------------- */
5982
Alexander Belopolsky40018472011-02-26 01:02:56 +00005983PyObject *
5984PyUnicode_DecodeASCII(const char *s,
5985 Py_ssize_t size,
5986 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 PyUnicodeObject *v;
5990 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005991 Py_ssize_t startinpos;
5992 Py_ssize_t endinpos;
5993 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005995 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 PyObject *errorHandler = NULL;
5997 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005998 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00005999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006001 if (size == 1 && *(unsigned char*)s < 128)
6002 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6003
6004 /* Fast path. Assume the input actually *is* ASCII, and allocate
6005 a single-block Unicode object with that assumption. If there is
6006 an error, drop the object and start over. */
6007 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6008 if (v == NULL)
6009 goto onError;
6010 d = PyUnicode_1BYTE_DATA(v);
6011 for (i = 0; i < size; i++) {
6012 unsigned char ch = ((unsigned char*)s)[i];
6013 if (ch < 128)
6014 d[i] = ch;
6015 else
6016 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006018 if (i == size)
6019 return (PyObject*)v;
6020 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 v = _PyUnicode_New(size);
6023 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 e = s + size;
6029 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 register unsigned char c = (unsigned char)*s;
6031 if (c < 128) {
6032 *p++ = c;
6033 ++s;
6034 }
6035 else {
6036 startinpos = s-starts;
6037 endinpos = startinpos + 1;
6038 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6039 if (unicode_decode_call_errorhandler(
6040 errors, &errorHandler,
6041 "ascii", "ordinal not in range(128)",
6042 &starts, &e, &startinpos, &endinpos, &exc, &s,
6043 &v, &outpos, &p))
6044 goto onError;
6045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006047 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6049 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 Py_XDECREF(errorHandler);
6051 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 if (PyUnicode_READY(v) == -1) {
6053 Py_DECREF(v);
6054 return NULL;
6055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006057
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 Py_XDECREF(errorHandler);
6061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 return NULL;
6063}
6064
Alexander Belopolsky40018472011-02-26 01:02:56 +00006065PyObject *
6066PyUnicode_EncodeASCII(const Py_UNICODE *p,
6067 Py_ssize_t size,
6068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071}
6072
Alexander Belopolsky40018472011-02-26 01:02:56 +00006073PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006074_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075{
6076 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 PyErr_BadArgument();
6078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006080 if (PyUnicode_READY(unicode) == -1)
6081 return NULL;
6082 /* Fast path: if it is an ASCII-only string, construct bytes object
6083 directly. Else defer to above function to raise the exception. */
6084 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6085 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6086 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006089 errors);
6090}
6091
6092PyObject *
6093PyUnicode_AsASCIIString(PyObject *unicode)
6094{
6095 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Victor Stinner99b95382011-07-04 14:23:54 +02006098#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006099
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006100/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006101
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006102#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006103#define NEED_RETRY
6104#endif
6105
6106/* XXX This code is limited to "true" double-byte encodings, as
6107 a) it assumes an incomplete character consists of a single byte, and
6108 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006110
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() cannot step back (it is the first character), when the
   previous character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6122
6123/*
6124 * Decode MBCS string into unicode object. If 'final' is set, converts
6125 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6126 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127static int
6128decode_mbcs(PyUnicodeObject **v,
6129 const char *s, /* MBCS string */
6130 int size, /* sizeof MBCS string */
6131 int final,
6132 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006133{
6134 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006135 Py_ssize_t n;
6136 DWORD usize;
6137 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006138
6139 assert(size >= 0);
6140
Victor Stinner554f3f02010-06-16 23:33:54 +00006141 /* check and handle 'errors' arg */
6142 if (errors==NULL || strcmp(errors, "strict")==0)
6143 flags = MB_ERR_INVALID_CHARS;
6144 else if (strcmp(errors, "ignore")==0)
6145 flags = 0;
6146 else {
6147 PyErr_Format(PyExc_ValueError,
6148 "mbcs encoding does not support errors='%s'",
6149 errors);
6150 return -1;
6151 }
6152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006153 /* Skip trailing lead-byte unless 'final' is set */
6154 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006156
6157 /* First get the size of the result */
6158 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006159 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6160 if (usize==0)
6161 goto mbcs_decode_error;
6162 } else
6163 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006164
6165 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 /* Create unicode object */
6167 *v = _PyUnicode_New(usize);
6168 if (*v == NULL)
6169 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006170 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006171 }
6172 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 /* Extend unicode object */
6174 n = PyUnicode_GET_SIZE(*v);
6175 if (_PyUnicode_Resize(v, n + usize) < 0)
6176 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006177 }
6178
6179 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006180 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006182 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6183 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006185 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006187
6188mbcs_decode_error:
6189 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6190 we raise a UnicodeDecodeError - else it is a 'generic'
6191 windows error
6192 */
6193 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6194 /* Ideally, we should get reason from FormatMessage - this
6195 is the Windows 2000 English version of the message
6196 */
6197 PyObject *exc = NULL;
6198 const char *reason = "No mapping for the Unicode character exists "
6199 "in the target multi-byte code page.";
6200 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6201 if (exc != NULL) {
6202 PyCodec_StrictErrors(exc);
6203 Py_DECREF(exc);
6204 }
6205 } else {
6206 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6207 }
6208 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006209}
6210
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyObject *
6212PyUnicode_DecodeMBCSStateful(const char *s,
6213 Py_ssize_t size,
6214 const char *errors,
6215 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006216{
6217 PyUnicodeObject *v = NULL;
6218 int done;
6219
6220 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006222
6223#ifdef NEED_RETRY
6224 retry:
6225 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006226 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006227 else
6228#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006229 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006230
6231 if (done < 0) {
6232 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006234 }
6235
6236 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006238
6239#ifdef NEED_RETRY
6240 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 s += done;
6242 size -= done;
6243 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244 }
6245#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006246 if (PyUnicode_READY(v) == -1) {
6247 Py_DECREF(v);
6248 return NULL;
6249 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006250 return (PyObject *)v;
6251}
6252
Alexander Belopolsky40018472011-02-26 01:02:56 +00006253PyObject *
6254PyUnicode_DecodeMBCS(const char *s,
6255 Py_ssize_t size,
6256 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006257{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006258 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6259}
6260
6261/*
6262 * Convert unicode into string object (MBCS).
6263 * Returns 0 if succeed, -1 otherwise.
6264 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006265static int
6266encode_mbcs(PyObject **repr,
6267 const Py_UNICODE *p, /* unicode */
6268 int size, /* size of unicode */
6269 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006270{
Victor Stinner554f3f02010-06-16 23:33:54 +00006271 BOOL usedDefaultChar = FALSE;
6272 BOOL *pusedDefaultChar;
6273 int mbcssize;
6274 Py_ssize_t n;
6275 PyObject *exc = NULL;
6276 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006277
6278 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006279
Victor Stinner554f3f02010-06-16 23:33:54 +00006280 /* check and handle 'errors' arg */
6281 if (errors==NULL || strcmp(errors, "strict")==0) {
6282 flags = WC_NO_BEST_FIT_CHARS;
6283 pusedDefaultChar = &usedDefaultChar;
6284 } else if (strcmp(errors, "replace")==0) {
6285 flags = 0;
6286 pusedDefaultChar = NULL;
6287 } else {
6288 PyErr_Format(PyExc_ValueError,
6289 "mbcs encoding does not support errors='%s'",
6290 errors);
6291 return -1;
6292 }
6293
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006294 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006295 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006296 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6297 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 if (mbcssize == 0) {
6299 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6300 return -1;
6301 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006302 /* If we used a default char, then we failed! */
6303 if (pusedDefaultChar && *pusedDefaultChar)
6304 goto mbcs_encode_error;
6305 } else {
6306 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006307 }
6308
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006309 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 /* Create string object */
6311 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6312 if (*repr == NULL)
6313 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006314 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006315 }
6316 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* Extend string object */
6318 n = PyBytes_Size(*repr);
6319 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6320 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006321 }
6322
6323 /* Do the conversion */
6324 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006326 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6327 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6329 return -1;
6330 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006331 if (pusedDefaultChar && *pusedDefaultChar)
6332 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006333 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006334 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006335
6336mbcs_encode_error:
6337 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6338 Py_XDECREF(exc);
6339 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006340}
6341
Alexander Belopolsky40018472011-02-26 01:02:56 +00006342PyObject *
6343PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6344 Py_ssize_t size,
6345 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006346{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006347 PyObject *repr = NULL;
6348 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006349
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006352 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006353 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006354 else
6355#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006356 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006357
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006358 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 Py_XDECREF(repr);
6360 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006361 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006362
6363#ifdef NEED_RETRY
6364 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 p += INT_MAX;
6366 size -= INT_MAX;
6367 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368 }
6369#endif
6370
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006371 return repr;
6372}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006373
Alexander Belopolsky40018472011-02-26 01:02:56 +00006374PyObject *
6375PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006376{
6377 if (!PyUnicode_Check(unicode)) {
6378 PyErr_BadArgument();
6379 return NULL;
6380 }
6381 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 PyUnicode_GET_SIZE(unicode),
6383 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006384}
6385
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006386#undef NEED_RETRY
6387
Victor Stinner99b95382011-07-04 14:23:54 +02006388#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390/* --- Character Mapping Codec -------------------------------------------- */
6391
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392PyObject *
6393PyUnicode_DecodeCharmap(const char *s,
6394 Py_ssize_t size,
6395 PyObject *mapping,
6396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006399 Py_ssize_t startinpos;
6400 Py_ssize_t endinpos;
6401 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 PyUnicodeObject *v;
6404 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006405 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 PyObject *errorHandler = NULL;
6407 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006408 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006409 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 /* Default to Latin-1 */
6412 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
6415 v = _PyUnicode_New(size);
6416 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006422 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 mapstring = PyUnicode_AS_UNICODE(mapping);
6424 maplen = PyUnicode_GET_SIZE(mapping);
6425 while (s < e) {
6426 unsigned char ch = *s;
6427 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 if (ch < maplen)
6430 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 if (x == 0xfffe) {
6433 /* undefined mapping */
6434 outpos = p-PyUnicode_AS_UNICODE(v);
6435 startinpos = s-starts;
6436 endinpos = startinpos+1;
6437 if (unicode_decode_call_errorhandler(
6438 errors, &errorHandler,
6439 "charmap", "character maps to <undefined>",
6440 &starts, &e, &startinpos, &endinpos, &exc, &s,
6441 &v, &outpos, &p)) {
6442 goto onError;
6443 }
6444 continue;
6445 }
6446 *p++ = x;
6447 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006448 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006449 }
6450 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 while (s < e) {
6452 unsigned char ch = *s;
6453 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006454
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6456 w = PyLong_FromLong((long)ch);
6457 if (w == NULL)
6458 goto onError;
6459 x = PyObject_GetItem(mapping, w);
6460 Py_DECREF(w);
6461 if (x == NULL) {
6462 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6463 /* No mapping found means: mapping is undefined. */
6464 PyErr_Clear();
6465 x = Py_None;
6466 Py_INCREF(x);
6467 } else
6468 goto onError;
6469 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006470
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 /* Apply mapping */
6472 if (PyLong_Check(x)) {
6473 long value = PyLong_AS_LONG(x);
6474 if (value < 0 || value > 65535) {
6475 PyErr_SetString(PyExc_TypeError,
6476 "character mapping must be in range(65536)");
6477 Py_DECREF(x);
6478 goto onError;
6479 }
6480 *p++ = (Py_UNICODE)value;
6481 }
6482 else if (x == Py_None) {
6483 /* undefined mapping */
6484 outpos = p-PyUnicode_AS_UNICODE(v);
6485 startinpos = s-starts;
6486 endinpos = startinpos+1;
6487 if (unicode_decode_call_errorhandler(
6488 errors, &errorHandler,
6489 "charmap", "character maps to <undefined>",
6490 &starts, &e, &startinpos, &endinpos, &exc, &s,
6491 &v, &outpos, &p)) {
6492 Py_DECREF(x);
6493 goto onError;
6494 }
6495 Py_DECREF(x);
6496 continue;
6497 }
6498 else if (PyUnicode_Check(x)) {
6499 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006500
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 if (targetsize == 1)
6502 /* 1-1 mapping */
6503 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006504
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 else if (targetsize > 1) {
6506 /* 1-n mapping */
6507 if (targetsize > extrachars) {
6508 /* resize first */
6509 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6510 Py_ssize_t needed = (targetsize - extrachars) + \
6511 (targetsize << 2);
6512 extrachars += needed;
6513 /* XXX overflow detection missing */
6514 if (_PyUnicode_Resize(&v,
6515 PyUnicode_GET_SIZE(v) + needed) < 0) {
6516 Py_DECREF(x);
6517 goto onError;
6518 }
6519 p = PyUnicode_AS_UNICODE(v) + oldpos;
6520 }
6521 Py_UNICODE_COPY(p,
6522 PyUnicode_AS_UNICODE(x),
6523 targetsize);
6524 p += targetsize;
6525 extrachars -= targetsize;
6526 }
6527 /* 1-0 mapping: skip the character */
6528 }
6529 else {
6530 /* wrong return value */
6531 PyErr_SetString(PyExc_TypeError,
6532 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006533 Py_DECREF(x);
6534 goto onError;
6535 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 Py_DECREF(x);
6537 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
6540 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6542 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 Py_XDECREF(errorHandler);
6544 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006545 if (PyUnicode_READY(v) == -1) {
6546 Py_DECREF(v);
6547 return NULL;
6548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006550
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 Py_XDECREF(v);
6555 return NULL;
6556}
6557
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006558/* Charmap encoding: the lookup table */
6559
Alexander Belopolsky40018472011-02-26 01:02:56 +00006560struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 PyObject_HEAD
6562 unsigned char level1[32];
6563 int count2, count3;
6564 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006565};
6566
6567static PyObject*
6568encoding_map_size(PyObject *obj, PyObject* args)
6569{
6570 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006571 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006573}
6574
6575static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006576 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 PyDoc_STR("Return the size (in bytes) of this object") },
6578 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006579};
6580
6581static void
6582encoding_map_dealloc(PyObject* o)
6583{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006585}
6586
6587static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 "EncodingMap", /*tp_name*/
6590 sizeof(struct encoding_map), /*tp_basicsize*/
6591 0, /*tp_itemsize*/
6592 /* methods */
6593 encoding_map_dealloc, /*tp_dealloc*/
6594 0, /*tp_print*/
6595 0, /*tp_getattr*/
6596 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006597 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 0, /*tp_repr*/
6599 0, /*tp_as_number*/
6600 0, /*tp_as_sequence*/
6601 0, /*tp_as_mapping*/
6602 0, /*tp_hash*/
6603 0, /*tp_call*/
6604 0, /*tp_str*/
6605 0, /*tp_getattro*/
6606 0, /*tp_setattro*/
6607 0, /*tp_as_buffer*/
6608 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6609 0, /*tp_doc*/
6610 0, /*tp_traverse*/
6611 0, /*tp_clear*/
6612 0, /*tp_richcompare*/
6613 0, /*tp_weaklistoffset*/
6614 0, /*tp_iter*/
6615 0, /*tp_iternext*/
6616 encoding_map_methods, /*tp_methods*/
6617 0, /*tp_members*/
6618 0, /*tp_getset*/
6619 0, /*tp_base*/
6620 0, /*tp_dict*/
6621 0, /*tp_descr_get*/
6622 0, /*tp_descr_set*/
6623 0, /*tp_dictoffset*/
6624 0, /*tp_init*/
6625 0, /*tp_alloc*/
6626 0, /*tp_new*/
6627 0, /*tp_free*/
6628 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006629};
6630
6631PyObject*
6632PyUnicode_BuildEncodingMap(PyObject* string)
6633{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006634 PyObject *result;
6635 struct encoding_map *mresult;
6636 int i;
6637 int need_dict = 0;
6638 unsigned char level1[32];
6639 unsigned char level2[512];
6640 unsigned char *mlevel1, *mlevel2, *mlevel3;
6641 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006642 int kind;
6643 void *data;
6644 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006646 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006647 PyErr_BadArgument();
6648 return NULL;
6649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006650 kind = PyUnicode_KIND(string);
6651 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006652 memset(level1, 0xFF, sizeof level1);
6653 memset(level2, 0xFF, sizeof level2);
6654
6655 /* If there isn't a one-to-one mapping of NULL to \0,
6656 or if there are non-BMP characters, we need to use
6657 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006658 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006659 need_dict = 1;
6660 for (i = 1; i < 256; i++) {
6661 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006662 ch = PyUnicode_READ(kind, data, i);
6663 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006664 need_dict = 1;
6665 break;
6666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006667 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006668 /* unmapped character */
6669 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670 l1 = ch >> 11;
6671 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006672 if (level1[l1] == 0xFF)
6673 level1[l1] = count2++;
6674 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006675 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006676 }
6677
6678 if (count2 >= 0xFF || count3 >= 0xFF)
6679 need_dict = 1;
6680
6681 if (need_dict) {
6682 PyObject *result = PyDict_New();
6683 PyObject *key, *value;
6684 if (!result)
6685 return NULL;
6686 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006687 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006688 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006689 if (!key || !value)
6690 goto failed1;
6691 if (PyDict_SetItem(result, key, value) == -1)
6692 goto failed1;
6693 Py_DECREF(key);
6694 Py_DECREF(value);
6695 }
6696 return result;
6697 failed1:
6698 Py_XDECREF(key);
6699 Py_XDECREF(value);
6700 Py_DECREF(result);
6701 return NULL;
6702 }
6703
6704 /* Create a three-level trie */
6705 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6706 16*count2 + 128*count3 - 1);
6707 if (!result)
6708 return PyErr_NoMemory();
6709 PyObject_Init(result, &EncodingMapType);
6710 mresult = (struct encoding_map*)result;
6711 mresult->count2 = count2;
6712 mresult->count3 = count3;
6713 mlevel1 = mresult->level1;
6714 mlevel2 = mresult->level23;
6715 mlevel3 = mresult->level23 + 16*count2;
6716 memcpy(mlevel1, level1, 32);
6717 memset(mlevel2, 0xFF, 16*count2);
6718 memset(mlevel3, 0, 128*count3);
6719 count3 = 0;
6720 for (i = 1; i < 256; i++) {
6721 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006722 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006723 /* unmapped character */
6724 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006725 o1 = PyUnicode_READ(kind, data, i)>>11;
6726 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006727 i2 = 16*mlevel1[o1] + o2;
6728 if (mlevel2[i2] == 0xFF)
6729 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006730 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006731 i3 = 128*mlevel2[i2] + o3;
6732 mlevel3[i3] = i;
6733 }
6734 return result;
6735}
6736
6737static int
6738encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6739{
6740 struct encoding_map *map = (struct encoding_map*)mapping;
6741 int l1 = c>>11;
6742 int l2 = (c>>7) & 0xF;
6743 int l3 = c & 0x7F;
6744 int i;
6745
6746#ifdef Py_UNICODE_WIDE
6747 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006749 }
6750#endif
6751 if (c == 0)
6752 return 0;
6753 /* level 1*/
6754 i = map->level1[l1];
6755 if (i == 0xFF) {
6756 return -1;
6757 }
6758 /* level 2*/
6759 i = map->level23[16*i+l2];
6760 if (i == 0xFF) {
6761 return -1;
6762 }
6763 /* level 3 */
6764 i = map->level23[16*map->count2 + 128*i + l3];
6765 if (i == 0) {
6766 return -1;
6767 }
6768 return i;
6769}
6770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771/* Lookup the character ch in the mapping. If the character
6772 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006773 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774static PyObject *
6775charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Christian Heimes217cfd12007-12-02 14:31:20 +00006777 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 PyObject *x;
6779
6780 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006782 x = PyObject_GetItem(mapping, w);
6783 Py_DECREF(w);
6784 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6786 /* No mapping found means: mapping is undefined. */
6787 PyErr_Clear();
6788 x = Py_None;
6789 Py_INCREF(x);
6790 return x;
6791 } else
6792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006794 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006796 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 long value = PyLong_AS_LONG(x);
6798 if (value < 0 || value > 255) {
6799 PyErr_SetString(PyExc_TypeError,
6800 "character mapping must be in range(256)");
6801 Py_DECREF(x);
6802 return NULL;
6803 }
6804 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006806 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* wrong return value */
6810 PyErr_Format(PyExc_TypeError,
6811 "character mapping must return integer, bytes or None, not %.400s",
6812 x->ob_type->tp_name);
6813 Py_DECREF(x);
6814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 }
6816}
6817
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006818static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006819charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006820{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006821 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6822 /* exponentially overallocate to minimize reallocations */
6823 if (requiredsize < 2*outsize)
6824 requiredsize = 2*outsize;
6825 if (_PyBytes_Resize(outobj, requiredsize))
6826 return -1;
6827 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006828}
6829
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded and written */
    enc_FAILED = 1,     /* the mapping has no encoding for the character */
    enc_EXCEPTION = 2   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006834 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 space is available. Return a new reference to the object that
6836 was put in the output buffer, or Py_None, if the mapping was undefined
6837 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006838 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006839static charmapencode_result
6840charmapencode_output(Py_UNICODE c, PyObject *mapping,
6841 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006843 PyObject *rep;
6844 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006845 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846
Christian Heimes90aa7642007-12-19 02:45:37 +00006847 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006848 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006850 if (res == -1)
6851 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 if (outsize<requiredsize)
6853 if (charmapencode_resize(outobj, outpos, requiredsize))
6854 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006855 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 outstart[(*outpos)++] = (char)res;
6857 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006858 }
6859
6860 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006863 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 Py_DECREF(rep);
6865 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006866 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 if (PyLong_Check(rep)) {
6868 Py_ssize_t requiredsize = *outpos+1;
6869 if (outsize<requiredsize)
6870 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6871 Py_DECREF(rep);
6872 return enc_EXCEPTION;
6873 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006874 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 else {
6878 const char *repchars = PyBytes_AS_STRING(rep);
6879 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6880 Py_ssize_t requiredsize = *outpos+repsize;
6881 if (outsize<requiredsize)
6882 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6883 Py_DECREF(rep);
6884 return enc_EXCEPTION;
6885 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006886 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 memcpy(outstart + *outpos, repchars, repsize);
6888 *outpos += repsize;
6889 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006891 Py_DECREF(rep);
6892 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893}
6894
6895/* handle an error in PyUnicode_EncodeCharmap
6896 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006897static int
6898charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006899 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006901 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006902 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903{
6904 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006905 Py_ssize_t repsize;
6906 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 Py_UNICODE *uni2;
6908 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006909 Py_ssize_t collstartpos = *inpos;
6910 Py_ssize_t collendpos = *inpos+1;
6911 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 char *encoding = "charmap";
6913 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006914 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 /* find all unencodable characters */
6917 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006918 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006919 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 int res = encoding_map_lookup(p[collendpos], mapping);
6921 if (res != -1)
6922 break;
6923 ++collendpos;
6924 continue;
6925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 rep = charmapencode_lookup(p[collendpos], mapping);
6928 if (rep==NULL)
6929 return -1;
6930 else if (rep!=Py_None) {
6931 Py_DECREF(rep);
6932 break;
6933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006934 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 }
6937 /* cache callback name lookup
6938 * (if not done yet, i.e. it's the first error) */
6939 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 if ((errors==NULL) || (!strcmp(errors, "strict")))
6941 *known_errorHandler = 1;
6942 else if (!strcmp(errors, "replace"))
6943 *known_errorHandler = 2;
6944 else if (!strcmp(errors, "ignore"))
6945 *known_errorHandler = 3;
6946 else if (!strcmp(errors, "xmlcharrefreplace"))
6947 *known_errorHandler = 4;
6948 else
6949 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 }
6951 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006952 case 1: /* strict */
6953 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6954 return -1;
6955 case 2: /* replace */
6956 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 x = charmapencode_output('?', mapping, res, respos);
6958 if (x==enc_EXCEPTION) {
6959 return -1;
6960 }
6961 else if (x==enc_FAILED) {
6962 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6963 return -1;
6964 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006965 }
6966 /* fall through */
6967 case 3: /* ignore */
6968 *inpos = collendpos;
6969 break;
6970 case 4: /* xmlcharrefreplace */
6971 /* generate replacement (temporarily (mis)uses p) */
6972 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 char buffer[2+29+1+1];
6974 char *cp;
6975 sprintf(buffer, "&#%d;", (int)p[collpos]);
6976 for (cp = buffer; *cp; ++cp) {
6977 x = charmapencode_output(*cp, mapping, res, respos);
6978 if (x==enc_EXCEPTION)
6979 return -1;
6980 else if (x==enc_FAILED) {
6981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6982 return -1;
6983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 }
6985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986 *inpos = collendpos;
6987 break;
6988 default:
6989 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 encoding, reason, p, size, exceptionObject,
6991 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006994 if (PyBytes_Check(repunicode)) {
6995 /* Directly copy bytes result to output. */
6996 Py_ssize_t outsize = PyBytes_Size(*res);
6997 Py_ssize_t requiredsize;
6998 repsize = PyBytes_Size(repunicode);
6999 requiredsize = *respos + repsize;
7000 if (requiredsize > outsize)
7001 /* Make room for all additional bytes. */
7002 if (charmapencode_resize(res, respos, requiredsize)) {
7003 Py_DECREF(repunicode);
7004 return -1;
7005 }
7006 memcpy(PyBytes_AsString(*res) + *respos,
7007 PyBytes_AsString(repunicode), repsize);
7008 *respos += repsize;
7009 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007010 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007011 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007012 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007013 /* generate replacement */
7014 repsize = PyUnicode_GET_SIZE(repunicode);
7015 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 x = charmapencode_output(*uni2, mapping, res, respos);
7017 if (x==enc_EXCEPTION) {
7018 return -1;
7019 }
7020 else if (x==enc_FAILED) {
7021 Py_DECREF(repunicode);
7022 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7023 return -1;
7024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007025 }
7026 *inpos = newpos;
7027 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007028 }
7029 return 0;
7030}
7031
Alexander Belopolsky40018472011-02-26 01:02:56 +00007032PyObject *
7033PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7034 Py_ssize_t size,
7035 PyObject *mapping,
7036 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007038 /* output object */
7039 PyObject *res = NULL;
7040 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007041 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007043 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 PyObject *errorHandler = NULL;
7045 PyObject *exc = NULL;
7046 /* the following variable is used for caching string comparisons
7047 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7048 * 3=ignore, 4=xmlcharrefreplace */
7049 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050
7051 /* Default to Latin-1 */
7052 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007053 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 /* allocate enough for a simple encoding without
7056 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007057 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058 if (res == NULL)
7059 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007060 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007063 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 /* try to encode it */
7065 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7066 if (x==enc_EXCEPTION) /* error */
7067 goto onError;
7068 if (x==enc_FAILED) { /* unencodable character */
7069 if (charmap_encoding_error(p, size, &inpos, mapping,
7070 &exc,
7071 &known_errorHandler, &errorHandler, errors,
7072 &res, &respos)) {
7073 goto onError;
7074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 else
7077 /* done with this character => adjust input position */
7078 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007082 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007083 if (_PyBytes_Resize(&res, respos) < 0)
7084 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 Py_XDECREF(exc);
7087 Py_XDECREF(errorHandler);
7088 return res;
7089
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 Py_XDECREF(res);
7092 Py_XDECREF(exc);
7093 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 return NULL;
7095}
7096
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
7098PyUnicode_AsCharmapString(PyObject *unicode,
7099 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100{
7101 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 PyErr_BadArgument();
7103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 }
7105 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 PyUnicode_GET_SIZE(unicode),
7107 mapping,
7108 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109}
7110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007112static void
7113make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007114 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007115 Py_ssize_t startpos, Py_ssize_t endpos,
7116 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007118 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007119 *exceptionObject = _PyUnicodeTranslateError_Create(
7120 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 }
7122 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7124 goto onError;
7125 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7126 goto onError;
7127 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7128 goto onError;
7129 return;
7130 onError:
7131 Py_DECREF(*exceptionObject);
7132 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 }
7134}
7135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007136/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007137static void
7138raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007139 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007140 Py_ssize_t startpos, Py_ssize_t endpos,
7141 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142{
7143 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007144 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147}
7148
7149/* error handling callback helper:
7150 build arguments, call the callback and check the arguments,
7151 put the result into newpos and return the replacement string, which
7152 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007153static PyObject *
7154unicode_translate_call_errorhandler(const char *errors,
7155 PyObject **errorHandler,
7156 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007157 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007158 Py_ssize_t startpos, Py_ssize_t endpos,
7159 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007160{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007161 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007163 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164 PyObject *restuple;
7165 PyObject *resunicode;
7166
7167 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171 }
7172
7173 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007174 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177
7178 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007183 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 Py_DECREF(restuple);
7185 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186 }
7187 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 &resunicode, &i_newpos)) {
7189 Py_DECREF(restuple);
7190 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007193 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 else
7195 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7198 Py_DECREF(restuple);
7199 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201 Py_INCREF(resunicode);
7202 Py_DECREF(restuple);
7203 return resunicode;
7204}
7205
7206/* Lookup the character ch in the mapping and put the result in result,
7207 which must be decrefed by the caller.
7208 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007209static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007210charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211{
Christian Heimes217cfd12007-12-02 14:31:20 +00007212 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213 PyObject *x;
7214
7215 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 x = PyObject_GetItem(mapping, w);
7218 Py_DECREF(w);
7219 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7221 /* No mapping found means: use 1:1 mapping. */
7222 PyErr_Clear();
7223 *result = NULL;
7224 return 0;
7225 } else
7226 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 }
7228 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 *result = x;
7230 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007232 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 long value = PyLong_AS_LONG(x);
7234 long max = PyUnicode_GetMax();
7235 if (value < 0 || value > max) {
7236 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007237 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 Py_DECREF(x);
7239 return -1;
7240 }
7241 *result = x;
7242 return 0;
7243 }
7244 else if (PyUnicode_Check(x)) {
7245 *result = x;
7246 return 0;
7247 }
7248 else {
7249 /* wrong return value */
7250 PyErr_SetString(PyExc_TypeError,
7251 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007252 Py_DECREF(x);
7253 return -1;
7254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255}
7256/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 if not reallocate and adjust various state variables.
7258 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007263 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007264 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 /* exponentially overallocate to minimize reallocations */
7266 if (requiredsize < 2 * oldsize)
7267 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007268 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7269 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007271 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 }
7273 return 0;
7274}
7275/* lookup the character, put the result in the output string and adjust
7276 various state variables. Return a new reference to the object that
7277 was put in the output buffer in *result, or Py_None, if the mapping was
7278 undefined (in which case no character was written).
7279 The called must decref result.
7280 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7283 PyObject *mapping, Py_UCS4 **output,
7284 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007285 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007287 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7288 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007290 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007292 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 }
7294 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007296 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007298 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 }
7300 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007301 Py_ssize_t repsize;
7302 if (PyUnicode_READY(*res) == -1)
7303 return -1;
7304 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 if (repsize==1) {
7306 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007307 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 }
7309 else if (repsize!=0) {
7310 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007311 Py_ssize_t requiredsize = *opos +
7312 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 Py_ssize_t i;
7315 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 for(i = 0; i < repsize; i++)
7318 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007320 }
7321 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 return 0;
7324}
7325
Alexander Belopolsky40018472011-02-26 01:02:56 +00007326PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007327_PyUnicode_TranslateCharmap(PyObject *input,
7328 PyObject *mapping,
7329 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007331 /* input object */
7332 char *idata;
7333 Py_ssize_t size, i;
7334 int kind;
7335 /* output buffer */
7336 Py_UCS4 *output = NULL;
7337 Py_ssize_t osize;
7338 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007340 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341 char *reason = "character maps to <undefined>";
7342 PyObject *errorHandler = NULL;
7343 PyObject *exc = NULL;
7344 /* the following variable is used for caching string comparisons
7345 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7346 * 3=ignore, 4=xmlcharrefreplace */
7347 int known_errorHandler = -1;
7348
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 PyErr_BadArgument();
7351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007354 if (PyUnicode_READY(input) == -1)
7355 return NULL;
7356 idata = (char*)PyUnicode_DATA(input);
7357 kind = PyUnicode_KIND(input);
7358 size = PyUnicode_GET_LENGTH(input);
7359 i = 0;
7360
7361 if (size == 0) {
7362 Py_INCREF(input);
7363 return input;
7364 }
7365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366 /* allocate enough for a simple 1:1 translation without
7367 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368 osize = size;
7369 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7370 opos = 0;
7371 if (output == NULL) {
7372 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007376 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 /* try to encode it */
7378 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 if (charmaptranslate_output(input, i, mapping,
7380 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 Py_XDECREF(x);
7382 goto onError;
7383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007384 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007386 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 else { /* untranslatable character */
7388 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7389 Py_ssize_t repsize;
7390 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393 Py_ssize_t collstart = i;
7394 Py_ssize_t collend = i+1;
7395 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007398 while (collend < size) {
7399 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 goto onError;
7401 Py_XDECREF(x);
7402 if (x!=Py_None)
7403 break;
7404 ++collend;
7405 }
7406 /* cache callback name lookup
7407 * (if not done yet, i.e. it's the first error) */
7408 if (known_errorHandler==-1) {
7409 if ((errors==NULL) || (!strcmp(errors, "strict")))
7410 known_errorHandler = 1;
7411 else if (!strcmp(errors, "replace"))
7412 known_errorHandler = 2;
7413 else if (!strcmp(errors, "ignore"))
7414 known_errorHandler = 3;
7415 else if (!strcmp(errors, "xmlcharrefreplace"))
7416 known_errorHandler = 4;
7417 else
7418 known_errorHandler = 0;
7419 }
7420 switch (known_errorHandler) {
7421 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007422 raise_translate_exception(&exc, input, collstart,
7423 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007424 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 case 2: /* replace */
7426 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007427 for (coll = collstart; coll<collend; coll++)
7428 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 /* fall through */
7430 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007431 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 break;
7433 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007434 /* generate replacement (temporarily (mis)uses i) */
7435 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 char buffer[2+29+1+1];
7437 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7439 if (charmaptranslate_makespace(&output, &osize,
7440 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 goto onError;
7442 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 break;
7447 default:
7448 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007449 reason, input, &exc,
7450 collstart, collend, &newpos);
7451 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 goto onError;
7453 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007454 repsize = PyUnicode_GET_LENGTH(repunicode);
7455 if (charmaptranslate_makespace(&output, &osize,
7456 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 Py_DECREF(repunicode);
7458 goto onError;
7459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 for (uni2 = 0; repsize-->0; ++uni2)
7461 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7462 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 }
7466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007467 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7468 if (!res)
7469 goto onError;
7470 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007471 Py_XDECREF(exc);
7472 Py_XDECREF(errorHandler);
7473 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007477 Py_XDECREF(exc);
7478 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 return NULL;
7480}
7481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482/* Deprecated. Use PyUnicode_Translate instead. */
7483PyObject *
7484PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7485 Py_ssize_t size,
7486 PyObject *mapping,
7487 const char *errors)
7488{
7489 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7490 if (!unicode)
7491 return NULL;
7492 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7493}
7494
Alexander Belopolsky40018472011-02-26 01:02:56 +00007495PyObject *
7496PyUnicode_Translate(PyObject *str,
7497 PyObject *mapping,
7498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499{
7500 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007501
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 str = PyUnicode_FromObject(str);
7503 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007505 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 Py_DECREF(str);
7507 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007508
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 Py_XDECREF(str);
7511 return NULL;
7512}
Tim Petersced69f82003-09-16 20:30:58 +00007513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007514static Py_UCS4
7515fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7516{
7517 /* No need to call PyUnicode_READY(self) because this function is only
7518 called as a callback from fixup() which does it already. */
7519 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7520 const int kind = PyUnicode_KIND(self);
7521 void *data = PyUnicode_DATA(self);
7522 Py_UCS4 maxchar = 0, ch, fixed;
7523 Py_ssize_t i;
7524
7525 for (i = 0; i < len; ++i) {
7526 ch = PyUnicode_READ(kind, data, i);
7527 fixed = 0;
7528 if (ch > 127) {
7529 if (Py_UNICODE_ISSPACE(ch))
7530 fixed = ' ';
7531 else {
7532 const int decimal = Py_UNICODE_TODECIMAL(ch);
7533 if (decimal >= 0)
7534 fixed = '0' + decimal;
7535 }
7536 if (fixed != 0) {
7537 if (fixed > maxchar)
7538 maxchar = fixed;
7539 PyUnicode_WRITE(kind, data, i, fixed);
7540 }
7541 else if (ch > maxchar)
7542 maxchar = ch;
7543 }
7544 else if (ch > maxchar)
7545 maxchar = ch;
7546 }
7547
7548 return maxchar;
7549}
7550
7551PyObject *
7552_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7553{
7554 if (!PyUnicode_Check(unicode)) {
7555 PyErr_BadInternalCall();
7556 return NULL;
7557 }
7558 if (PyUnicode_READY(unicode) == -1)
7559 return NULL;
7560 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7561 /* If the string is already ASCII, just return the same string */
7562 Py_INCREF(unicode);
7563 return unicode;
7564 }
7565 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7566}
7567
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007568PyObject *
7569PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7570 Py_ssize_t length)
7571{
7572 PyObject *result;
7573 Py_UNICODE *p; /* write pointer into result */
7574 Py_ssize_t i;
7575 /* Copy to a new string */
7576 result = (PyObject *)_PyUnicode_New(length);
7577 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7578 if (result == NULL)
7579 return result;
7580 p = PyUnicode_AS_UNICODE(result);
7581 /* Iterate over code points */
7582 for (i = 0; i < length; i++) {
7583 Py_UNICODE ch =s[i];
7584 if (ch > 127) {
7585 int decimal = Py_UNICODE_TODECIMAL(ch);
7586 if (decimal >= 0)
7587 p[i] = '0' + decimal;
7588 }
7589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007590 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7591 Py_DECREF(result);
7592 return NULL;
7593 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007594 return result;
7595}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007596/* --- Decimal Encoder ---------------------------------------------------- */
7597
Alexander Belopolsky40018472011-02-26 01:02:56 +00007598int
7599PyUnicode_EncodeDecimal(Py_UNICODE *s,
7600 Py_ssize_t length,
7601 char *output,
7602 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007603{
7604 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605 PyObject *errorHandler = NULL;
7606 PyObject *exc = NULL;
7607 const char *encoding = "decimal";
7608 const char *reason = "invalid decimal Unicode string";
7609 /* the following variable is used for caching string comparisons
7610 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7611 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007612
7613 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 PyErr_BadArgument();
7615 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007616 }
7617
7618 p = s;
7619 end = s + length;
7620 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 register Py_UNICODE ch = *p;
7622 int decimal;
7623 PyObject *repunicode;
7624 Py_ssize_t repsize;
7625 Py_ssize_t newpos;
7626 Py_UNICODE *uni2;
7627 Py_UNICODE *collstart;
7628 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007629
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 ++p;
7633 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 decimal = Py_UNICODE_TODECIMAL(ch);
7636 if (decimal >= 0) {
7637 *output++ = '0' + decimal;
7638 ++p;
7639 continue;
7640 }
7641 if (0 < ch && ch < 256) {
7642 *output++ = (char)ch;
7643 ++p;
7644 continue;
7645 }
7646 /* All other characters are considered unencodable */
7647 collstart = p;
7648 collend = p+1;
7649 while (collend < end) {
7650 if ((0 < *collend && *collend < 256) ||
7651 !Py_UNICODE_ISSPACE(*collend) ||
7652 Py_UNICODE_TODECIMAL(*collend))
7653 break;
7654 }
7655 /* cache callback name lookup
7656 * (if not done yet, i.e. it's the first error) */
7657 if (known_errorHandler==-1) {
7658 if ((errors==NULL) || (!strcmp(errors, "strict")))
7659 known_errorHandler = 1;
7660 else if (!strcmp(errors, "replace"))
7661 known_errorHandler = 2;
7662 else if (!strcmp(errors, "ignore"))
7663 known_errorHandler = 3;
7664 else if (!strcmp(errors, "xmlcharrefreplace"))
7665 known_errorHandler = 4;
7666 else
7667 known_errorHandler = 0;
7668 }
7669 switch (known_errorHandler) {
7670 case 1: /* strict */
7671 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7672 goto onError;
7673 case 2: /* replace */
7674 for (p = collstart; p < collend; ++p)
7675 *output++ = '?';
7676 /* fall through */
7677 case 3: /* ignore */
7678 p = collend;
7679 break;
7680 case 4: /* xmlcharrefreplace */
7681 /* generate replacement (temporarily (mis)uses p) */
7682 for (p = collstart; p < collend; ++p)
7683 output += sprintf(output, "&#%d;", (int)*p);
7684 p = collend;
7685 break;
7686 default:
7687 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7688 encoding, reason, s, length, &exc,
7689 collstart-s, collend-s, &newpos);
7690 if (repunicode == NULL)
7691 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007692 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007693 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007694 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7695 Py_DECREF(repunicode);
7696 goto onError;
7697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 /* generate replacement */
7699 repsize = PyUnicode_GET_SIZE(repunicode);
7700 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7701 Py_UNICODE ch = *uni2;
7702 if (Py_UNICODE_ISSPACE(ch))
7703 *output++ = ' ';
7704 else {
7705 decimal = Py_UNICODE_TODECIMAL(ch);
7706 if (decimal >= 0)
7707 *output++ = '0' + decimal;
7708 else if (0 < ch && ch < 256)
7709 *output++ = (char)ch;
7710 else {
7711 Py_DECREF(repunicode);
7712 raise_encode_exception(&exc, encoding,
7713 s, length, collstart-s, collend-s, reason);
7714 goto onError;
7715 }
7716 }
7717 }
7718 p = s + newpos;
7719 Py_DECREF(repunicode);
7720 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007721 }
7722 /* 0-terminate the output string */
7723 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007724 Py_XDECREF(exc);
7725 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007726 return 0;
7727
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007729 Py_XDECREF(exc);
7730 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007731 return -1;
7732}
7733
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734/* --- Helpers ------------------------------------------------------------ */
7735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736#include "stringlib/ucs1lib.h"
7737#include "stringlib/fastsearch.h"
7738#include "stringlib/partition.h"
7739#include "stringlib/split.h"
7740#include "stringlib/count.h"
7741#include "stringlib/find.h"
7742#include "stringlib/localeutil.h"
7743#include "stringlib/undef.h"
7744
7745#include "stringlib/ucs2lib.h"
7746#include "stringlib/fastsearch.h"
7747#include "stringlib/partition.h"
7748#include "stringlib/split.h"
7749#include "stringlib/count.h"
7750#include "stringlib/find.h"
7751#include "stringlib/localeutil.h"
7752#include "stringlib/undef.h"
7753
7754#include "stringlib/ucs4lib.h"
7755#include "stringlib/fastsearch.h"
7756#include "stringlib/partition.h"
7757#include "stringlib/split.h"
7758#include "stringlib/count.h"
7759#include "stringlib/find.h"
7760#include "stringlib/localeutil.h"
7761#include "stringlib/undef.h"
7762
7763static Py_ssize_t
7764any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7765 const Py_UCS1*, Py_ssize_t,
7766 Py_ssize_t, Py_ssize_t),
7767 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7768 const Py_UCS2*, Py_ssize_t,
7769 Py_ssize_t, Py_ssize_t),
7770 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7771 const Py_UCS4*, Py_ssize_t,
7772 Py_ssize_t, Py_ssize_t),
7773 PyObject* s1, PyObject* s2,
7774 Py_ssize_t start,
7775 Py_ssize_t end)
7776{
7777 int kind1, kind2, kind;
7778 void *buf1, *buf2;
7779 Py_ssize_t len1, len2, result;
7780
7781 kind1 = PyUnicode_KIND(s1);
7782 kind2 = PyUnicode_KIND(s2);
7783 kind = kind1 > kind2 ? kind1 : kind2;
7784 buf1 = PyUnicode_DATA(s1);
7785 buf2 = PyUnicode_DATA(s2);
7786 if (kind1 != kind)
7787 buf1 = _PyUnicode_AsKind(s1, kind);
7788 if (!buf1)
7789 return -2;
7790 if (kind2 != kind)
7791 buf2 = _PyUnicode_AsKind(s2, kind);
7792 if (!buf2) {
7793 if (kind1 != kind) PyMem_Free(buf1);
7794 return -2;
7795 }
7796 len1 = PyUnicode_GET_LENGTH(s1);
7797 len2 = PyUnicode_GET_LENGTH(s2);
7798
7799 switch(kind) {
7800 case PyUnicode_1BYTE_KIND:
7801 result = ucs1(buf1, len1, buf2, len2, start, end);
7802 break;
7803 case PyUnicode_2BYTE_KIND:
7804 result = ucs2(buf1, len1, buf2, len2, start, end);
7805 break;
7806 case PyUnicode_4BYTE_KIND:
7807 result = ucs4(buf1, len1, buf2, len2, start, end);
7808 break;
7809 default:
7810 assert(0); result = -2;
7811 }
7812
7813 if (kind1 != kind)
7814 PyMem_Free(buf1);
7815 if (kind2 != kind)
7816 PyMem_Free(buf2);
7817
7818 return result;
7819}
7820
7821Py_ssize_t
7822_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7823 Py_ssize_t n_buffer,
7824 void *digits, Py_ssize_t n_digits,
7825 Py_ssize_t min_width,
7826 const char *grouping,
7827 const char *thousands_sep)
7828{
7829 switch(kind) {
7830 case PyUnicode_1BYTE_KIND:
7831 return _PyUnicode_ucs1_InsertThousandsGrouping(
7832 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7833 min_width, grouping, thousands_sep);
7834 case PyUnicode_2BYTE_KIND:
7835 return _PyUnicode_ucs2_InsertThousandsGrouping(
7836 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7837 min_width, grouping, thousands_sep);
7838 case PyUnicode_4BYTE_KIND:
7839 return _PyUnicode_ucs4_InsertThousandsGrouping(
7840 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7841 min_width, grouping, thousands_sep);
7842 }
7843 assert(0);
7844 return -1;
7845}
7846
7847
Eric Smith8c663262007-08-25 02:26:07 +00007848#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007849#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007850
Thomas Wouters477c8d52006-05-27 19:21:47 +00007851#include "stringlib/count.h"
7852#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007853
/* helper macro to fixup start/end slice values
 *
 * `start` and `end` must be modifiable lvalues; `len` is the sequence
 * length.  `end` is clipped to `len`; a negative `start` or `end` is taken
 * relative to `len` and clipped to 0 if still negative.  A `start` larger
 * than `len` is left untouched.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else.  Invoke it
 * with a trailing semicolon.  `len` may be evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007868
Alexander Belopolsky40018472011-02-26 01:02:56 +00007869Py_ssize_t
7870PyUnicode_Count(PyObject *str,
7871 PyObject *substr,
7872 Py_ssize_t start,
7873 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007875 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007876 PyUnicodeObject* str_obj;
7877 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 int kind1, kind2, kind;
7879 void *buf1 = NULL, *buf2 = NULL;
7880 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007881
Thomas Wouters477c8d52006-05-27 19:21:47 +00007882 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007885 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 Py_DECREF(str_obj);
7888 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 }
Tim Petersced69f82003-09-16 20:30:58 +00007890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 kind1 = PyUnicode_KIND(str_obj);
7892 kind2 = PyUnicode_KIND(sub_obj);
7893 kind = kind1 > kind2 ? kind1 : kind2;
7894 buf1 = PyUnicode_DATA(str_obj);
7895 if (kind1 != kind)
7896 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7897 if (!buf1)
7898 goto onError;
7899 buf2 = PyUnicode_DATA(sub_obj);
7900 if (kind2 != kind)
7901 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7902 if (!buf2)
7903 goto onError;
7904 len1 = PyUnicode_GET_LENGTH(str_obj);
7905 len2 = PyUnicode_GET_LENGTH(sub_obj);
7906
7907 ADJUST_INDICES(start, end, len1);
7908 switch(kind) {
7909 case PyUnicode_1BYTE_KIND:
7910 result = ucs1lib_count(
7911 ((Py_UCS1*)buf1) + start, end - start,
7912 buf2, len2, PY_SSIZE_T_MAX
7913 );
7914 break;
7915 case PyUnicode_2BYTE_KIND:
7916 result = ucs2lib_count(
7917 ((Py_UCS2*)buf1) + start, end - start,
7918 buf2, len2, PY_SSIZE_T_MAX
7919 );
7920 break;
7921 case PyUnicode_4BYTE_KIND:
7922 result = ucs4lib_count(
7923 ((Py_UCS4*)buf1) + start, end - start,
7924 buf2, len2, PY_SSIZE_T_MAX
7925 );
7926 break;
7927 default:
7928 assert(0); result = 0;
7929 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007930
7931 Py_DECREF(sub_obj);
7932 Py_DECREF(str_obj);
7933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 if (kind1 != kind)
7935 PyMem_Free(buf1);
7936 if (kind2 != kind)
7937 PyMem_Free(buf2);
7938
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007940 onError:
7941 Py_DECREF(sub_obj);
7942 Py_DECREF(str_obj);
7943 if (kind1 != kind && buf1)
7944 PyMem_Free(buf1);
7945 if (kind2 != kind && buf2)
7946 PyMem_Free(buf2);
7947 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948}
7949
Alexander Belopolsky40018472011-02-26 01:02:56 +00007950Py_ssize_t
7951PyUnicode_Find(PyObject *str,
7952 PyObject *sub,
7953 Py_ssize_t start,
7954 Py_ssize_t end,
7955 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007958
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007960 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007962 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 Py_DECREF(str);
7965 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 }
Tim Petersced69f82003-09-16 20:30:58 +00007967
Thomas Wouters477c8d52006-05-27 19:21:47 +00007968 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 result = any_find_slice(
7970 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7971 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007972 );
7973 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 result = any_find_slice(
7975 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7976 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007977 );
7978
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007980 Py_DECREF(sub);
7981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 return result;
7983}
7984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985Py_ssize_t
7986PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7987 Py_ssize_t start, Py_ssize_t end,
7988 int direction)
7989{
7990 char *result;
7991 int kind;
7992 if (PyUnicode_READY(str) == -1)
7993 return -2;
7994 if (end > PyUnicode_GET_LENGTH(str))
7995 end = PyUnicode_GET_LENGTH(str);
7996 kind = PyUnicode_KIND(str);
7997 result = findchar(PyUnicode_1BYTE_DATA(str)
7998 + PyUnicode_KIND_SIZE(kind, start),
7999 kind,
8000 end-start, ch, direction);
8001 if (!result)
8002 return -1;
8003 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8004}
8005
Alexander Belopolsky40018472011-02-26 01:02:56 +00008006static int
8007tailmatch(PyUnicodeObject *self,
8008 PyUnicodeObject *substring,
8009 Py_ssize_t start,
8010 Py_ssize_t end,
8011 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 int kind_self;
8014 int kind_sub;
8015 void *data_self;
8016 void *data_sub;
8017 Py_ssize_t offset;
8018 Py_ssize_t i;
8019 Py_ssize_t end_sub;
8020
8021 if (PyUnicode_READY(self) == -1 ||
8022 PyUnicode_READY(substring) == -1)
8023 return 0;
8024
8025 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 return 1;
8027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008028 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8029 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008033 kind_self = PyUnicode_KIND(self);
8034 data_self = PyUnicode_DATA(self);
8035 kind_sub = PyUnicode_KIND(substring);
8036 data_sub = PyUnicode_DATA(substring);
8037 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8038
8039 if (direction > 0)
8040 offset = end;
8041 else
8042 offset = start;
8043
8044 if (PyUnicode_READ(kind_self, data_self, offset) ==
8045 PyUnicode_READ(kind_sub, data_sub, 0) &&
8046 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8047 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8048 /* If both are of the same kind, memcmp is sufficient */
8049 if (kind_self == kind_sub) {
8050 return ! memcmp((char *)data_self +
8051 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8052 data_sub,
8053 PyUnicode_GET_LENGTH(substring) *
8054 PyUnicode_CHARACTER_SIZE(substring));
8055 }
8056 /* otherwise we have to compare each character by first accesing it */
8057 else {
8058 /* We do not need to compare 0 and len(substring)-1 because
8059 the if statement above ensured already that they are equal
8060 when we end up here. */
8061 // TODO: honor direction and do a forward or backwards search
8062 for (i = 1; i < end_sub; ++i) {
8063 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8064 PyUnicode_READ(kind_sub, data_sub, i))
8065 return 0;
8066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 }
8070
8071 return 0;
8072}
8073
Alexander Belopolsky40018472011-02-26 01:02:56 +00008074Py_ssize_t
8075PyUnicode_Tailmatch(PyObject *str,
8076 PyObject *substr,
8077 Py_ssize_t start,
8078 Py_ssize_t end,
8079 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008081 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 str = PyUnicode_FromObject(str);
8084 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 substr = PyUnicode_FromObject(substr);
8087 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 Py_DECREF(str);
8089 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 }
Tim Petersced69f82003-09-16 20:30:58 +00008091
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 (PyUnicodeObject *)substr,
8094 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 Py_DECREF(str);
8096 Py_DECREF(substr);
8097 return result;
8098}
8099
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100/* Apply fixfct filter to the Unicode object self and return a
8101 reference to the modified object */
8102
Alexander Belopolsky40018472011-02-26 01:02:56 +00008103static PyObject *
8104fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008107 PyObject *u;
8108 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 if (PyUnicode_READY(self) == -1)
8111 return NULL;
8112 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8113 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8114 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8119 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 /* fix functions return the new maximum character in a string,
8122 if the kind of the resulting unicode object does not change,
8123 everything is fine. Otherwise we need to change the string kind
8124 and re-run the fix function. */
8125 maxchar_new = fixfct((PyUnicodeObject*)u);
8126 if (maxchar_new == 0)
8127 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8128 else if (maxchar_new <= 127)
8129 maxchar_new = 127;
8130 else if (maxchar_new <= 255)
8131 maxchar_new = 255;
8132 else if (maxchar_new <= 65535)
8133 maxchar_new = 65535;
8134 else
8135 maxchar_new = 1114111; /* 0x10ffff */
8136
8137 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 /* fixfct should return TRUE if it modified the buffer. If
8139 FALSE, return a reference to the original buffer instead
8140 (to save space, not time) */
8141 Py_INCREF(self);
8142 Py_DECREF(u);
8143 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145 else if (maxchar_new == maxchar_old) {
8146 return u;
8147 }
8148 else {
8149 /* In case the maximum character changed, we need to
8150 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008151 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 if (v == NULL) {
8153 Py_DECREF(u);
8154 return NULL;
8155 }
8156 if (maxchar_new > maxchar_old) {
8157 /* If the maxchar increased so that the kind changed, not all
8158 characters are representable anymore and we need to fix the
8159 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008160 if (PyUnicode_CopyCharacters(v, 0,
8161 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008162 PyUnicode_GET_LENGTH(self)) < 0)
8163 {
8164 Py_DECREF(u);
8165 return NULL;
8166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 maxchar_old = fixfct((PyUnicodeObject*)v);
8168 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8169 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008170 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008171 if (PyUnicode_CopyCharacters(v, 0,
8172 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008173 PyUnicode_GET_LENGTH(self)) < 0)
8174 {
8175 Py_DECREF(u);
8176 return NULL;
8177 }
8178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008179
8180 Py_DECREF(u);
8181 return v;
8182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183}
8184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008186fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 /* No need to call PyUnicode_READY(self) because this function is only
8189 called as a callback from fixup() which does it already. */
8190 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8191 const int kind = PyUnicode_KIND(self);
8192 void *data = PyUnicode_DATA(self);
8193 int touched = 0;
8194 Py_UCS4 maxchar = 0;
8195 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 for (i = 0; i < len; ++i) {
8198 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8199 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8200 if (up != ch) {
8201 if (up > maxchar)
8202 maxchar = up;
8203 PyUnicode_WRITE(kind, data, i, up);
8204 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206 else if (ch > maxchar)
8207 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 }
8209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008210 if (touched)
8211 return maxchar;
8212 else
8213 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214}
8215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008217fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8220 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8221 const int kind = PyUnicode_KIND(self);
8222 void *data = PyUnicode_DATA(self);
8223 int touched = 0;
8224 Py_UCS4 maxchar = 0;
8225 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 for(i = 0; i < len; ++i) {
8228 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8229 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8230 if (lo != ch) {
8231 if (lo > maxchar)
8232 maxchar = lo;
8233 PyUnicode_WRITE(kind, data, i, lo);
8234 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 else if (ch > maxchar)
8237 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 }
8239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 if (touched)
8241 return maxchar;
8242 else
8243 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244}
8245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8250 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8251 const int kind = PyUnicode_KIND(self);
8252 void *data = PyUnicode_DATA(self);
8253 int touched = 0;
8254 Py_UCS4 maxchar = 0;
8255 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 for(i = 0; i < len; ++i) {
8258 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8259 Py_UCS4 nu = 0;
8260
8261 if (Py_UNICODE_ISUPPER(ch))
8262 nu = Py_UNICODE_TOLOWER(ch);
8263 else if (Py_UNICODE_ISLOWER(ch))
8264 nu = Py_UNICODE_TOUPPER(ch);
8265
8266 if (nu != 0) {
8267 if (nu > maxchar)
8268 maxchar = nu;
8269 PyUnicode_WRITE(kind, data, i, nu);
8270 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 else if (ch > maxchar)
8273 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
8275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 if (touched)
8277 return maxchar;
8278 else
8279 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280}
8281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008283fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8286 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8287 const int kind = PyUnicode_KIND(self);
8288 void *data = PyUnicode_DATA(self);
8289 int touched = 0;
8290 Py_UCS4 maxchar = 0;
8291 Py_ssize_t i = 0;
8292 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008293
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008294 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296
8297 ch = PyUnicode_READ(kind, data, i);
8298 if (!Py_UNICODE_ISUPPER(ch)) {
8299 maxchar = Py_UNICODE_TOUPPER(ch);
8300 PyUnicode_WRITE(kind, data, i, maxchar);
8301 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303 ++i;
8304 for(; i < len; ++i) {
8305 ch = PyUnicode_READ(kind, data, i);
8306 if (!Py_UNICODE_ISLOWER(ch)) {
8307 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8308 if (lo > maxchar)
8309 maxchar = lo;
8310 PyUnicode_WRITE(kind, data, i, lo);
8311 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 else if (ch > maxchar)
8314 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316
8317 if (touched)
8318 return maxchar;
8319 else
8320 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321}
8322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8327 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8328 const int kind = PyUnicode_KIND(self);
8329 void *data = PyUnicode_DATA(self);
8330 Py_UCS4 maxchar = 0;
8331 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 int previous_is_cased;
8333
8334 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 if (len == 1) {
8336 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8337 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8338 if (ti != ch) {
8339 PyUnicode_WRITE(kind, data, i, ti);
8340 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 }
8342 else
8343 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 for(; i < len; ++i) {
8347 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8348 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008349
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 nu = Py_UNICODE_TOTITLE(ch);
8354
8355 if (nu > maxchar)
8356 maxchar = nu;
8357 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008358
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 if (Py_UNICODE_ISLOWER(ch) ||
8360 Py_UNICODE_ISUPPER(ch) ||
8361 Py_UNICODE_ISTITLE(ch))
8362 previous_is_cased = 1;
8363 else
8364 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Tim Peters8ce9f162004-08-27 01:49:32 +00008369PyObject *
8370PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008373 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008375 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008376 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8377 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008378 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 Py_ssize_t sz, i, res_offset;
8380 Py_UCS4 maxchar = 0;
8381 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382
Tim Peters05eba1f2004-08-27 21:32:02 +00008383 fseq = PySequence_Fast(seq, "");
8384 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008386 }
8387
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008388 /* NOTE: the following code can't call back into Python code,
8389 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008390 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008391
Tim Peters05eba1f2004-08-27 21:32:02 +00008392 seqlen = PySequence_Fast_GET_SIZE(fseq);
8393 /* If empty sequence, return u"". */
8394 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008397 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008398 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008399 /* If singleton sequence with an exact Unicode, return that. */
8400 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 item = items[0];
8402 if (PyUnicode_CheckExact(item)) {
8403 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 goto Done;
8406 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008407 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008408 else {
8409 /* Set up sep and seplen */
8410 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 /* fall back to a blank space separator */
8412 sep = PyUnicode_FromOrdinal(' ');
8413 if (!sep || PyUnicode_READY(sep) == -1)
8414 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008415 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008416 else {
8417 if (!PyUnicode_Check(separator)) {
8418 PyErr_Format(PyExc_TypeError,
8419 "separator: expected str instance,"
8420 " %.80s found",
8421 Py_TYPE(separator)->tp_name);
8422 goto onError;
8423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 if (PyUnicode_READY(separator) == -1)
8425 goto onError;
8426 sep = separator;
8427 seplen = PyUnicode_GET_LENGTH(separator);
8428 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8429 /* inc refcount to keep this code path symetric with the
8430 above case of a blank separator */
8431 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008432 }
8433 }
8434
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008435 /* There are at least two things to join, or else we have a subclass
8436 * of str in the sequence.
8437 * Do a pre-pass to figure out the total amount of space we'll
8438 * need (sz), and see whether all argument are strings.
8439 */
8440 sz = 0;
8441 for (i = 0; i < seqlen; i++) {
8442 const Py_ssize_t old_sz = sz;
8443 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 if (!PyUnicode_Check(item)) {
8445 PyErr_Format(PyExc_TypeError,
8446 "sequence item %zd: expected str instance,"
8447 " %.80s found",
8448 i, Py_TYPE(item)->tp_name);
8449 goto onError;
8450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 if (PyUnicode_READY(item) == -1)
8452 goto onError;
8453 sz += PyUnicode_GET_LENGTH(item);
8454 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8455 if (item_maxchar > maxchar)
8456 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008457 if (i != 0)
8458 sz += seplen;
8459 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8460 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008462 goto onError;
8463 }
8464 }
Tim Petersced69f82003-09-16 20:30:58 +00008465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008467 if (res == NULL)
8468 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008469
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008470 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008472 Py_ssize_t itemlen;
8473 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 /* Copy item, and maybe the separator. */
8476 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008477 if (PyUnicode_CopyCharacters(res, res_offset,
8478 sep, 0, seplen) < 0)
8479 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008482 if (PyUnicode_CopyCharacters(res, res_offset,
8483 item, 0, itemlen) < 0)
8484 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008488
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008490 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 Py_XDECREF(sep);
8492 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008495 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008497 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 return NULL;
8499}
8500
/* Store `length` copies of `value` into the character buffer `data`,
   starting at character index `start`.  `kind` selects the element width:
   PyUnicode_1BYTE_KIND uses memset() on bytes, PyUnicode_2BYTE_KIND writes
   Py_UCS2 elements and any other kind writes Py_UCS4 elements.
   PyUnicode_WCHAR_KIND is rejected by an assertion. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t count_ = (length); \
        Py_ssize_t pos_; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        if ((kind) == PyUnicode_1BYTE_KIND) { \
            memset((unsigned char *)(data) + (start), \
                   (unsigned char)(value), count_); \
        } \
        else if ((kind) == PyUnicode_2BYTE_KIND) { \
            Py_UCS2 *dst_ = (Py_UCS2 *)(data) + (start); \
            for (pos_ = 0; pos_ < count_; ++pos_) \
                dst_[pos_] = (value); \
        } \
        else { \
            Py_UCS4 *dst_ = (Py_UCS4 *)(data) + (start); \
            for (pos_ = 0; pos_ < count_; ++pos_) \
                dst_[pos_] = (value); \
        } \
    } while (0)
8523
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static PyUnicodeObject *
8525pad(PyUnicodeObject *self,
8526 Py_ssize_t left,
8527 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 PyObject *u;
8531 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008532 int kind;
8533 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
8535 if (left < 0)
8536 left = 0;
8537 if (right < 0)
8538 right = 0;
8539
Tim Peters7a29bd52001-09-12 03:03:31 +00008540 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 Py_INCREF(self);
8542 return self;
8543 }
8544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8546 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008547 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8548 return NULL;
8549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8551 if (fill > maxchar)
8552 maxchar = fill;
8553 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008554 if (!u)
8555 return NULL;
8556
8557 kind = PyUnicode_KIND(u);
8558 data = PyUnicode_DATA(u);
8559 if (left)
8560 FILL(kind, data, fill, 0, left);
8561 if (right)
8562 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008563 if (PyUnicode_CopyCharacters(u, left,
8564 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008565 _PyUnicode_LENGTH(self)) < 0)
8566 {
8567 Py_DECREF(u);
8568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 }
8570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575PyObject *
8576PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
8580 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 switch(PyUnicode_KIND(string)) {
8585 case PyUnicode_1BYTE_KIND:
8586 list = ucs1lib_splitlines(
8587 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8588 PyUnicode_GET_LENGTH(string), keepends);
8589 break;
8590 case PyUnicode_2BYTE_KIND:
8591 list = ucs2lib_splitlines(
8592 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8593 PyUnicode_GET_LENGTH(string), keepends);
8594 break;
8595 case PyUnicode_4BYTE_KIND:
8596 list = ucs4lib_splitlines(
8597 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8598 PyUnicode_GET_LENGTH(string), keepends);
8599 break;
8600 default:
8601 assert(0);
8602 list = 0;
8603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 Py_DECREF(string);
8605 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608static PyObject *
8609split(PyUnicodeObject *self,
8610 PyUnicodeObject *substring,
8611 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 int kind1, kind2, kind;
8614 void *buf1, *buf2;
8615 Py_ssize_t len1, len2;
8616 PyObject* out;
8617
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008619 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 if (PyUnicode_READY(self) == -1)
8622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 if (substring == NULL)
8625 switch(PyUnicode_KIND(self)) {
8626 case PyUnicode_1BYTE_KIND:
8627 return ucs1lib_split_whitespace(
8628 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8629 PyUnicode_GET_LENGTH(self), maxcount
8630 );
8631 case PyUnicode_2BYTE_KIND:
8632 return ucs2lib_split_whitespace(
8633 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8634 PyUnicode_GET_LENGTH(self), maxcount
8635 );
8636 case PyUnicode_4BYTE_KIND:
8637 return ucs4lib_split_whitespace(
8638 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8639 PyUnicode_GET_LENGTH(self), maxcount
8640 );
8641 default:
8642 assert(0);
8643 return NULL;
8644 }
8645
8646 if (PyUnicode_READY(substring) == -1)
8647 return NULL;
8648
8649 kind1 = PyUnicode_KIND(self);
8650 kind2 = PyUnicode_KIND(substring);
8651 kind = kind1 > kind2 ? kind1 : kind2;
8652 buf1 = PyUnicode_DATA(self);
8653 buf2 = PyUnicode_DATA(substring);
8654 if (kind1 != kind)
8655 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8656 if (!buf1)
8657 return NULL;
8658 if (kind2 != kind)
8659 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8660 if (!buf2) {
8661 if (kind1 != kind) PyMem_Free(buf1);
8662 return NULL;
8663 }
8664 len1 = PyUnicode_GET_LENGTH(self);
8665 len2 = PyUnicode_GET_LENGTH(substring);
8666
8667 switch(kind) {
8668 case PyUnicode_1BYTE_KIND:
8669 out = ucs1lib_split(
8670 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8671 break;
8672 case PyUnicode_2BYTE_KIND:
8673 out = ucs2lib_split(
8674 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8675 break;
8676 case PyUnicode_4BYTE_KIND:
8677 out = ucs4lib_split(
8678 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8679 break;
8680 default:
8681 out = NULL;
8682 }
8683 if (kind1 != kind)
8684 PyMem_Free(buf1);
8685 if (kind2 != kind)
8686 PyMem_Free(buf2);
8687 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688}
8689
Alexander Belopolsky40018472011-02-26 01:02:56 +00008690static PyObject *
8691rsplit(PyUnicodeObject *self,
8692 PyUnicodeObject *substring,
8693 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 int kind1, kind2, kind;
8696 void *buf1, *buf2;
8697 Py_ssize_t len1, len2;
8698 PyObject* out;
8699
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008700 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008701 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 if (PyUnicode_READY(self) == -1)
8704 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 if (substring == NULL)
8707 switch(PyUnicode_KIND(self)) {
8708 case PyUnicode_1BYTE_KIND:
8709 return ucs1lib_rsplit_whitespace(
8710 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8711 PyUnicode_GET_LENGTH(self), maxcount
8712 );
8713 case PyUnicode_2BYTE_KIND:
8714 return ucs2lib_rsplit_whitespace(
8715 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8716 PyUnicode_GET_LENGTH(self), maxcount
8717 );
8718 case PyUnicode_4BYTE_KIND:
8719 return ucs4lib_rsplit_whitespace(
8720 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8721 PyUnicode_GET_LENGTH(self), maxcount
8722 );
8723 default:
8724 assert(0);
8725 return NULL;
8726 }
8727
8728 if (PyUnicode_READY(substring) == -1)
8729 return NULL;
8730
8731 kind1 = PyUnicode_KIND(self);
8732 kind2 = PyUnicode_KIND(substring);
8733 kind = kind1 > kind2 ? kind1 : kind2;
8734 buf1 = PyUnicode_DATA(self);
8735 buf2 = PyUnicode_DATA(substring);
8736 if (kind1 != kind)
8737 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8738 if (!buf1)
8739 return NULL;
8740 if (kind2 != kind)
8741 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8742 if (!buf2) {
8743 if (kind1 != kind) PyMem_Free(buf1);
8744 return NULL;
8745 }
8746 len1 = PyUnicode_GET_LENGTH(self);
8747 len2 = PyUnicode_GET_LENGTH(substring);
8748
8749 switch(kind) {
8750 case PyUnicode_1BYTE_KIND:
8751 out = ucs1lib_rsplit(
8752 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8753 break;
8754 case PyUnicode_2BYTE_KIND:
8755 out = ucs2lib_rsplit(
8756 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8757 break;
8758 case PyUnicode_4BYTE_KIND:
8759 out = ucs4lib_rsplit(
8760 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8761 break;
8762 default:
8763 out = NULL;
8764 }
8765 if (kind1 != kind)
8766 PyMem_Free(buf1);
8767 if (kind2 != kind)
8768 PyMem_Free(buf2);
8769 return out;
8770}
8771
8772static Py_ssize_t
8773anylib_find(int kind, void *buf1, Py_ssize_t len1,
8774 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8775{
8776 switch(kind) {
8777 case PyUnicode_1BYTE_KIND:
8778 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8779 case PyUnicode_2BYTE_KIND:
8780 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8781 case PyUnicode_4BYTE_KIND:
8782 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8783 }
8784 assert(0);
8785 return -1;
8786}
8787
8788static Py_ssize_t
8789anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8790 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8791{
8792 switch(kind) {
8793 case PyUnicode_1BYTE_KIND:
8794 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8795 case PyUnicode_2BYTE_KIND:
8796 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8797 case PyUnicode_4BYTE_KIND:
8798 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8799 }
8800 assert(0);
8801 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008802}
8803
Alexander Belopolsky40018472011-02-26 01:02:56 +00008804static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805replace(PyObject *self, PyObject *str1,
8806 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 PyObject *u;
8809 char *sbuf = PyUnicode_DATA(self);
8810 char *buf1 = PyUnicode_DATA(str1);
8811 char *buf2 = PyUnicode_DATA(str2);
8812 int srelease = 0, release1 = 0, release2 = 0;
8813 int skind = PyUnicode_KIND(self);
8814 int kind1 = PyUnicode_KIND(str1);
8815 int kind2 = PyUnicode_KIND(str2);
8816 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8817 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8818 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819
8820 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008823 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 if (skind < kind1)
8826 /* substring too wide to be present */
8827 goto nothing;
8828
8829 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008830 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008831 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008833 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008835 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 Py_UCS4 u1, u2, maxchar;
8837 int mayshrink, rkind;
8838 u1 = PyUnicode_READ_CHAR(str1, 0);
8839 if (!findchar(sbuf, PyUnicode_KIND(self),
8840 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008841 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 u2 = PyUnicode_READ_CHAR(str2, 0);
8843 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8844 /* Replacing u1 with u2 may cause a maxchar reduction in the
8845 result string. */
8846 mayshrink = maxchar > 127;
8847 if (u2 > maxchar) {
8848 maxchar = u2;
8849 mayshrink = 0;
8850 }
8851 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008852 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008854 if (PyUnicode_CopyCharacters(u, 0,
8855 (PyObject*)self, 0, slen) < 0)
8856 {
8857 Py_DECREF(u);
8858 return NULL;
8859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 rkind = PyUnicode_KIND(u);
8861 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8862 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008863 if (--maxcount < 0)
8864 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 if (mayshrink) {
8868 PyObject *tmp = u;
8869 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8870 PyUnicode_GET_LENGTH(tmp));
8871 Py_DECREF(tmp);
8872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 int rkind = skind;
8875 char *res;
8876 if (kind1 < rkind) {
8877 /* widen substring */
8878 buf1 = _PyUnicode_AsKind(str1, rkind);
8879 if (!buf1) goto error;
8880 release1 = 1;
8881 }
8882 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008883 if (i < 0)
8884 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 if (rkind > kind2) {
8886 /* widen replacement */
8887 buf2 = _PyUnicode_AsKind(str2, rkind);
8888 if (!buf2) goto error;
8889 release2 = 1;
8890 }
8891 else if (rkind < kind2) {
8892 /* widen self and buf1 */
8893 rkind = kind2;
8894 if (release1) PyMem_Free(buf1);
8895 sbuf = _PyUnicode_AsKind(self, rkind);
8896 if (!sbuf) goto error;
8897 srelease = 1;
8898 buf1 = _PyUnicode_AsKind(str1, rkind);
8899 if (!buf1) goto error;
8900 release1 = 1;
8901 }
8902 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8903 if (!res) {
8904 PyErr_NoMemory();
8905 goto error;
8906 }
8907 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008908 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8910 buf2,
8911 PyUnicode_KIND_SIZE(rkind, len2));
8912 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008913
8914 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8916 slen-i,
8917 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008918 if (i == -1)
8919 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8921 buf2,
8922 PyUnicode_KIND_SIZE(rkind, len2));
8923 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925
8926 u = PyUnicode_FromKindAndData(rkind, res, slen);
8927 PyMem_Free(res);
8928 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 Py_ssize_t n, i, j, ires;
8933 Py_ssize_t product, new_size;
8934 int rkind = skind;
8935 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 if (kind1 < rkind) {
8938 buf1 = _PyUnicode_AsKind(str1, rkind);
8939 if (!buf1) goto error;
8940 release1 = 1;
8941 }
8942 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008943 if (n == 0)
8944 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 if (kind2 < rkind) {
8946 buf2 = _PyUnicode_AsKind(str2, rkind);
8947 if (!buf2) goto error;
8948 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 else if (kind2 > rkind) {
8951 rkind = kind2;
8952 sbuf = _PyUnicode_AsKind(self, rkind);
8953 if (!sbuf) goto error;
8954 srelease = 1;
8955 if (release1) PyMem_Free(buf1);
8956 buf1 = _PyUnicode_AsKind(str1, rkind);
8957 if (!buf1) goto error;
8958 release1 = 1;
8959 }
8960 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8961 PyUnicode_GET_LENGTH(str1))); */
8962 product = n * (len2-len1);
8963 if ((product / (len2-len1)) != n) {
8964 PyErr_SetString(PyExc_OverflowError,
8965 "replace string is too long");
8966 goto error;
8967 }
8968 new_size = slen + product;
8969 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8970 PyErr_SetString(PyExc_OverflowError,
8971 "replace string is too long");
8972 goto error;
8973 }
8974 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8975 if (!res)
8976 goto error;
8977 ires = i = 0;
8978 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008979 while (n-- > 0) {
8980 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 j = anylib_find(rkind,
8982 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8983 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008984 if (j == -1)
8985 break;
8986 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008987 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8989 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8990 PyUnicode_KIND_SIZE(rkind, j-i));
8991 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008992 }
8993 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (len2 > 0) {
8995 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8996 buf2,
8997 PyUnicode_KIND_SIZE(rkind, len2));
8998 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009003 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9005 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9006 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009007 } else {
9008 /* interleave */
9009 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9011 buf2,
9012 PyUnicode_KIND_SIZE(rkind, len2));
9013 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009014 if (--n <= 0)
9015 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9017 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9018 PyUnicode_KIND_SIZE(rkind, 1));
9019 ires++;
9020 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9023 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9024 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 if (srelease)
9029 PyMem_FREE(sbuf);
9030 if (release1)
9031 PyMem_FREE(buf1);
9032 if (release2)
9033 PyMem_FREE(buf2);
9034 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009035
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009037 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 if (srelease)
9039 PyMem_FREE(sbuf);
9040 if (release1)
9041 PyMem_FREE(buf1);
9042 if (release2)
9043 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009044 if (PyUnicode_CheckExact(self)) {
9045 Py_INCREF(self);
9046 return (PyObject *) self;
9047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9049 PyUnicode_DATA(self),
9050 PyUnicode_GET_LENGTH(self));
9051 error:
9052 if (srelease && sbuf)
9053 PyMem_FREE(sbuf);
9054 if (release1 && buf1)
9055 PyMem_FREE(buf1);
9056 if (release2 && buf2)
9057 PyMem_FREE(buf2);
9058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059}
9060
9061/* --- Unicode Object Methods --------------------------------------------- */
9062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009063PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065\n\
9066Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009067characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068
9069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009070unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072 return fixup(self, fixtitle);
9073}
9074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009075PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077\n\
9078Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009079have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080
9081static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009082unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 return fixup(self, fixcapitalize);
9085}
9086
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self into a list of words, replace each word by
   its fixup(..., fixcapitalize) result, and join the list back into a
   single string.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9124
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009125/* Argument converter. Coerces to a single unicode character */
9126
9127static int
9128convert_uc(PyObject *obj, void *addr)
9129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009131 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009132
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 uniobj = PyUnicode_FromObject(obj);
9134 if (uniobj == NULL) {
9135 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009137 return 0;
9138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009140 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009142 Py_DECREF(uniobj);
9143 return 0;
9144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 if (PyUnicode_READY(uniobj)) {
9146 Py_DECREF(uniobj);
9147 return 0;
9148 }
9149 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009150 Py_DECREF(uniobj);
9151 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009152}
9153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009154PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009157Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009158done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159
9160static PyObject *
9161unicode_center(PyUnicodeObject *self, PyObject *args)
9162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009163 Py_ssize_t marg, left;
9164 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 Py_UCS4 fillchar = ' ';
9166
9167 if (PyUnicode_READY(self) == -1)
9168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169
Thomas Woutersde017742006-02-16 19:34:37 +00009170 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 return NULL;
9172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 Py_INCREF(self);
9175 return (PyObject*) self;
9176 }
9177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 left = marg / 2 + (marg & width & 1);
9180
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009181 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182}
9183
Marc-André Lemburge5034372000-08-08 08:04:29 +00009184#if 0
9185
9186/* This code should go into some future Unicode collation support
9187 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009188 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009189
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009190/* speedy UTF-16 code point order comparison */
9191/* gleaned from: */
9192/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9193
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009194static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009195{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009196 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009197 0, 0, 0, 0, 0, 0, 0, 0,
9198 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009199 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009200};
9201
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202static int
9203unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9204{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009205 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009206
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 Py_UNICODE *s1 = str1->str;
9208 Py_UNICODE *s2 = str2->str;
9209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 len1 = str1->_base._base.length;
9211 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009214 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009215
9216 c1 = *s1++;
9217 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009218
Benjamin Peterson29060642009-01-31 22:14:21 +00009219 if (c1 > (1<<11) * 26)
9220 c1 += utf16Fixup[c1>>11];
9221 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009222 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009223 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009224
9225 if (c1 != c2)
9226 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009227
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009228 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 }
9230
9231 return (len1 < len2) ? -1 : (len1 != len2);
9232}
9233
Marc-André Lemburge5034372000-08-08 08:04:29 +00009234#else
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236/* This function assumes that str1 and str2 are readied by the caller. */
9237
Marc-André Lemburge5034372000-08-08 08:04:29 +00009238static int
9239unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 int kind1, kind2;
9242 void *data1, *data2;
9243 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 kind1 = PyUnicode_KIND(str1);
9246 kind2 = PyUnicode_KIND(str2);
9247 data1 = PyUnicode_DATA(str1);
9248 data2 = PyUnicode_DATA(str2);
9249 len1 = PyUnicode_GET_LENGTH(str1);
9250 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 for (i = 0; i < len1 && i < len2; ++i) {
9253 Py_UCS4 c1, c2;
9254 c1 = PyUnicode_READ(kind1, data1, i);
9255 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009256
9257 if (c1 != c2)
9258 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009259 }
9260
9261 return (len1 < len2) ? -1 : (len1 != len2);
9262}
9263
9264#endif
9265
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266int
9267PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9270 if (PyUnicode_READY(left) == -1 ||
9271 PyUnicode_READY(right) == -1)
9272 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009273 return unicode_compare((PyUnicodeObject *)left,
9274 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009276 PyErr_Format(PyExc_TypeError,
9277 "Can't compare %.100s and %.100s",
9278 left->ob_type->tp_name,
9279 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 return -1;
9281}
9282
Martin v. Löwis5b222132007-06-10 09:51:05 +00009283int
9284PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 Py_ssize_t i;
9287 int kind;
9288 void *data;
9289 Py_UCS4 chr;
9290
Martin v. Löwis5b222132007-06-10 09:51:05 +00009291 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 if (PyUnicode_READY(uni) == -1)
9293 return -1;
9294 kind = PyUnicode_KIND(uni);
9295 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009296 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9298 if (chr != str[i])
9299 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009300 /* This check keeps Python strings that end in '\0' from comparing equal
9301 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009304 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009306 return 0;
9307}
9308
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009309
Benjamin Peterson29060642009-01-31 22:14:21 +00009310#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009311 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009312
Alexander Belopolsky40018472011-02-26 01:02:56 +00009313PyObject *
9314PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009315{
9316 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009317
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009318 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9319 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (PyUnicode_READY(left) == -1 ||
9321 PyUnicode_READY(right) == -1)
9322 return NULL;
9323 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9324 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009325 if (op == Py_EQ) {
9326 Py_INCREF(Py_False);
9327 return Py_False;
9328 }
9329 if (op == Py_NE) {
9330 Py_INCREF(Py_True);
9331 return Py_True;
9332 }
9333 }
9334 if (left == right)
9335 result = 0;
9336 else
9337 result = unicode_compare((PyUnicodeObject *)left,
9338 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009339
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009340 /* Convert the return value to a Boolean */
9341 switch (op) {
9342 case Py_EQ:
9343 v = TEST_COND(result == 0);
9344 break;
9345 case Py_NE:
9346 v = TEST_COND(result != 0);
9347 break;
9348 case Py_LE:
9349 v = TEST_COND(result <= 0);
9350 break;
9351 case Py_GE:
9352 v = TEST_COND(result >= 0);
9353 break;
9354 case Py_LT:
9355 v = TEST_COND(result == -1);
9356 break;
9357 case Py_GT:
9358 v = TEST_COND(result == 1);
9359 break;
9360 default:
9361 PyErr_BadArgument();
9362 return NULL;
9363 }
9364 Py_INCREF(v);
9365 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009366 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009367
Brian Curtindfc80e32011-08-10 20:28:54 -05009368 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009369}
9370
Alexander Belopolsky40018472011-02-26 01:02:56 +00009371int
9372PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009373{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009374 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 int kind1, kind2, kind;
9376 void *buf1, *buf2;
9377 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009378 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009379
9380 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009381 sub = PyUnicode_FromObject(element);
9382 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 PyErr_Format(PyExc_TypeError,
9384 "'in <string>' requires string as left operand, not %s",
9385 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009386 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 if (PyUnicode_READY(sub) == -1)
9389 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009390
Thomas Wouters477c8d52006-05-27 19:21:47 +00009391 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 Py_DECREF(sub);
9394 return -1;
9395 }
9396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 kind1 = PyUnicode_KIND(str);
9398 kind2 = PyUnicode_KIND(sub);
9399 kind = kind1 > kind2 ? kind1 : kind2;
9400 buf1 = PyUnicode_DATA(str);
9401 buf2 = PyUnicode_DATA(sub);
9402 if (kind1 != kind)
9403 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9404 if (!buf1) {
9405 Py_DECREF(sub);
9406 return -1;
9407 }
9408 if (kind2 != kind)
9409 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9410 if (!buf2) {
9411 Py_DECREF(sub);
9412 if (kind1 != kind) PyMem_Free(buf1);
9413 return -1;
9414 }
9415 len1 = PyUnicode_GET_LENGTH(str);
9416 len2 = PyUnicode_GET_LENGTH(sub);
9417
9418 switch(kind) {
9419 case PyUnicode_1BYTE_KIND:
9420 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9421 break;
9422 case PyUnicode_2BYTE_KIND:
9423 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9424 break;
9425 case PyUnicode_4BYTE_KIND:
9426 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9427 break;
9428 default:
9429 result = -1;
9430 assert(0);
9431 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432
9433 Py_DECREF(str);
9434 Py_DECREF(sub);
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (kind1 != kind)
9437 PyMem_Free(buf1);
9438 if (kind2 != kind)
9439 PyMem_Free(buf2);
9440
Guido van Rossum403d68b2000-03-13 15:55:09 +00009441 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009442}
9443
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444/* Concat to string or Unicode object giving a new Unicode object. */
9445
Alexander Belopolsky40018472011-02-26 01:02:56 +00009446PyObject *
9447PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 PyObject *u = NULL, *v = NULL, *w;
9450 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
9452 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459
9460 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 }
9469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9471 goto onError;
9472
9473 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009474 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 w = PyUnicode_New(
9478 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9479 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009482 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9483 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009484 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009485 v, 0,
9486 PyUnicode_GET_LENGTH(v)) < 0)
9487 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 Py_DECREF(u);
9489 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 Py_XDECREF(u);
9494 Py_XDECREF(v);
9495 return NULL;
9496}
9497
Walter Dörwald1ab83302007-05-18 17:15:44 +00009498void
9499PyUnicode_Append(PyObject **pleft, PyObject *right)
9500{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009501 PyObject *new;
9502 if (*pleft == NULL)
9503 return;
9504 if (right == NULL || !PyUnicode_Check(*pleft)) {
9505 Py_DECREF(*pleft);
9506 *pleft = NULL;
9507 return;
9508 }
9509 new = PyUnicode_Concat(*pleft, right);
9510 Py_DECREF(*pleft);
9511 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009512}
9513
9514void
9515PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9516{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 PyUnicode_Append(pleft, right);
9518 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009519}
9520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009521PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009524Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009525string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009526interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527
9528static PyObject *
9529unicode_count(PyUnicodeObject *self, PyObject *args)
9530{
9531 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009532 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009533 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 int kind1, kind2, kind;
9536 void *buf1, *buf2;
9537 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538
Jesus Ceaac451502011-04-20 17:09:23 +02009539 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9540 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 kind1 = PyUnicode_KIND(self);
9544 kind2 = PyUnicode_KIND(substring);
9545 kind = kind1 > kind2 ? kind1 : kind2;
9546 buf1 = PyUnicode_DATA(self);
9547 buf2 = PyUnicode_DATA(substring);
9548 if (kind1 != kind)
9549 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9550 if (!buf1) {
9551 Py_DECREF(substring);
9552 return NULL;
9553 }
9554 if (kind2 != kind)
9555 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9556 if (!buf2) {
9557 Py_DECREF(substring);
9558 if (kind1 != kind) PyMem_Free(buf1);
9559 return NULL;
9560 }
9561 len1 = PyUnicode_GET_LENGTH(self);
9562 len2 = PyUnicode_GET_LENGTH(substring);
9563
9564 ADJUST_INDICES(start, end, len1);
9565 switch(kind) {
9566 case PyUnicode_1BYTE_KIND:
9567 iresult = ucs1lib_count(
9568 ((Py_UCS1*)buf1) + start, end - start,
9569 buf2, len2, PY_SSIZE_T_MAX
9570 );
9571 break;
9572 case PyUnicode_2BYTE_KIND:
9573 iresult = ucs2lib_count(
9574 ((Py_UCS2*)buf1) + start, end - start,
9575 buf2, len2, PY_SSIZE_T_MAX
9576 );
9577 break;
9578 case PyUnicode_4BYTE_KIND:
9579 iresult = ucs4lib_count(
9580 ((Py_UCS4*)buf1) + start, end - start,
9581 buf2, len2, PY_SSIZE_T_MAX
9582 );
9583 break;
9584 default:
9585 assert(0); iresult = 0;
9586 }
9587
9588 result = PyLong_FromSsize_t(iresult);
9589
9590 if (kind1 != kind)
9591 PyMem_Free(buf1);
9592 if (kind2 != kind)
9593 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594
9595 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009596
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 return result;
9598}
9599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009600PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009601 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009603Encode S using the codec registered for encoding. Default encoding\n\
9604is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009605handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009606a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9607'xmlcharrefreplace' as well as any other name registered with\n\
9608codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
9610static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009611unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009613 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614 char *encoding = NULL;
9615 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009616
Benjamin Peterson308d6372009-09-18 21:42:35 +00009617 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9618 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009620 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009621}
9622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009623PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625\n\
9626Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009627If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
9629static PyObject*
9630unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9631{
9632 Py_UNICODE *e;
9633 Py_UNICODE *p;
9634 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009635 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 PyUnicodeObject *u;
9638 int tabsize = 8;
9639
9640 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9644 return NULL;
9645
Thomas Wouters7e474022000-07-16 12:04:32 +00009646 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009647 i = 0; /* chars up to and including most recent \n or \r */
9648 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9650 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 if (tabsize > 0) {
9653 incr = tabsize - (j % tabsize); /* cannot overflow */
9654 if (j > PY_SSIZE_T_MAX - incr)
9655 goto overflow1;
9656 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009657 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 if (j > PY_SSIZE_T_MAX - 1)
9661 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 j++;
9663 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 if (i > PY_SSIZE_T_MAX - j)
9665 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009667 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 }
9669 }
9670
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009671 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009673
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674 /* Second pass: create output string and fill it */
9675 u = _PyUnicode_New(i + j);
9676 if (!u)
9677 return NULL;
9678
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009679 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 q = _PyUnicode_WSTR(u); /* next output char */
9681 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 if (tabsize > 0) {
9686 i = tabsize - (j % tabsize);
9687 j += i;
9688 while (i--) {
9689 if (q >= qe)
9690 goto overflow2;
9691 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 else {
9696 if (q >= qe)
9697 goto overflow2;
9698 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009699 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 if (*p == '\n' || *p == '\r')
9701 j = 0;
9702 }
9703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 if (PyUnicode_READY(u) == -1) {
9705 Py_DECREF(u);
9706 return NULL;
9707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009709
9710 overflow2:
9711 Py_DECREF(u);
9712 overflow1:
9713 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715}
9716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009717PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719\n\
9720Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009721such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722arguments start and end are interpreted as in slice notation.\n\
9723\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009724Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
9726static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728{
Jesus Ceaac451502011-04-20 17:09:23 +02009729 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009730 Py_ssize_t start;
9731 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009732 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733
Jesus Ceaac451502011-04-20 17:09:23 +02009734 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9735 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 if (PyUnicode_READY(self) == -1)
9739 return NULL;
9740 if (PyUnicode_READY(substring) == -1)
9741 return NULL;
9742
9743 result = any_find_slice(
9744 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9745 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009746 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747
9748 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (result == -2)
9751 return NULL;
9752
Christian Heimes217cfd12007-12-02 14:31:20 +00009753 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754}
9755
9756static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009757unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 Py_UCS4 ch;
9760
9761 if (PyUnicode_READY(self) == -1)
9762 return NULL;
9763 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 PyErr_SetString(PyExc_IndexError, "string index out of range");
9765 return NULL;
9766 }
9767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9769 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
9771
Guido van Rossumc2504932007-09-18 19:42:40 +00009772/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009773 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009774static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009775unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Guido van Rossumc2504932007-09-18 19:42:40 +00009777 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009778 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (_PyUnicode_HASH(self) != -1)
9781 return _PyUnicode_HASH(self);
9782 if (PyUnicode_READY(self) == -1)
9783 return -1;
9784 len = PyUnicode_GET_LENGTH(self);
9785
9786 /* The hash function as a macro, gets expanded three times below. */
9787#define HASH(P) \
9788 x = (Py_uhash_t)*P << 7; \
9789 while (--len >= 0) \
9790 x = (1000003*x) ^ (Py_uhash_t)*P++;
9791
9792 switch (PyUnicode_KIND(self)) {
9793 case PyUnicode_1BYTE_KIND: {
9794 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9795 HASH(c);
9796 break;
9797 }
9798 case PyUnicode_2BYTE_KIND: {
9799 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9800 HASH(s);
9801 break;
9802 }
9803 default: {
9804 Py_UCS4 *l;
9805 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9806 "Impossible switch case in unicode_hash");
9807 l = PyUnicode_4BYTE_DATA(self);
9808 HASH(l);
9809 break;
9810 }
9811 }
9812 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9813
Guido van Rossumc2504932007-09-18 19:42:40 +00009814 if (x == -1)
9815 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009817 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009821PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009824Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825
9826static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009829 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009830 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009831 Py_ssize_t start;
9832 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833
Jesus Ceaac451502011-04-20 17:09:23 +02009834 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9835 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 if (PyUnicode_READY(self) == -1)
9839 return NULL;
9840 if (PyUnicode_READY(substring) == -1)
9841 return NULL;
9842
9843 result = any_find_slice(
9844 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9845 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009846 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847
9848 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (result == -2)
9851 return NULL;
9852
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853 if (result < 0) {
9854 PyErr_SetString(PyExc_ValueError, "substring not found");
9855 return NULL;
9856 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009857
Christian Heimes217cfd12007-12-02 14:31:20 +00009858 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859}
9860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009861PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009864Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009865at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
9867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009868unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 Py_ssize_t i, length;
9871 int kind;
9872 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 int cased;
9874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 if (PyUnicode_READY(self) == -1)
9876 return NULL;
9877 length = PyUnicode_GET_LENGTH(self);
9878 kind = PyUnicode_KIND(self);
9879 data = PyUnicode_DATA(self);
9880
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 if (length == 1)
9883 return PyBool_FromLong(
9884 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009886 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009889
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 for (i = 0; i < length; i++) {
9892 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009893
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9895 return PyBool_FromLong(0);
9896 else if (!cased && Py_UNICODE_ISLOWER(ch))
9897 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009899 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900}
9901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009902PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009905Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009906at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
9908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009909unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 Py_ssize_t i, length;
9912 int kind;
9913 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 int cased;
9915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (PyUnicode_READY(self) == -1)
9917 return NULL;
9918 length = PyUnicode_GET_LENGTH(self);
9919 kind = PyUnicode_KIND(self);
9920 data = PyUnicode_DATA(self);
9921
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (length == 1)
9924 return PyBool_FromLong(
9925 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009927 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009930
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 for (i = 0; i < length; i++) {
9933 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009934
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9936 return PyBool_FromLong(0);
9937 else if (!cased && Py_UNICODE_ISUPPER(ch))
9938 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009940 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941}
9942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009943PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009946Return True if S is a titlecased string and there is at least one\n\
9947character in S, i.e. upper- and titlecase characters may only\n\
9948follow uncased characters and lowercase characters only cased ones.\n\
9949Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
9951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009952unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 Py_ssize_t i, length;
9955 int kind;
9956 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 int cased, previous_is_cased;
9958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (PyUnicode_READY(self) == -1)
9960 return NULL;
9961 length = PyUnicode_GET_LENGTH(self);
9962 kind = PyUnicode_KIND(self);
9963 data = PyUnicode_DATA(self);
9964
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (length == 1) {
9967 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9968 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9969 (Py_UNICODE_ISUPPER(ch) != 0));
9970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009972 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009974 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009975
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976 cased = 0;
9977 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 for (i = 0; i < length; i++) {
9979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009980
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9982 if (previous_is_cased)
9983 return PyBool_FromLong(0);
9984 previous_is_cased = 1;
9985 cased = 1;
9986 }
9987 else if (Py_UNICODE_ISLOWER(ch)) {
9988 if (!previous_is_cased)
9989 return PyBool_FromLong(0);
9990 previous_is_cased = 1;
9991 cased = 1;
9992 }
9993 else
9994 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009996 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997}
9998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009999PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010002Return True if all characters in S are whitespace\n\
10003and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004
10005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010006unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 Py_ssize_t i, length;
10009 int kind;
10010 void *data;
10011
10012 if (PyUnicode_READY(self) == -1)
10013 return NULL;
10014 length = PyUnicode_GET_LENGTH(self);
10015 kind = PyUnicode_KIND(self);
10016 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 if (length == 1)
10020 return PyBool_FromLong(
10021 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010023 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 for (i = 0; i < length; i++) {
10028 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010029 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010032 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033}
10034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010035PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010036 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010037\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010038Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010039and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010040
10041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010042unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 Py_ssize_t i, length;
10045 int kind;
10046 void *data;
10047
10048 if (PyUnicode_READY(self) == -1)
10049 return NULL;
10050 length = PyUnicode_GET_LENGTH(self);
10051 kind = PyUnicode_KIND(self);
10052 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010053
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010054 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 if (length == 1)
10056 return PyBool_FromLong(
10057 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010058
10059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 for (i = 0; i < length; i++) {
10064 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010066 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010067 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068}
10069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010070PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010072\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010073Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010074and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010075
10076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010077unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 int kind;
10080 void *data;
10081 Py_ssize_t len, i;
10082
10083 if (PyUnicode_READY(self) == -1)
10084 return NULL;
10085
10086 kind = PyUnicode_KIND(self);
10087 data = PyUnicode_DATA(self);
10088 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010089
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010090 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 if (len == 1) {
10092 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10093 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10094 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010095
10096 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 for (i = 0; i < len; i++) {
10101 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010102 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010103 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010105 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010106}
10107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010108PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010111Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010112False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113
10114static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010115unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 Py_ssize_t i, length;
10118 int kind;
10119 void *data;
10120
10121 if (PyUnicode_READY(self) == -1)
10122 return NULL;
10123 length = PyUnicode_GET_LENGTH(self);
10124 kind = PyUnicode_KIND(self);
10125 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (length == 1)
10129 return PyBool_FromLong(
10130 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010132 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 for (i = 0; i < length; i++) {
10137 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010140 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141}
10142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010143PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010146Return True if all characters in S are digits\n\
10147and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
10149static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010150unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 Py_ssize_t i, length;
10153 int kind;
10154 void *data;
10155
10156 if (PyUnicode_READY(self) == -1)
10157 return NULL;
10158 length = PyUnicode_GET_LENGTH(self);
10159 kind = PyUnicode_KIND(self);
10160 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (length == 1) {
10164 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10165 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 for (i = 0; i < length; i++) {
10173 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010176 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177}
10178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010179PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010180 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010182Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010183False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
10185static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010186unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 Py_ssize_t i, length;
10189 int kind;
10190 void *data;
10191
10192 if (PyUnicode_READY(self) == -1)
10193 return NULL;
10194 length = PyUnicode_GET_LENGTH(self);
10195 kind = PyUnicode_KIND(self);
10196 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 if (length == 1)
10200 return PyBool_FromLong(
10201 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010203 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010205 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 for (i = 0; i < length; i++) {
10208 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010211 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212}
10213
Martin v. Löwis47383402007-08-15 07:32:56 +000010214int
10215PyUnicode_IsIdentifier(PyObject *self)
10216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 int kind;
10218 void *data;
10219 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010220 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (PyUnicode_READY(self) == -1) {
10223 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 }
10226
10227 /* Special case for empty strings */
10228 if (PyUnicode_GET_LENGTH(self) == 0)
10229 return 0;
10230 kind = PyUnicode_KIND(self);
10231 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010232
10233 /* PEP 3131 says that the first character must be in
10234 XID_Start and subsequent characters in XID_Continue,
10235 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010237 letters, digits, underscore). However, given the current
10238 definition of XID_Start and XID_Continue, it is sufficient
10239 to check just for these, except that _ must be allowed
10240 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010242 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010243 return 0;
10244
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010245 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010247 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010248 return 1;
10249}
10250
10251PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010253\n\
10254Return True if S is a valid identifier according\n\
10255to the language definition.");
10256
10257static PyObject*
10258unicode_isidentifier(PyObject *self)
10259{
10260 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10261}
10262
Georg Brandl559e5d72008-06-11 18:37:52 +000010263PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010265\n\
10266Return True if all characters in S are considered\n\
10267printable in repr() or S is empty, False otherwise.");
10268
10269static PyObject*
10270unicode_isprintable(PyObject *self)
10271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 Py_ssize_t i, length;
10273 int kind;
10274 void *data;
10275
10276 if (PyUnicode_READY(self) == -1)
10277 return NULL;
10278 length = PyUnicode_GET_LENGTH(self);
10279 kind = PyUnicode_KIND(self);
10280 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010281
10282 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (length == 1)
10284 return PyBool_FromLong(
10285 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 for (i = 0; i < length; i++) {
10288 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010289 Py_RETURN_FALSE;
10290 }
10291 }
10292 Py_RETURN_TRUE;
10293}
10294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010295PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010296 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297\n\
10298Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010299iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
10301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010302unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010304 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305}
10306
Martin v. Löwis18e16552006-02-15 17:27:45 +000010307static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308unicode_length(PyUnicodeObject *self)
10309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (PyUnicode_READY(self) == -1)
10311 return -1;
10312 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313}
10314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010315PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010316 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010318Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010319done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
10321static PyObject *
10322unicode_ljust(PyUnicodeObject *self, PyObject *args)
10323{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010324 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 Py_UCS4 fillchar = ' ';
10326
10327 if (PyUnicode_READY(self) == -1)
10328 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010329
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010330 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331 return NULL;
10332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334 Py_INCREF(self);
10335 return (PyObject*) self;
10336 }
10337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339}
10340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010341PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010344Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345
10346static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010347unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 return fixup(self, fixlower);
10350}
10351
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10360
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010361/* externally visible for str.strip(unicode) */
10362PyObject *
10363_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 void *data;
10366 int kind;
10367 Py_ssize_t i, j, len;
10368 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10371 return NULL;
10372
10373 kind = PyUnicode_KIND(self);
10374 data = PyUnicode_DATA(self);
10375 len = PyUnicode_GET_LENGTH(self);
10376 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10377 PyUnicode_DATA(sepobj),
10378 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010379
Benjamin Peterson14339b62009-01-31 16:36:08 +000010380 i = 0;
10381 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 while (i < len &&
10383 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 i++;
10385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010386 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010387
Benjamin Peterson14339b62009-01-31 16:36:08 +000010388 j = len;
10389 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010390 do {
10391 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 } while (j >= i &&
10393 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010395 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010396
Benjamin Peterson14339b62009-01-31 16:36:08 +000010397 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 Py_INCREF(self);
10399 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010400 }
10401 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010403}
10404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405/* Assumes an already ready self string. */
10406
10407static PyObject *
10408substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10409{
10410 const int kind = PyUnicode_KIND(self);
10411 void *data = PyUnicode_DATA(self);
10412 Py_UCS4 maxchar = 0;
10413 Py_ssize_t i;
10414 PyObject *unicode;
10415
10416 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10417 PyErr_BadInternalCall();
10418 return NULL;
10419 }
10420
10421 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10422 Py_INCREF(self);
10423 return (PyObject*)self;
10424 }
10425
10426 for (i = 0; i < len; ++i) {
10427 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10428 if (ch > maxchar)
10429 maxchar = ch;
10430 }
10431
10432 unicode = PyUnicode_New(len, maxchar);
10433 if (unicode == NULL)
10434 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010435 if (PyUnicode_CopyCharacters(unicode, 0,
10436 (PyObject*)self, start, len) < 0)
10437 {
10438 Py_DECREF(unicode);
10439 return NULL;
10440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 return unicode;
10442}
10443
10444PyObject*
10445PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10446{
10447 unsigned char *data;
10448 int kind;
10449
10450 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10451 && PyUnicode_CheckExact(self))
10452 {
10453 Py_INCREF(self);
10454 return (PyObject *)self;
10455 }
10456
10457 if ((end - start) == 1)
10458 return unicode_getitem((PyUnicodeObject*)self, start);
10459
10460 if (PyUnicode_READY(self) == -1)
10461 return NULL;
10462 kind = PyUnicode_KIND(self);
10463 data = PyUnicode_1BYTE_DATA(self);
10464 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10465 end-start);
10466}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467
10468static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010469do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 int kind;
10472 void *data;
10473 Py_ssize_t len, i, j;
10474
10475 if (PyUnicode_READY(self) == -1)
10476 return NULL;
10477
10478 kind = PyUnicode_KIND(self);
10479 data = PyUnicode_DATA(self);
10480 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010481
Benjamin Peterson14339b62009-01-31 16:36:08 +000010482 i = 0;
10483 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010485 i++;
10486 }
10487 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010488
Benjamin Peterson14339b62009-01-31 16:36:08 +000010489 j = len;
10490 if (striptype != LEFTSTRIP) {
10491 do {
10492 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010494 j++;
10495 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010496
Benjamin Peterson14339b62009-01-31 16:36:08 +000010497 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10498 Py_INCREF(self);
10499 return (PyObject*)self;
10500 }
10501 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503}
10504
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010505
10506static PyObject *
10507do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010509 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010510
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10512 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010513
Benjamin Peterson14339b62009-01-31 16:36:08 +000010514 if (sep != NULL && sep != Py_None) {
10515 if (PyUnicode_Check(sep))
10516 return _PyUnicode_XStrip(self, striptype, sep);
10517 else {
10518 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "%s arg must be None or str",
10520 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010521 return NULL;
10522 }
10523 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526}
10527
10528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010529PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010531\n\
10532Return a copy of the string S with leading and trailing\n\
10533whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010534If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010535
10536static PyObject *
10537unicode_strip(PyUnicodeObject *self, PyObject *args)
10538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539 if (PyTuple_GET_SIZE(args) == 0)
10540 return do_strip(self, BOTHSTRIP); /* Common case */
10541 else
10542 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010543}
10544
10545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010546PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010547 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010548\n\
10549Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010550If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010551
10552static PyObject *
10553unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10554{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555 if (PyTuple_GET_SIZE(args) == 0)
10556 return do_strip(self, LEFTSTRIP); /* Common case */
10557 else
10558 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010559}
10560
10561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010562PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010563 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564\n\
10565Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010566If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010567
10568static PyObject *
10569unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 if (PyTuple_GET_SIZE(args) == 0)
10572 return do_strip(self, RIGHTSTRIP); /* Common case */
10573 else
10574 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010575}
10576
10577
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010579unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580{
10581 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_ssize_t nchars, n;
10583 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
Georg Brandl222de0f2009-04-12 12:01:50 +000010585 if (len < 1) {
10586 Py_INCREF(unicode_empty);
10587 return (PyObject *)unicode_empty;
10588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
Tim Peters7a29bd52001-09-12 03:03:31 +000010590 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591 /* no repeat, return original string */
10592 Py_INCREF(str);
10593 return (PyObject*) str;
10594 }
Tim Peters8f422462000-09-09 06:13:41 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (PyUnicode_READY(str) == -1)
10597 return NULL;
10598
Tim Peters8f422462000-09-09 06:13:41 +000010599 /* ensure # of chars needed doesn't overflow int and # of bytes
10600 * needed doesn't overflow size_t
10601 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 nchars = len * PyUnicode_GET_LENGTH(str);
10603 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010604 PyErr_SetString(PyExc_OverflowError,
10605 "repeated string is too long");
10606 return NULL;
10607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 char_size = PyUnicode_CHARACTER_SIZE(str);
10609 nbytes = (nchars + 1) * char_size;
10610 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010611 PyErr_SetString(PyExc_OverflowError,
10612 "repeated string is too long");
10613 return NULL;
10614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 if (!u)
10617 return NULL;
10618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 if (PyUnicode_GET_LENGTH(str) == 1) {
10620 const int kind = PyUnicode_KIND(str);
10621 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10622 void *to = PyUnicode_DATA(u);
10623 for (n = 0; n < len; ++n)
10624 PyUnicode_WRITE(kind, to, n, fill_char);
10625 }
10626 else {
10627 /* number of characters copied this far */
10628 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10629 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10630 char *to = (char *) PyUnicode_DATA(u);
10631 Py_MEMCPY(to, PyUnicode_DATA(str),
10632 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 n = (done <= nchars-done) ? done : nchars-done;
10635 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 }
10639
10640 return (PyObject*) u;
10641}
10642
Alexander Belopolsky40018472011-02-26 01:02:56 +000010643PyObject *
10644PyUnicode_Replace(PyObject *obj,
10645 PyObject *subobj,
10646 PyObject *replobj,
10647 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
10649 PyObject *self;
10650 PyObject *str1;
10651 PyObject *str2;
10652 PyObject *result;
10653
10654 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 Py_DECREF(self);
10660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010661 }
10662 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 Py_DECREF(self);
10665 Py_DECREF(str1);
10666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 Py_DECREF(self);
10670 Py_DECREF(str1);
10671 Py_DECREF(str2);
10672 return result;
10673}
10674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010675PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010676 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677\n\
10678Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010679old replaced by new. If the optional argument count is\n\
10680given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681
10682static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 PyObject *str1;
10686 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010687 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 PyObject *result;
10689
Martin v. Löwis18e16552006-02-15 17:27:45 +000010690 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 str1 = PyUnicode_FromObject(str1);
10695 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10696 return NULL;
10697 str2 = PyUnicode_FromObject(str2);
10698 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 Py_DECREF(str1);
10700 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
10703 result = replace(self, str1, str2, maxcount);
10704
10705 Py_DECREF(str1);
10706 Py_DECREF(str2);
10707 return result;
10708}
10709
Alexander Belopolsky40018472011-02-26 01:02:56 +000010710static PyObject *
10711unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010713 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 Py_ssize_t isize;
10715 Py_ssize_t osize, squote, dquote, i, o;
10716 Py_UCS4 max, quote;
10717 int ikind, okind;
10718 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010721 return NULL;
10722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 isize = PyUnicode_GET_LENGTH(unicode);
10724 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 /* Compute length of output, quote characters, and
10727 maximum character */
10728 osize = 2; /* quotes */
10729 max = 127;
10730 squote = dquote = 0;
10731 ikind = PyUnicode_KIND(unicode);
10732 for (i = 0; i < isize; i++) {
10733 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10734 switch (ch) {
10735 case '\'': squote++; osize++; break;
10736 case '"': dquote++; osize++; break;
10737 case '\\': case '\t': case '\r': case '\n':
10738 osize += 2; break;
10739 default:
10740 /* Fast-path ASCII */
10741 if (ch < ' ' || ch == 0x7f)
10742 osize += 4; /* \xHH */
10743 else if (ch < 0x7f)
10744 osize++;
10745 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10746 osize++;
10747 max = ch > max ? ch : max;
10748 }
10749 else if (ch < 0x100)
10750 osize += 4; /* \xHH */
10751 else if (ch < 0x10000)
10752 osize += 6; /* \uHHHH */
10753 else
10754 osize += 10; /* \uHHHHHHHH */
10755 }
10756 }
10757
10758 quote = '\'';
10759 if (squote) {
10760 if (dquote)
10761 /* Both squote and dquote present. Use squote,
10762 and escape them */
10763 osize += squote;
10764 else
10765 quote = '"';
10766 }
10767
10768 repr = PyUnicode_New(osize, max);
10769 if (repr == NULL)
10770 return NULL;
10771 okind = PyUnicode_KIND(repr);
10772 odata = PyUnicode_DATA(repr);
10773
10774 PyUnicode_WRITE(okind, odata, 0, quote);
10775 PyUnicode_WRITE(okind, odata, osize-1, quote);
10776
10777 for (i = 0, o = 1; i < isize; i++) {
10778 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010779
10780 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if ((ch == quote) || (ch == '\\')) {
10782 PyUnicode_WRITE(okind, odata, o++, '\\');
10783 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010784 continue;
10785 }
10786
Benjamin Peterson29060642009-01-31 22:14:21 +000010787 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010788 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 PyUnicode_WRITE(okind, odata, o++, '\\');
10790 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010791 }
10792 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 PyUnicode_WRITE(okind, odata, o++, '\\');
10794 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010795 }
10796 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 PyUnicode_WRITE(okind, odata, o++, '\\');
10798 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010799 }
10800
10801 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010802 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 PyUnicode_WRITE(okind, odata, o++, '\\');
10804 PyUnicode_WRITE(okind, odata, o++, 'x');
10805 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10806 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010807 }
10808
Georg Brandl559e5d72008-06-11 18:37:52 +000010809 /* Copy ASCII characters as-is */
10810 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010812 }
10813
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010817 (categories Z* and C* except ASCII space)
10818 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010820 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 if (ch <= 0xff) {
10822 PyUnicode_WRITE(okind, odata, o++, '\\');
10823 PyUnicode_WRITE(okind, odata, o++, 'x');
10824 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10825 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 }
10827 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 else if (ch >= 0x10000) {
10829 PyUnicode_WRITE(okind, odata, o++, '\\');
10830 PyUnicode_WRITE(okind, odata, o++, 'U');
10831 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10832 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10837 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010839 }
10840 /* Map 16-bit characters to '\uxxxx' */
10841 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 PyUnicode_WRITE(okind, odata, o++, '\\');
10843 PyUnicode_WRITE(okind, odata, o++, 'u');
10844 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010848 }
10849 }
10850 /* Copy characters as-is */
10851 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010853 }
10854 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010857 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858}
10859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862\n\
10863Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010864such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865arguments start and end are interpreted as in slice notation.\n\
10866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010867Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
10869static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871{
Jesus Ceaac451502011-04-20 17:09:23 +020010872 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010873 Py_ssize_t start;
10874 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Jesus Ceaac451502011-04-20 17:09:23 +020010877 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10878 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 if (PyUnicode_READY(self) == -1)
10882 return NULL;
10883 if (PyUnicode_READY(substring) == -1)
10884 return NULL;
10885
10886 result = any_find_slice(
10887 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10888 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
10891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (result == -2)
10894 return NULL;
10895
Christian Heimes217cfd12007-12-02 14:31:20 +000010896 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897}
10898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010902Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906{
Jesus Ceaac451502011-04-20 17:09:23 +020010907 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010908 Py_ssize_t start;
10909 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
Jesus Ceaac451502011-04-20 17:09:23 +020010912 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10913 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
10918 if (PyUnicode_READY(substring) == -1)
10919 return NULL;
10920
10921 result = any_find_slice(
10922 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10923 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (result == -2)
10929 return NULL;
10930
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 if (result < 0) {
10932 PyErr_SetString(PyExc_ValueError, "substring not found");
10933 return NULL;
10934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935
Christian Heimes217cfd12007-12-02 14:31:20 +000010936 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937}
10938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010939PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010942Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010943done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject *
10946unicode_rjust(PyUnicodeObject *self, PyObject *args)
10947{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010948 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_UCS4 fillchar = ' ';
10950
10951 if (PyUnicode_READY(self) == -1)
10952 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010953
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010954 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 return NULL;
10956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 Py_INCREF(self);
10959 return (PyObject*) self;
10960 }
10961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963}
10964
Alexander Belopolsky40018472011-02-26 01:02:56 +000010965PyObject *
10966PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
10968 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010969
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 s = PyUnicode_FromObject(s);
10971 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010972 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 if (sep != NULL) {
10974 sep = PyUnicode_FromObject(sep);
10975 if (sep == NULL) {
10976 Py_DECREF(s);
10977 return NULL;
10978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979 }
10980
10981 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10982
10983 Py_DECREF(s);
10984 Py_XDECREF(sep);
10985 return result;
10986}
10987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010988PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990\n\
10991Return a list of the words in S, using sep as the\n\
10992delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010993splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010994whitespace string is a separator and empty strings are\n\
10995removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
10997static PyObject*
10998unicode_split(PyUnicodeObject *self, PyObject *args)
10999{
11000 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011001 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Martin v. Löwis18e16552006-02-15 17:27:45 +000011003 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 return NULL;
11005
11006 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012}
11013
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014PyObject *
11015PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11016{
11017 PyObject* str_obj;
11018 PyObject* sep_obj;
11019 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 int kind1, kind2, kind;
11021 void *buf1 = NULL, *buf2 = NULL;
11022 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
11024 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 Py_DECREF(str_obj);
11030 return NULL;
11031 }
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 kind1 = PyUnicode_KIND(str_in);
11034 kind2 = PyUnicode_KIND(sep_obj);
11035 kind = kind1 > kind2 ? kind1 : kind2;
11036 buf1 = PyUnicode_DATA(str_in);
11037 if (kind1 != kind)
11038 buf1 = _PyUnicode_AsKind(str_in, kind);
11039 if (!buf1)
11040 goto onError;
11041 buf2 = PyUnicode_DATA(sep_obj);
11042 if (kind2 != kind)
11043 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11044 if (!buf2)
11045 goto onError;
11046 len1 = PyUnicode_GET_LENGTH(str_obj);
11047 len2 = PyUnicode_GET_LENGTH(sep_obj);
11048
11049 switch(PyUnicode_KIND(str_in)) {
11050 case PyUnicode_1BYTE_KIND:
11051 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11052 break;
11053 case PyUnicode_2BYTE_KIND:
11054 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11055 break;
11056 case PyUnicode_4BYTE_KIND:
11057 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11058 break;
11059 default:
11060 assert(0);
11061 out = 0;
11062 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011063
11064 Py_DECREF(sep_obj);
11065 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (kind1 != kind)
11067 PyMem_Free(buf1);
11068 if (kind2 != kind)
11069 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070
11071 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 onError:
11073 Py_DECREF(sep_obj);
11074 Py_DECREF(str_obj);
11075 if (kind1 != kind && buf1)
11076 PyMem_Free(buf1);
11077 if (kind2 != kind && buf2)
11078 PyMem_Free(buf2);
11079 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080}
11081
11082
11083PyObject *
11084PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11085{
11086 PyObject* str_obj;
11087 PyObject* sep_obj;
11088 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 int kind1, kind2, kind;
11090 void *buf1 = NULL, *buf2 = NULL;
11091 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
11093 str_obj = PyUnicode_FromObject(str_in);
11094 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011096 sep_obj = PyUnicode_FromObject(sep_in);
11097 if (!sep_obj) {
11098 Py_DECREF(str_obj);
11099 return NULL;
11100 }
11101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 kind1 = PyUnicode_KIND(str_in);
11103 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011104 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 buf1 = PyUnicode_DATA(str_in);
11106 if (kind1 != kind)
11107 buf1 = _PyUnicode_AsKind(str_in, kind);
11108 if (!buf1)
11109 goto onError;
11110 buf2 = PyUnicode_DATA(sep_obj);
11111 if (kind2 != kind)
11112 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11113 if (!buf2)
11114 goto onError;
11115 len1 = PyUnicode_GET_LENGTH(str_obj);
11116 len2 = PyUnicode_GET_LENGTH(sep_obj);
11117
11118 switch(PyUnicode_KIND(str_in)) {
11119 case PyUnicode_1BYTE_KIND:
11120 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11121 break;
11122 case PyUnicode_2BYTE_KIND:
11123 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11124 break;
11125 case PyUnicode_4BYTE_KIND:
11126 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11127 break;
11128 default:
11129 assert(0);
11130 out = 0;
11131 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132
11133 Py_DECREF(sep_obj);
11134 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if (kind1 != kind)
11136 PyMem_Free(buf1);
11137 if (kind2 != kind)
11138 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139
11140 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 onError:
11142 Py_DECREF(sep_obj);
11143 Py_DECREF(str_obj);
11144 if (kind1 != kind && buf1)
11145 PyMem_Free(buf1);
11146 if (kind2 != kind && buf2)
11147 PyMem_Free(buf2);
11148 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149}
11150
11151PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011154Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011156found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157
11158static PyObject*
11159unicode_partition(PyUnicodeObject *self, PyObject *separator)
11160{
11161 return PyUnicode_Partition((PyObject *)self, separator);
11162}
11163
11164PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011165 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011167Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011169separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170
11171static PyObject*
11172unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11173{
11174 return PyUnicode_RPartition((PyObject *)self, separator);
11175}
11176
Alexander Belopolsky40018472011-02-26 01:02:56 +000011177PyObject *
11178PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011179{
11180 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011181
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011182 s = PyUnicode_FromObject(s);
11183 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011184 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 if (sep != NULL) {
11186 sep = PyUnicode_FromObject(sep);
11187 if (sep == NULL) {
11188 Py_DECREF(s);
11189 return NULL;
11190 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011191 }
11192
11193 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11194
11195 Py_DECREF(s);
11196 Py_XDECREF(sep);
11197 return result;
11198}
11199
11200PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011202\n\
11203Return a list of the words in S, using sep as the\n\
11204delimiter string, starting at the end of the string and\n\
11205working to the front. If maxsplit is given, at most maxsplit\n\
11206splits are done. If sep is not specified, any whitespace string\n\
11207is a separator.");
11208
11209static PyObject*
11210unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11211{
11212 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011213 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011214
Martin v. Löwis18e16552006-02-15 17:27:45 +000011215 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216 return NULL;
11217
11218 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011220 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011222 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
11229Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011230Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011234unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011236 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011237 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011239 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11240 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 return NULL;
11242
Guido van Rossum86662912000-04-11 15:38:46 +000011243 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244}
11245
11246static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011247PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Walter Dörwald346737f2007-05-31 10:44:43 +000011249 if (PyUnicode_CheckExact(self)) {
11250 Py_INCREF(self);
11251 return self;
11252 } else
11253 /* Subtype -- return genuine unicode string with the same value. */
11254 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11255 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256}
11257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260\n\
11261Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011262and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
11264static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011265unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267 return fixup(self, fixswapcase);
11268}
11269
Georg Brandlceee0772007-11-27 23:48:05 +000011270PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011272\n\
11273Return a translation table usable for str.translate().\n\
11274If there is only one argument, it must be a dictionary mapping Unicode\n\
11275ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011276Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011277If there are two arguments, they must be strings of equal length, and\n\
11278in the resulting dictionary, each character in x will be mapped to the\n\
11279character at the same position in y. If there is a third argument, it\n\
11280must be a string, whose characters will be mapped to None in the result.");
11281
11282static PyObject*
11283unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11284{
11285 PyObject *x, *y = NULL, *z = NULL;
11286 PyObject *new = NULL, *key, *value;
11287 Py_ssize_t i = 0;
11288 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011289
Georg Brandlceee0772007-11-27 23:48:05 +000011290 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11291 return NULL;
11292 new = PyDict_New();
11293 if (!new)
11294 return NULL;
11295 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 int x_kind, y_kind, z_kind;
11297 void *x_data, *y_data, *z_data;
11298
Georg Brandlceee0772007-11-27 23:48:05 +000011299 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011300 if (!PyUnicode_Check(x)) {
11301 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11302 "be a string if there is a second argument");
11303 goto err;
11304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011306 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11307 "arguments must have equal length");
11308 goto err;
11309 }
11310 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 x_kind = PyUnicode_KIND(x);
11312 y_kind = PyUnicode_KIND(y);
11313 x_data = PyUnicode_DATA(x);
11314 y_data = PyUnicode_DATA(y);
11315 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11316 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11317 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011318 if (!key || !value)
11319 goto err;
11320 res = PyDict_SetItem(new, key, value);
11321 Py_DECREF(key);
11322 Py_DECREF(value);
11323 if (res < 0)
11324 goto err;
11325 }
11326 /* create entries for deleting chars in z */
11327 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 z_kind = PyUnicode_KIND(z);
11329 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011330 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011332 if (!key)
11333 goto err;
11334 res = PyDict_SetItem(new, key, Py_None);
11335 Py_DECREF(key);
11336 if (res < 0)
11337 goto err;
11338 }
11339 }
11340 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 int kind;
11342 void *data;
11343
Georg Brandlceee0772007-11-27 23:48:05 +000011344 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011345 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011346 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11347 "to maketrans it must be a dict");
11348 goto err;
11349 }
11350 /* copy entries into the new dict, converting string keys to int keys */
11351 while (PyDict_Next(x, &i, &key, &value)) {
11352 if (PyUnicode_Check(key)) {
11353 /* convert string keys to integer keys */
11354 PyObject *newkey;
11355 if (PyUnicode_GET_SIZE(key) != 1) {
11356 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11357 "table must be of length 1");
11358 goto err;
11359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 kind = PyUnicode_KIND(key);
11361 data = PyUnicode_DATA(key);
11362 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011363 if (!newkey)
11364 goto err;
11365 res = PyDict_SetItem(new, newkey, value);
11366 Py_DECREF(newkey);
11367 if (res < 0)
11368 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011369 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011370 /* just keep integer keys */
11371 if (PyDict_SetItem(new, key, value) < 0)
11372 goto err;
11373 } else {
11374 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11375 "be strings or integers");
11376 goto err;
11377 }
11378 }
11379 }
11380 return new;
11381 err:
11382 Py_DECREF(new);
11383 return NULL;
11384}
11385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011386PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011387 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388\n\
11389Return a copy of the string S, where all characters have been mapped\n\
11390through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011391Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011392Unmapped characters are left untouched. Characters mapped to None\n\
11393are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
11395static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011407unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 return fixup(self, fixupper);
11410}
11411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011415Pad a numeric string S with zeros on the left, to fill a field\n\
11416of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
11418static PyObject *
11419unicode_zfill(PyUnicodeObject *self, PyObject *args)
11420{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011421 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011423 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 int kind;
11425 void *data;
11426 Py_UCS4 chr;
11427
11428 if (PyUnicode_READY(self) == -1)
11429 return NULL;
11430
Martin v. Löwis18e16552006-02-15 17:27:45 +000011431 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 return NULL;
11433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011435 if (PyUnicode_CheckExact(self)) {
11436 Py_INCREF(self);
11437 return (PyObject*) self;
11438 }
11439 else
11440 return PyUnicode_FromUnicode(
11441 PyUnicode_AS_UNICODE(self),
11442 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 }
11445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448 u = pad(self, fill, 0, '0');
11449
Walter Dörwald068325e2002-04-15 13:36:47 +000011450 if (u == NULL)
11451 return NULL;
11452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 kind = PyUnicode_KIND(u);
11454 data = PyUnicode_DATA(u);
11455 chr = PyUnicode_READ(kind, data, fill);
11456
11457 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 PyUnicode_WRITE(kind, data, 0, chr);
11460 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 }
11462
11463 return (PyObject*) u;
11464}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
#if 0
/* Compiled out.  Wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011474PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011477Return True if S starts with the specified prefix, False otherwise.\n\
11478With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011479With optional end, stop comparing S at that position.\n\
11480prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482static PyObject *
11483unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011486 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011488 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011489 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011490 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Jesus Ceaac451502011-04-20 17:09:23 +020011492 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011494 if (PyTuple_Check(subobj)) {
11495 Py_ssize_t i;
11496 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11497 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011499 if (substring == NULL)
11500 return NULL;
11501 result = tailmatch(self, substring, start, end, -1);
11502 Py_DECREF(substring);
11503 if (result) {
11504 Py_RETURN_TRUE;
11505 }
11506 }
11507 /* nothing matched */
11508 Py_RETURN_FALSE;
11509 }
11510 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011511 if (substring == NULL) {
11512 if (PyErr_ExceptionMatches(PyExc_TypeError))
11513 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11514 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011516 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011517 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011519 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520}
11521
11522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011526Return True if S ends with the specified suffix, False otherwise.\n\
11527With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011528With optional end, stop comparing S at that position.\n\
11529suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
11531static PyObject *
11532unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011535 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011537 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011538 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011539 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
Jesus Ceaac451502011-04-20 17:09:23 +020011541 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011543 if (PyTuple_Check(subobj)) {
11544 Py_ssize_t i;
11545 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11546 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011548 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011550 result = tailmatch(self, substring, start, end, +1);
11551 Py_DECREF(substring);
11552 if (result) {
11553 Py_RETURN_TRUE;
11554 }
11555 }
11556 Py_RETURN_FALSE;
11557 }
11558 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011559 if (substring == NULL) {
11560 if (PyErr_ExceptionMatches(PyExc_TypeError))
11561 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11562 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011564 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011565 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011567 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568}
11569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011571
11572PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011574\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011575Return a formatted version of S, using substitutions from args and kwargs.\n\
11576The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011577
Eric Smith27bbca62010-11-04 17:06:58 +000011578PyDoc_STRVAR(format_map__doc__,
11579 "S.format_map(mapping) -> str\n\
11580\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011581Return a formatted version of S, using substitutions from mapping.\n\
11582The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011583
Eric Smith4a7d76d2008-05-30 18:10:19 +000011584static PyObject *
11585unicode__format__(PyObject* self, PyObject* args)
11586{
11587 PyObject *format_spec;
11588
11589 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11590 return NULL;
11591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11593 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011594}
11595
Eric Smith8c663262007-08-25 02:26:07 +000011596PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011598\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011599Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011600
11601static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011602unicode__sizeof__(PyUnicodeObject *v)
11603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 Py_ssize_t size;
11605
11606 /* If it's a compact object, account for base structure +
11607 character data. */
11608 if (PyUnicode_IS_COMPACT_ASCII(v))
11609 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11610 else if (PyUnicode_IS_COMPACT(v))
11611 size = sizeof(PyCompactUnicodeObject) +
11612 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11613 else {
11614 /* If it is a two-block object, account for base object, and
11615 for character block if present. */
11616 size = sizeof(PyUnicodeObject);
11617 if (v->data.any)
11618 size += (PyUnicode_GET_LENGTH(v) + 1) *
11619 PyUnicode_CHARACTER_SIZE(v);
11620 }
11621 /* If the wstr pointer is present, account for it unless it is shared
11622 with the data pointer. Since PyUnicode_DATA will crash if the object
11623 is not ready, check whether it's either not ready (in which case the
11624 data is entirely in wstr) or if the data is not shared. */
11625 if (_PyUnicode_WSTR(v) &&
11626 (!PyUnicode_IS_READY(v) ||
11627 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11628 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11629 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11630 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11631
11632 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011633}
11634
11635PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011637
11638static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011639unicode_getnewargs(PyUnicodeObject *v)
11640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 PyObject *copy;
11642 unsigned char *data;
11643 int kind;
11644 if (PyUnicode_READY(v) == -1)
11645 return NULL;
11646 kind = PyUnicode_KIND(v);
11647 data = PyUnicode_1BYTE_DATA(v);
11648 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11649 if (!copy)
11650 return NULL;
11651 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011652}
11653
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654static PyMethodDef unicode_methods[] = {
11655
11656 /* Order is according to common usage: often used methods should
11657 appear first, since lookup is done sequentially. */
11658
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011659 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011660 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11661 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011662 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011663 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11664 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11665 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11666 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11667 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11668 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11669 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011670 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011671 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11672 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11673 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011674 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011675 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11676 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11677 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011679 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011680 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011681 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011682 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11683 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11684 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11685 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11686 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11687 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11688 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11689 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11690 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11691 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11692 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11693 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11694 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11695 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011696 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011697 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011698 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011699 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011700 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011701 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011702 {"maketrans", (PyCFunction) unicode_maketrans,
11703 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011704 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011705#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011706 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707#endif
11708
11709#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011710 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011711 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712#endif
11713
Benjamin Peterson14339b62009-01-31 16:36:08 +000011714 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 {NULL, NULL}
11716};
11717
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011718static PyObject *
11719unicode_mod(PyObject *v, PyObject *w)
11720{
Brian Curtindfc80e32011-08-10 20:28:54 -050011721 if (!PyUnicode_Check(v))
11722 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011724}
11725
11726static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011727 0, /*nb_add*/
11728 0, /*nb_subtract*/
11729 0, /*nb_multiply*/
11730 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011731};
11732
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011734 (lenfunc) unicode_length, /* sq_length */
11735 PyUnicode_Concat, /* sq_concat */
11736 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11737 (ssizeargfunc) unicode_getitem, /* sq_item */
11738 0, /* sq_slice */
11739 0, /* sq_ass_item */
11740 0, /* sq_ass_slice */
11741 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742};
11743
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011744static PyObject*
11745unicode_subscript(PyUnicodeObject* self, PyObject* item)
11746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (PyUnicode_READY(self) == -1)
11748 return NULL;
11749
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011750 if (PyIndex_Check(item)) {
11751 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011752 if (i == -1 && PyErr_Occurred())
11753 return NULL;
11754 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011756 return unicode_getitem(self, i);
11757 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011758 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011760 Py_UNICODE* result_buf;
11761 PyObject* result;
11762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011765 return NULL;
11766 }
11767
11768 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 return PyUnicode_New(0, 0);
11770 } else if (start == 0 && step == 1 &&
11771 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011772 PyUnicode_CheckExact(self)) {
11773 Py_INCREF(self);
11774 return (PyObject *)self;
11775 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011777 } else {
11778 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011779 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11780 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 if (result_buf == NULL)
11783 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011784
11785 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11786 result_buf[i] = source_buf[cur];
11787 }
Tim Petersced69f82003-09-16 20:30:58 +000011788
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011789 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011790 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011791 return result;
11792 }
11793 } else {
11794 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11795 return NULL;
11796 }
11797}
11798
11799static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 (lenfunc)unicode_length, /* mp_length */
11801 (binaryfunc)unicode_subscript, /* mp_subscript */
11802 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011803};
11804
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806/* Helpers for PyUnicode_Format() */
11807
11808static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011809getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011811 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 (*p_argidx)++;
11814 if (arglen < 0)
11815 return args;
11816 else
11817 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 }
11819 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 return NULL;
11822}
11823
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011824/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011826static PyObject *
11827formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011829 char *p;
11830 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011832
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833 x = PyFloat_AsDouble(v);
11834 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011835 return NULL;
11836
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011839
Eric Smith0923d1d2009-04-16 20:16:10 +000011840 p = PyOS_double_to_string(x, type, prec,
11841 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842 if (p == NULL)
11843 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011845 PyMem_Free(p);
11846 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
Tim Peters38fd5b62000-09-21 05:43:11 +000011849static PyObject*
11850formatlong(PyObject *val, int flags, int prec, int type)
11851{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011852 char *buf;
11853 int len;
11854 PyObject *str; /* temporary string object. */
11855 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011856
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11858 if (!str)
11859 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011861 Py_DECREF(str);
11862 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011863}
11864
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011867 size_t buflen,
11868 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011870 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011871 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (PyUnicode_GET_LENGTH(v) == 1) {
11873 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 buf[1] = '\0';
11875 return 1;
11876 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 goto onError;
11878 }
11879 else {
11880 /* Integer input truncated to a character */
11881 long x;
11882 x = PyLong_AsLong(v);
11883 if (x == -1 && PyErr_Occurred())
11884 goto onError;
11885
11886 if (x < 0 || x > 0x10ffff) {
11887 PyErr_SetString(PyExc_OverflowError,
11888 "%c arg not in range(0x110000)");
11889 return -1;
11890 }
11891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011893 buf[1] = '\0';
11894 return 1;
11895 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011896
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011898 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011900 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901}
11902
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011903/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011904 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011905*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011906#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011907
Alexander Belopolsky40018472011-02-26 01:02:56 +000011908PyObject *
11909PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 void *fmt;
11912 int fmtkind;
11913 PyObject *result;
11914 Py_UCS4 *res, *res0;
11915 Py_UCS4 max;
11916 int kind;
11917 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011921
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 PyErr_BadInternalCall();
11924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11927 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 fmt = PyUnicode_DATA(uformat);
11930 fmtkind = PyUnicode_KIND(uformat);
11931 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11932 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11936 if (res0 == NULL) {
11937 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
11941 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 arglen = PyTuple_Size(args);
11943 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 }
11945 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 arglen = -1;
11947 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011949 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011950 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
11953 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 if (--rescnt < 0) {
11956 rescnt = fmtcnt + 100;
11957 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11959 if (res0 == NULL){
11960 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 }
11963 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011967 }
11968 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 /* Got a format specifier */
11970 int flags = 0;
11971 Py_ssize_t width = -1;
11972 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 Py_UCS4 c = '\0';
11974 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 int isnumok;
11976 PyObject *v = NULL;
11977 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 void *pbuf;
11979 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 Py_ssize_t len, len1;
11982 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 fmtpos++;
11985 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11986 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 Py_ssize_t keylen;
11988 PyObject *key;
11989 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011990
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 if (dict == NULL) {
11992 PyErr_SetString(PyExc_TypeError,
11993 "format requires a mapping");
11994 goto onError;
11995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 /* Skip over balanced parentheses */
12000 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (fmtcnt < 0 || pcount > 0) {
12009 PyErr_SetString(PyExc_ValueError,
12010 "incomplete format key");
12011 goto onError;
12012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 if (key == NULL)
12015 goto onError;
12016 if (args_owned) {
12017 Py_DECREF(args);
12018 args_owned = 0;
12019 }
12020 args = PyObject_GetItem(dict, key);
12021 Py_DECREF(key);
12022 if (args == NULL) {
12023 goto onError;
12024 }
12025 args_owned = 1;
12026 arglen = -1;
12027 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012028 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012031 case '-': flags |= F_LJUST; continue;
12032 case '+': flags |= F_SIGN; continue;
12033 case ' ': flags |= F_BLANK; continue;
12034 case '#': flags |= F_ALT; continue;
12035 case '0': flags |= F_ZERO; continue;
12036 }
12037 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012038 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 if (c == '*') {
12040 v = getnextarg(args, arglen, &argidx);
12041 if (v == NULL)
12042 goto onError;
12043 if (!PyLong_Check(v)) {
12044 PyErr_SetString(PyExc_TypeError,
12045 "* wants int");
12046 goto onError;
12047 }
12048 width = PyLong_AsLong(v);
12049 if (width == -1 && PyErr_Occurred())
12050 goto onError;
12051 if (width < 0) {
12052 flags |= F_LJUST;
12053 width = -width;
12054 }
12055 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 }
12058 else if (c >= '0' && c <= '9') {
12059 width = c - '0';
12060 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 if (c < '0' || c > '9')
12063 break;
12064 if ((width*10) / 10 != width) {
12065 PyErr_SetString(PyExc_ValueError,
12066 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 }
12069 width = width*10 + (c - '0');
12070 }
12071 }
12072 if (c == '.') {
12073 prec = 0;
12074 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 if (c == '*') {
12077 v = getnextarg(args, arglen, &argidx);
12078 if (v == NULL)
12079 goto onError;
12080 if (!PyLong_Check(v)) {
12081 PyErr_SetString(PyExc_TypeError,
12082 "* wants int");
12083 goto onError;
12084 }
12085 prec = PyLong_AsLong(v);
12086 if (prec == -1 && PyErr_Occurred())
12087 goto onError;
12088 if (prec < 0)
12089 prec = 0;
12090 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 }
12093 else if (c >= '0' && c <= '9') {
12094 prec = c - '0';
12095 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 if (c < '0' || c > '9')
12098 break;
12099 if ((prec*10) / 10 != prec) {
12100 PyErr_SetString(PyExc_ValueError,
12101 "prec too big");
12102 goto onError;
12103 }
12104 prec = prec*10 + (c - '0');
12105 }
12106 }
12107 } /* prec */
12108 if (fmtcnt >= 0) {
12109 if (c == 'h' || c == 'l' || c == 'L') {
12110 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 }
12113 }
12114 if (fmtcnt < 0) {
12115 PyErr_SetString(PyExc_ValueError,
12116 "incomplete format");
12117 goto onError;
12118 }
12119 if (c != '%') {
12120 v = getnextarg(args, arglen, &argidx);
12121 if (v == NULL)
12122 goto onError;
12123 }
12124 sign = 0;
12125 fill = ' ';
12126 switch (c) {
12127
12128 case '%':
12129 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 len = 1;
12134 break;
12135
12136 case 's':
12137 case 'r':
12138 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012139 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 temp = v;
12141 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012142 }
12143 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 if (c == 's')
12145 temp = PyObject_Str(v);
12146 else if (c == 'r')
12147 temp = PyObject_Repr(v);
12148 else
12149 temp = PyObject_ASCII(v);
12150 if (temp == NULL)
12151 goto onError;
12152 if (PyUnicode_Check(temp))
12153 /* nothing to do */;
12154 else {
12155 Py_DECREF(temp);
12156 PyErr_SetString(PyExc_TypeError,
12157 "%s argument has non-string str()");
12158 goto onError;
12159 }
12160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (PyUnicode_READY(temp) == -1) {
12162 Py_CLEAR(temp);
12163 goto onError;
12164 }
12165 pbuf = PyUnicode_DATA(temp);
12166 kind = PyUnicode_KIND(temp);
12167 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 if (prec >= 0 && len > prec)
12169 len = prec;
12170 break;
12171
12172 case 'i':
12173 case 'd':
12174 case 'u':
12175 case 'o':
12176 case 'x':
12177 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012178 isnumok = 0;
12179 if (PyNumber_Check(v)) {
12180 PyObject *iobj=NULL;
12181
12182 if (PyLong_Check(v)) {
12183 iobj = v;
12184 Py_INCREF(iobj);
12185 }
12186 else {
12187 iobj = PyNumber_Long(v);
12188 }
12189 if (iobj!=NULL) {
12190 if (PyLong_Check(iobj)) {
12191 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012192 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012193 Py_DECREF(iobj);
12194 if (!temp)
12195 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (PyUnicode_READY(temp) == -1) {
12197 Py_CLEAR(temp);
12198 goto onError;
12199 }
12200 pbuf = PyUnicode_DATA(temp);
12201 kind = PyUnicode_KIND(temp);
12202 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 sign = 1;
12204 }
12205 else {
12206 Py_DECREF(iobj);
12207 }
12208 }
12209 }
12210 if (!isnumok) {
12211 PyErr_Format(PyExc_TypeError,
12212 "%%%c format: a number is required, "
12213 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12214 goto onError;
12215 }
12216 if (flags & F_ZERO)
12217 fill = '0';
12218 break;
12219
12220 case 'e':
12221 case 'E':
12222 case 'f':
12223 case 'F':
12224 case 'g':
12225 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012226 temp = formatfloat(v, flags, prec, c);
12227 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (PyUnicode_READY(temp) == -1) {
12230 Py_CLEAR(temp);
12231 goto onError;
12232 }
12233 pbuf = PyUnicode_DATA(temp);
12234 kind = PyUnicode_KIND(temp);
12235 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 sign = 1;
12237 if (flags & F_ZERO)
12238 fill = '0';
12239 break;
12240
12241 case 'c':
12242 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12245 if (len < 0)
12246 goto onError;
12247 break;
12248
12249 default:
12250 PyErr_Format(PyExc_ValueError,
12251 "unsupported format character '%c' (0x%x) "
12252 "at index %zd",
12253 (31<=c && c<=126) ? (char)c : '?',
12254 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 goto onError;
12257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 /* pbuf is initialized here. */
12259 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12262 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12263 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 len--;
12265 }
12266 else if (flags & F_SIGN)
12267 sign = '+';
12268 else if (flags & F_BLANK)
12269 sign = ' ';
12270 else
12271 sign = 0;
12272 }
12273 if (width < len)
12274 width = len;
12275 if (rescnt - (sign != 0) < width) {
12276 reslen -= rescnt;
12277 rescnt = width + fmtcnt + 100;
12278 reslen += rescnt;
12279 if (reslen < 0) {
12280 Py_XDECREF(temp);
12281 PyErr_NoMemory();
12282 goto onError;
12283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12285 if (res0 == 0) {
12286 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 Py_XDECREF(temp);
12288 goto onError;
12289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 }
12292 if (sign) {
12293 if (fill != ' ')
12294 *res++ = sign;
12295 rescnt--;
12296 if (width > len)
12297 width--;
12298 }
12299 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12301 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012302 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12304 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 }
12306 rescnt -= 2;
12307 width -= 2;
12308 if (width < 0)
12309 width = 0;
12310 len -= 2;
12311 }
12312 if (width > len && !(flags & F_LJUST)) {
12313 do {
12314 --rescnt;
12315 *res++ = fill;
12316 } while (--width > len);
12317 }
12318 if (fill == ' ') {
12319 if (sign)
12320 *res++ = sign;
12321 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12323 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12324 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12325 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012326 }
12327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 /* Copy all characters, preserving len */
12329 len1 = len;
12330 while (len1--) {
12331 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12332 rescnt--;
12333 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 while (--width >= len) {
12335 --rescnt;
12336 *res++ = ' ';
12337 }
12338 if (dict && (argidx < arglen) && c != '%') {
12339 PyErr_SetString(PyExc_TypeError,
12340 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012341 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 goto onError;
12343 }
12344 Py_XDECREF(temp);
12345 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346 } /* until end */
12347 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 PyErr_SetString(PyExc_TypeError,
12349 "not all arguments converted during string formatting");
12350 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351 }
12352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353
12354 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12355 if (*res > max)
12356 max = *res;
12357 result = PyUnicode_New(reslen - rescnt, max);
12358 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 kind = PyUnicode_KIND(result);
12361 for (res = res0; res < res0+reslen-rescnt; res++)
12362 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12363 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 }
12367 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 return (PyObject *)result;
12369
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372 Py_DECREF(uformat);
12373 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375 }
12376 return NULL;
12377}
12378
Jeremy Hylton938ace62002-07-17 16:30:39 +000012379static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012380unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12381
Tim Peters6d6c1a32001-08-02 04:15:00 +000012382static PyObject *
12383unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12384{
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386 static char *kwlist[] = {"object", "encoding", "errors", 0};
12387 char *encoding = NULL;
12388 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012389
Benjamin Peterson14339b62009-01-31 16:36:08 +000012390 if (type != &PyUnicode_Type)
12391 return unicode_subtype_new(type, args, kwds);
12392 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 return NULL;
12395 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 if (encoding == NULL && errors == NULL)
12398 return PyObject_Str(x);
12399 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012401}
12402
Guido van Rossume023fe02001-08-30 03:12:59 +000012403static PyObject *
12404unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 PyUnicodeObject *tmp, *pnew;
12407 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012409
Benjamin Peterson14339b62009-01-31 16:36:08 +000012410 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12411 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12412 if (tmp == NULL)
12413 return NULL;
12414 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12416 // it seems kind of strange that tp_alloc gets passed the size
12417 // of the unicode string because there will follow another
12418 // malloc.
12419 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12420 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012421 if (pnew == NULL) {
12422 Py_DECREF(tmp);
12423 return NULL;
12424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12426 if (_PyUnicode_WSTR(pnew) == NULL) {
12427 err = PyErr_NoMemory();
12428 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12431 _PyUnicode_WSTR_LENGTH(pnew) = n;
12432 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12433 _PyUnicode_STATE(pnew).interned = 0;
12434 _PyUnicode_STATE(pnew).kind = 0;
12435 _PyUnicode_STATE(pnew).compact = 0;
12436 _PyUnicode_STATE(pnew).ready = 0;
12437 _PyUnicode_STATE(pnew).ascii = 0;
12438 pnew->data.any = NULL;
12439 _PyUnicode_LENGTH(pnew) = 0;
12440 pnew->_base.utf8 = NULL;
12441 pnew->_base.utf8_length = 0;
12442
12443 if (PyUnicode_READY(pnew) == -1) {
12444 PyObject_FREE(_PyUnicode_WSTR(pnew));
12445 goto onError;
12446 }
12447
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 Py_DECREF(tmp);
12449 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450
12451 onError:
12452 _Py_ForgetReference((PyObject *)pnew);
12453 PyObject_Del(pnew);
12454 Py_DECREF(tmp);
12455 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012456}
12457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012458PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012460\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012461Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012462encoding defaults to the current default string encoding.\n\
12463errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012464
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012465static PyObject *unicode_iter(PyObject *seq);
12466
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012468 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 "str", /* tp_name */
12470 sizeof(PyUnicodeObject), /* tp_size */
12471 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012473 (destructor)unicode_dealloc, /* tp_dealloc */
12474 0, /* tp_print */
12475 0, /* tp_getattr */
12476 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012477 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 unicode_repr, /* tp_repr */
12479 &unicode_as_number, /* tp_as_number */
12480 &unicode_as_sequence, /* tp_as_sequence */
12481 &unicode_as_mapping, /* tp_as_mapping */
12482 (hashfunc) unicode_hash, /* tp_hash*/
12483 0, /* tp_call*/
12484 (reprfunc) unicode_str, /* tp_str */
12485 PyObject_GenericGetAttr, /* tp_getattro */
12486 0, /* tp_setattro */
12487 0, /* tp_as_buffer */
12488 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 unicode_doc, /* tp_doc */
12491 0, /* tp_traverse */
12492 0, /* tp_clear */
12493 PyUnicode_RichCompare, /* tp_richcompare */
12494 0, /* tp_weaklistoffset */
12495 unicode_iter, /* tp_iter */
12496 0, /* tp_iternext */
12497 unicode_methods, /* tp_methods */
12498 0, /* tp_members */
12499 0, /* tp_getset */
12500 &PyBaseObject_Type, /* tp_base */
12501 0, /* tp_dict */
12502 0, /* tp_descr_get */
12503 0, /* tp_descr_set */
12504 0, /* tp_dictoffset */
12505 0, /* tp_init */
12506 0, /* tp_alloc */
12507 unicode_new, /* tp_new */
12508 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509};
12510
12511/* Initialize the Unicode implementation */
12512
Thomas Wouters78890102000-07-22 19:25:51 +000012513void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012515 int i;
12516
Thomas Wouters477c8d52006-05-27 19:21:47 +000012517 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012519 0x000A, /* LINE FEED */
12520 0x000D, /* CARRIAGE RETURN */
12521 0x001C, /* FILE SEPARATOR */
12522 0x001D, /* GROUP SEPARATOR */
12523 0x001E, /* RECORD SEPARATOR */
12524 0x0085, /* NEXT LINE */
12525 0x2028, /* LINE SEPARATOR */
12526 0x2029, /* PARAGRAPH SEPARATOR */
12527 };
12528
Fred Drakee4315f52000-05-09 19:53:39 +000012529 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012531 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012533
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012534 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012536 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012537 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538
12539 /* initialize the linebreak bloom filter */
12540 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 PyUnicode_2BYTE_KIND, linebreak,
12542 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012543
12544 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545}
12546
12547/* Finalize the Unicode implementation */
12548
/* Releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12554
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555void
Thomas Wouters78890102000-07-22 19:25:51 +000012556_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012558 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012560 Py_XDECREF(unicode_empty);
12561 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012562
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012563 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 if (unicode_latin1[i]) {
12565 Py_DECREF(unicode_latin1[i]);
12566 unicode_latin1[i] = NULL;
12567 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012568 }
Christian Heimesa156e092008-02-16 07:38:31 +000012569 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012571
Walter Dörwald16807132007-05-25 13:52:07 +000012572void
12573PyUnicode_InternInPlace(PyObject **p)
12574{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012575 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12576 PyObject *t;
12577 if (s == NULL || !PyUnicode_Check(s))
12578 Py_FatalError(
12579 "PyUnicode_InternInPlace: unicode strings only please!");
12580 /* If it's a subclass, we don't really know what putting
12581 it in the interned dict might do. */
12582 if (!PyUnicode_CheckExact(s))
12583 return;
12584 if (PyUnicode_CHECK_INTERNED(s))
12585 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_READY(s) == -1) {
12587 assert(0 && "ready fail in intern...");
12588 return;
12589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 if (interned == NULL) {
12591 interned = PyDict_New();
12592 if (interned == NULL) {
12593 PyErr_Clear(); /* Don't leave an exception */
12594 return;
12595 }
12596 }
12597 /* It might be that the GetItem call fails even
12598 though the key is present in the dictionary,
12599 namely when this happens during a stack overflow. */
12600 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012603
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 if (t) {
12605 Py_INCREF(t);
12606 Py_DECREF(*p);
12607 *p = t;
12608 return;
12609 }
Walter Dörwald16807132007-05-25 13:52:07 +000012610
Benjamin Peterson14339b62009-01-31 16:36:08 +000012611 PyThreadState_GET()->recursion_critical = 1;
12612 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12613 PyErr_Clear();
12614 PyThreadState_GET()->recursion_critical = 0;
12615 return;
12616 }
12617 PyThreadState_GET()->recursion_critical = 0;
12618 /* The two references in interned are not counted by refcnt.
12619 The deallocator will take care of this */
12620 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012622}
12623
12624void
12625PyUnicode_InternImmortal(PyObject **p)
12626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12628
Benjamin Peterson14339b62009-01-31 16:36:08 +000012629 PyUnicode_InternInPlace(p);
12630 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012632 Py_INCREF(*p);
12633 }
Walter Dörwald16807132007-05-25 13:52:07 +000012634}
12635
12636PyObject *
12637PyUnicode_InternFromString(const char *cp)
12638{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012639 PyObject *s = PyUnicode_FromString(cp);
12640 if (s == NULL)
12641 return NULL;
12642 PyUnicode_InternInPlace(&s);
12643 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012644}
12645
Alexander Belopolsky40018472011-02-26 01:02:56 +000012646void
12647_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012648{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012649 PyObject *keys;
12650 PyUnicodeObject *s;
12651 Py_ssize_t i, n;
12652 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012653
Benjamin Peterson14339b62009-01-31 16:36:08 +000012654 if (interned == NULL || !PyDict_Check(interned))
12655 return;
12656 keys = PyDict_Keys(interned);
12657 if (keys == NULL || !PyList_Check(keys)) {
12658 PyErr_Clear();
12659 return;
12660 }
Walter Dörwald16807132007-05-25 13:52:07 +000012661
Benjamin Peterson14339b62009-01-31 16:36:08 +000012662 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12663 detector, interned unicode strings are not forcibly deallocated;
12664 rather, we give them their stolen references back, and then clear
12665 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012666
Benjamin Peterson14339b62009-01-31 16:36:08 +000012667 n = PyList_GET_SIZE(keys);
12668 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012670 for (i = 0; i < n; i++) {
12671 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 if (PyUnicode_READY(s) == -1)
12673 fprintf(stderr, "could not ready string\n");
12674 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 case SSTATE_NOT_INTERNED:
12676 /* XXX Shouldn't happen */
12677 break;
12678 case SSTATE_INTERNED_IMMORTAL:
12679 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 break;
12682 case SSTATE_INTERNED_MORTAL:
12683 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 break;
12686 default:
12687 Py_FatalError("Inconsistent interned string state.");
12688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012690 }
12691 fprintf(stderr, "total size of all interned strings: "
12692 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12693 "mortal/immortal\n", mortal_size, immortal_size);
12694 Py_DECREF(keys);
12695 PyDict_Clear(interned);
12696 Py_DECREF(interned);
12697 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012698}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012699
12700
12701/********************* Unicode Iterator **************************/
12702
12703typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 PyObject_HEAD
12705 Py_ssize_t it_index;
12706 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012707} unicodeiterobject;
12708
12709static void
12710unicodeiter_dealloc(unicodeiterobject *it)
12711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012712 _PyObject_GC_UNTRACK(it);
12713 Py_XDECREF(it->it_seq);
12714 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012715}
12716
12717static int
12718unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 Py_VISIT(it->it_seq);
12721 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012722}
12723
12724static PyObject *
12725unicodeiter_next(unicodeiterobject *it)
12726{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012727 PyUnicodeObject *seq;
12728 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012729
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730 assert(it != NULL);
12731 seq = it->it_seq;
12732 if (seq == NULL)
12733 return NULL;
12734 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12737 int kind = PyUnicode_KIND(seq);
12738 void *data = PyUnicode_DATA(seq);
12739 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12740 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 if (item != NULL)
12742 ++it->it_index;
12743 return item;
12744 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012745
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 Py_DECREF(seq);
12747 it->it_seq = NULL;
12748 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012749}
12750
12751static PyObject *
12752unicodeiter_len(unicodeiterobject *it)
12753{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012754 Py_ssize_t len = 0;
12755 if (it->it_seq)
12756 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12757 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012758}
12759
12760PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12761
12762static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012765 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012766};
12767
12768PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12770 "str_iterator", /* tp_name */
12771 sizeof(unicodeiterobject), /* tp_basicsize */
12772 0, /* tp_itemsize */
12773 /* methods */
12774 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12775 0, /* tp_print */
12776 0, /* tp_getattr */
12777 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012778 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 0, /* tp_repr */
12780 0, /* tp_as_number */
12781 0, /* tp_as_sequence */
12782 0, /* tp_as_mapping */
12783 0, /* tp_hash */
12784 0, /* tp_call */
12785 0, /* tp_str */
12786 PyObject_GenericGetAttr, /* tp_getattro */
12787 0, /* tp_setattro */
12788 0, /* tp_as_buffer */
12789 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12790 0, /* tp_doc */
12791 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12792 0, /* tp_clear */
12793 0, /* tp_richcompare */
12794 0, /* tp_weaklistoffset */
12795 PyObject_SelfIter, /* tp_iter */
12796 (iternextfunc)unicodeiter_next, /* tp_iternext */
12797 unicodeiter_methods, /* tp_methods */
12798 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012799};
12800
12801static PyObject *
12802unicode_iter(PyObject *seq)
12803{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012805
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 if (!PyUnicode_Check(seq)) {
12807 PyErr_BadInternalCall();
12808 return NULL;
12809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 if (PyUnicode_READY(seq) == -1)
12811 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12813 if (it == NULL)
12814 return NULL;
12815 it->it_index = 0;
12816 Py_INCREF(seq);
12817 it->it_seq = (PyUnicodeObject *)seq;
12818 _PyObject_GC_TRACK(it);
12819 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012820}
12821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822#define UNIOP(x) Py_UNICODE_##x
12823#define UNIOP_t Py_UNICODE
12824#include "uniops.h"
12825#undef UNIOP
12826#undef UNIOP_t
12827#define UNIOP(x) Py_UCS4_##x
12828#define UNIOP_t Py_UCS4
12829#include "uniops.h"
12830#undef UNIOP
12831#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012832
Victor Stinner71133ff2010-09-01 23:43:53 +000012833Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012834PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012835{
12836 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12837 Py_UNICODE *copy;
12838 Py_ssize_t size;
12839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 if (!PyUnicode_Check(unicode)) {
12841 PyErr_BadArgument();
12842 return NULL;
12843 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012844 /* Ensure we won't overflow the size. */
12845 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12846 PyErr_NoMemory();
12847 return NULL;
12848 }
12849 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12850 size *= sizeof(Py_UNICODE);
12851 copy = PyMem_Malloc(size);
12852 if (copy == NULL) {
12853 PyErr_NoMemory();
12854 return NULL;
12855 }
12856 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12857 return copy;
12858}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012859
Georg Brandl66c221e2010-10-14 07:04:07 +000012860/* A _string module, to export formatter_parser and formatter_field_name_split
12861 to the string.Formatter class implemented in Python. */
12862
12863static PyMethodDef _string_methods[] = {
12864 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12865 METH_O, PyDoc_STR("split the argument as a field name")},
12866 {"formatter_parser", (PyCFunction) formatter_parser,
12867 METH_O, PyDoc_STR("parse the argument as a format string")},
12868 {NULL, NULL}
12869};
12870
12871static struct PyModuleDef _string_module = {
12872 PyModuleDef_HEAD_INIT,
12873 "_string",
12874 PyDoc_STR("string helper module"),
12875 0,
12876 _string_methods,
12877 NULL,
12878 NULL,
12879 NULL,
12880 NULL
12881};
12882
12883PyMODINIT_FUNC
12884PyInit__string(void)
12885{
12886 return PyModule_Create(&_string_module);
12887}
12888
12889
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012890#ifdef __cplusplus
12891}
12892#endif