blob: bcf26d8d804d024f3bfea519e62243bc633540cb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Element-wise copy between character buffers of different widths.

   from_type and to_type must be valid type names.  begin and end delimit
   the source range [begin, end) and must be of type "from_type *".  to
   points at the destination buffer; it is cast to "to_type *" and receives
   one converted (plain C cast) element per source element.

   Note that end is re-evaluated on every iteration, while begin and to are
   evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                              \
        const from_type *iter_ = (begin);                             \
        to_type *to_ = (to_type *)(to);                               \
        while (iter_ < (end)) {                                       \
            *to_++ = (to_type)*iter_++;                               \
        }                                                             \
    } while (0)
107
/* Raw field accessors for string objects.  Each macro casts op to the
   struct that declares the field and yields the field itself, so the
   result can be used as an lvalue.  None of these check the type of op. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Same as reading state.kind directly, but asserts (in builds where
   assert() is active) that op is a str object first. */
#define _PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* Same as _PyUnicode_LENGTH, with a PyUnicode_Check() assertion. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     ((PyASCIIObject *)(op))->length)
119
120
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character's code point. */
static PyUnicodeObject *unicode_latin1[256];
137
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   the character is whitespace and 0 otherwise.  The entries set to 1 are:

     0x09 CHARACTER TABULATION    0x0A LINE FEED
     0x0B LINE TABULATION         0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR          0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR        0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
168
Alexander Belopolsky40018472011-02-26 01:02:56 +0000169static PyObject *
170unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000171 PyObject **errorHandler,const char *encoding, const char *reason,
172 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
173 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
174
Alexander Belopolsky40018472011-02-26 01:02:56 +0000175static void
176raise_encode_exception(PyObject **exceptionObject,
177 const char *encoding,
178 const Py_UNICODE *unicode, Py_ssize_t size,
179 Py_ssize_t startpos, Py_ssize_t endpos,
180 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000181
/* Fast detection of ASCII line break characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   the character is a line break and 0 otherwise.  The table is only ever
   read, so it is declared const.  The entries set to 1 are:

     0x0A LINE FEED               0x0B LINE TABULATION
     0x0C FORM FEED               0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR          0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
209
210
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000211Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000212PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000213{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000214#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000215 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000216#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000217 /* This is actually an illegal character, so it should
218 not be passed to unichr. */
219 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000220#endif
221}
222
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223/* --- Bloom Filters ----------------------------------------------------- */
224
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */
228
229/* the linebreak mask is set up by Unicode_Init below */
230
Antoine Pitrouf068f942010-01-13 14:19:12 +0000231#if LONG_BIT >= 128
232#define BLOOM_WIDTH 128
233#elif LONG_BIT >= 64
234#define BLOOM_WIDTH 64
235#elif LONG_BIT >= 32
236#define BLOOM_WIDTH 32
237#else
238#error "LONG_BIT is smaller than 32"
239#endif
240
Thomas Wouters477c8d52006-05-27 19:21:47 +0000241#define BLOOM_MASK unsigned long
242
243static BLOOM_MASK bloom_linebreak;
244
Antoine Pitrouf068f942010-01-13 14:19:12 +0000245#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
246#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000247
Benjamin Peterson29060642009-01-31 22:14:21 +0000248#define BLOOM_LINEBREAK(ch) \
249 ((ch) < 128U ? ascii_linebreak[(ch)] : \
250 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000251
Alexander Belopolsky40018472011-02-26 01:02:56 +0000252Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200253make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000254{
255 /* calculate simple bloom-style bitmask for a given unicode string */
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258 Py_ssize_t i;
259
260 mask = 0;
261 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200262 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
264 return mask;
265}
266
/* Exact membership test of chr in str, using mask as a cheap pre-filter:
   PyUnicode_FindChar() over the whole of str is only called when the bloom
   bit for chr is set in mask.  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271/* --- Unicode Object ----------------------------------------------------- */
272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273static PyObject *
274substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
275
276static PyObject *
277fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
278
279Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
280 Py_ssize_t size, Py_UCS4 ch,
281 int direction)
282{
283 /* like wcschr, but doesn't stop at NULL characters */
284 Py_ssize_t i;
285 if (direction == 1) {
286 for(i = 0; i < size; i++)
287 if (PyUnicode_READ(kind, s, i) == ch)
288 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
289 }
290 else {
291 for(i = size-1; i >= 0; i--)
292 if (PyUnicode_READ(kind, s, i) == ch)
293 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
294 }
295 return NULL;
296}
297
Alexander Belopolsky40018472011-02-26 01:02:56 +0000298static int
299unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200300 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301{
302 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200304 /* Resizing is only supported for old unicode objects. */
305 assert(!PyUnicode_IS_COMPACT(unicode));
306 assert(_PyUnicode_WSTR(unicode) != NULL);
307
308 /* ... and only if they have not been readied yet, because
309 callees usually rely on the wstr representation when resizing. */
310 assert(unicode->data.any == NULL);
311
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000312 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200313 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 /* Resizing shared object (unicode_empty or single character
317 objects) in-place is not allowed. Use PyUnicode_Resize()
318 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
322 _PyUnicode_WSTR(unicode)[0] < 256U &&
323 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000325 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 return -1;
327 }
328
Thomas Wouters477c8d52006-05-27 19:21:47 +0000329 /* We allocate one more byte to make sure the string is Ux0000 terminated.
330 The overallocation is also used by fastsearch, which assumes that it's
331 safe to look at str[length] (without making any assumptions about what
332 it contains). */
333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200334 oldstr = _PyUnicode_WSTR(unicode);
335 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
336 sizeof(Py_UNICODE) * (length + 1));
337 if (!_PyUnicode_WSTR(unicode)) {
338 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000339 PyErr_NoMemory();
340 return -1;
341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 _PyUnicode_WSTR(unicode)[length] = 0;
343 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 if (unicode->data.any != NULL) {
347 PyObject_FREE(unicode->data.any);
348 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
349 PyObject_FREE(unicode->_base.utf8);
350 }
351 unicode->_base.utf8 = NULL;
352 unicode->_base.utf8_length = 0;
353 unicode->data.any = NULL;
354 _PyUnicode_LENGTH(unicode) = 0;
355 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
356 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 _PyUnicode_HASH(unicode) = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000359
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 return 0;
361}
362
/* We allocate room for one extra Py_UNICODE character to make sure the
   string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366
367 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369
370*/
371
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it goes
   on to allocate a new legacy string object. */
int unicode_old_new_calls = 0;
#endif
375
Alexander Belopolsky40018472011-02-26 01:02:56 +0000376static PyUnicodeObject *
377_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378{
379 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
Thomas Wouters477c8d52006-05-27 19:21:47 +0000382 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 if (length == 0 && unicode_empty != NULL) {
384 Py_INCREF(unicode_empty);
385 return unicode_empty;
386 }
387
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000388 /* Ensure we won't overflow the size. */
389 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
390 return (PyUnicodeObject *)PyErr_NoMemory();
391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 if (length < 0) {
393 PyErr_SetString(PyExc_SystemError,
394 "Negative size passed to _PyUnicode_New");
395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396 }
397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200398#ifdef Py_DEBUG
399 ++unicode_old_new_calls;
400#endif
401
402 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
403 if (unicode == NULL)
404 return NULL;
405 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
406 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
407 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 PyErr_NoMemory();
409 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200411
Jeremy Hyltond8082792003-09-16 19:41:39 +0000412 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000413 * the caller fails before initializing str -- unicode_resize()
414 * reads str[0], and the Keep-Alive optimization can keep memory
415 * allocated for str alive across a call to unicode_dealloc(unicode).
416 * We don't want unicode_resize to read uninitialized memory in
417 * that case.
418 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 _PyUnicode_WSTR(unicode)[0] = 0;
420 _PyUnicode_WSTR(unicode)[length] = 0;
421 _PyUnicode_WSTR_LENGTH(unicode) = length;
422 _PyUnicode_HASH(unicode) = -1;
423 _PyUnicode_STATE(unicode).interned = 0;
424 _PyUnicode_STATE(unicode).kind = 0;
425 _PyUnicode_STATE(unicode).compact = 0;
426 _PyUnicode_STATE(unicode).ready = 0;
427 _PyUnicode_STATE(unicode).ascii = 0;
428 unicode->data.any = NULL;
429 _PyUnicode_LENGTH(unicode) = 0;
430 unicode->_base.utf8 = NULL;
431 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000435 /* XXX UNREF/NEWREF interface should be more symmetrical */
436 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000437 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000438 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440}
441
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() each time it goes
   on to allocate a new compact string object. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print the object address, its compact / compact-ASCII flags and the
   candidate data addresses (just past the ASCII header, just past the
   compact header, and the _PyUnicode_COMPACT_DATA() result) to stdout,
   then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
463
464PyObject *
465PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
466{
467 PyObject *obj;
468 PyCompactUnicodeObject *unicode;
469 void *data;
470 int kind_state;
471 int is_sharing = 0, is_ascii = 0;
472 Py_ssize_t char_size;
473 Py_ssize_t struct_size;
474
475 /* Optimization for empty strings */
476 if (size == 0 && unicode_empty != NULL) {
477 Py_INCREF(unicode_empty);
478 return (PyObject *)unicode_empty;
479 }
480
481#ifdef Py_DEBUG
482 ++unicode_new_new_calls;
483#endif
484
485 struct_size = sizeof(PyCompactUnicodeObject);
486 if (maxchar < 128) {
487 kind_state = PyUnicode_1BYTE_KIND;
488 char_size = 1;
489 is_ascii = 1;
490 struct_size = sizeof(PyASCIIObject);
491 }
492 else if (maxchar < 256) {
493 kind_state = PyUnicode_1BYTE_KIND;
494 char_size = 1;
495 }
496 else if (maxchar < 65536) {
497 kind_state = PyUnicode_2BYTE_KIND;
498 char_size = 2;
499 if (sizeof(wchar_t) == 2)
500 is_sharing = 1;
501 }
502 else {
503 kind_state = PyUnicode_4BYTE_KIND;
504 char_size = 4;
505 if (sizeof(wchar_t) == 4)
506 is_sharing = 1;
507 }
508
509 /* Ensure we won't overflow the size. */
510 if (size < 0) {
511 PyErr_SetString(PyExc_SystemError,
512 "Negative size passed to PyUnicode_New");
513 return NULL;
514 }
515 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
516 return PyErr_NoMemory();
517
518 /* Duplicated allocation code from _PyObject_New() instead of a call to
519 * PyObject_New() so we are able to allocate space for the object and
520 * it's data buffer.
521 */
522 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
523 if (obj == NULL)
524 return PyErr_NoMemory();
525 obj = PyObject_INIT(obj, &PyUnicode_Type);
526 if (obj == NULL)
527 return NULL;
528
529 unicode = (PyCompactUnicodeObject *)obj;
530 if (is_ascii)
531 data = ((PyASCIIObject*)obj) + 1;
532 else
533 data = unicode + 1;
534 _PyUnicode_LENGTH(unicode) = size;
535 _PyUnicode_HASH(unicode) = -1;
536 _PyUnicode_STATE(unicode).interned = 0;
537 _PyUnicode_STATE(unicode).kind = kind_state;
538 _PyUnicode_STATE(unicode).compact = 1;
539 _PyUnicode_STATE(unicode).ready = 1;
540 _PyUnicode_STATE(unicode).ascii = is_ascii;
541 if (is_ascii) {
542 ((char*)data)[size] = 0;
543 _PyUnicode_WSTR(unicode) = NULL;
544 }
545 else if (kind_state == PyUnicode_1BYTE_KIND) {
546 ((char*)data)[size] = 0;
547 _PyUnicode_WSTR(unicode) = NULL;
548 _PyUnicode_WSTR_LENGTH(unicode) = 0;
549 unicode->utf8_length = 0;
550 unicode->utf8 = NULL;
551 }
552 else {
553 unicode->utf8 = NULL;
554 if (kind_state == PyUnicode_2BYTE_KIND)
555 ((Py_UCS2*)data)[size] = 0;
556 else /* kind_state == PyUnicode_4BYTE_KIND */
557 ((Py_UCS4*)data)[size] = 0;
558 if (is_sharing) {
559 _PyUnicode_WSTR_LENGTH(unicode) = size;
560 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
561 }
562 else {
563 _PyUnicode_WSTR_LENGTH(unicode) = 0;
564 _PyUnicode_WSTR(unicode) = NULL;
565 }
566 }
567 return obj;
568}
569
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of unicode.  A high surrogate that is
   directly followed by a low surrogate inside the range is combined into a
   single code point; every other unit is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            /* Ordinary unit, or a surrogate without a partner. */
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
608
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200609Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
611 PyObject *from, Py_ssize_t from_start,
612 Py_ssize_t how_many)
613{
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200614 unsigned int from_kind;
615 unsigned int to_kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200616
617 assert(PyUnicode_Check(from));
618 assert(PyUnicode_Check(to));
619
620 if (PyUnicode_READY(from))
621 return -1;
622 if (PyUnicode_READY(to))
623 return -1;
624
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200625 how_many = PY_MIN(PyUnicode_GET_LENGTH(from), how_many);
626 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
627 PyErr_Format(PyExc_ValueError,
628 "Cannot write %zi characters at %zi "
629 "in a string of %zi characters",
630 how_many, to_start, PyUnicode_GET_LENGTH(to));
631 return -1;
632 }
633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634 from_kind = PyUnicode_KIND(from);
635 to_kind = PyUnicode_KIND(to);
636
637 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 /* fast path */
639 Py_MEMCPY((char*)PyUnicode_DATA(to)
640 + PyUnicode_KIND_SIZE(to_kind, to_start),
641 (char*)PyUnicode_DATA(from)
642 + PyUnicode_KIND_SIZE(from_kind, from_start),
643 PyUnicode_KIND_SIZE(to_kind, how_many));
644 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200646
647 if (from_kind > to_kind) {
648 /* slow path to check for character overflow */
649 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
650 void *from_data = PyUnicode_DATA(from);
651 void *to_data = PyUnicode_DATA(to);
652 Py_UCS4 ch, maxchar;
653 Py_ssize_t i;
654 int overflow;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656 maxchar = 0;
657 for (i=0; i < how_many; i++) {
658 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
659 if (ch > maxchar) {
660 maxchar = ch;
661 if (maxchar > to_maxchar) {
662 overflow = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663 break;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200666 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
667 }
668 if (!overflow)
669 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 }
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200671 else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
679 return how_many;
680 }
681 else if (from_kind == PyUnicode_1BYTE_KIND
682 && to_kind == PyUnicode_4BYTE_KIND)
683 {
684 _PyUnicode_CONVERT_BYTES(
685 Py_UCS1, Py_UCS4,
686 PyUnicode_1BYTE_DATA(from) + from_start,
687 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
688 PyUnicode_4BYTE_DATA(to) + to_start
689 );
690 return how_many;
691 }
692 else if (from_kind == PyUnicode_2BYTE_KIND
693 && to_kind == PyUnicode_4BYTE_KIND)
694 {
695 _PyUnicode_CONVERT_BYTES(
696 Py_UCS2, Py_UCS4,
697 PyUnicode_2BYTE_DATA(from) + from_start,
698 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
699 PyUnicode_4BYTE_DATA(to) + to_start
700 );
701 return how_many;
702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 PyErr_Format(PyExc_ValueError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200704 "Cannot copy UCS%u characters "
705 "into a string of UCS%u characters",
706 1 << (from_kind - 1),
707 1 << (to_kind -1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 return -1;
709}
710
711int
712_PyUnicode_FindMaxCharAndNumSurrogatePairs(const wchar_t *begin,
713 const wchar_t *end,
714 Py_UCS4 *maxchar,
715 Py_ssize_t *num_surrogates)
716{
717 const wchar_t *iter;
718
719 if (num_surrogates == NULL || maxchar == NULL) {
720 PyErr_SetString(PyExc_SystemError,
721 "unexpected NULL arguments to "
722 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
723 return -1;
724 }
725
726 *num_surrogates = 0;
727 *maxchar = 0;
728
729 for (iter = begin; iter < end; ) {
730 if (*iter > *maxchar)
731 *maxchar = *iter;
732#if SIZEOF_WCHAR_T == 2
733 if (*iter >= 0xD800 && *iter <= 0xDBFF
734 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
735 {
736 Py_UCS4 surrogate_val;
737 surrogate_val = (((iter[0] & 0x3FF)<<10)
738 | (iter[1] & 0x3FF)) + 0x10000;
739 ++(*num_surrogates);
740 if (surrogate_val > *maxchar)
741 *maxchar = surrogate_val;
742 iter += 2;
743 }
744 else
745 iter++;
746#else
747 iter++;
748#endif
749 }
750 return 0;
751}
752
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_Ready() each time it is
   called on a string that is not ready yet. */
int unicode_ready_calls = 0;
#endif
756
757int
758_PyUnicode_Ready(PyUnicodeObject *unicode)
759{
760 wchar_t *end;
761 Py_UCS4 maxchar = 0;
762 Py_ssize_t num_surrogates;
763#if SIZEOF_WCHAR_T == 2
764 Py_ssize_t length_wo_surrogates;
765#endif
766
767 assert(PyUnicode_Check(unicode));
768
769 if (unicode->data.any != NULL) {
770 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
771 return 0;
772 }
773
774 /* _PyUnicode_Ready() is only intented for old-style API usage where
775 * strings were created using _PyObject_New() and where no canonical
776 * representation (the str field) has been set yet aka strings
777 * which are not yet ready.
778 */
779 assert(_PyUnicode_WSTR(unicode) != NULL);
780 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
781 assert(!PyUnicode_IS_COMPACT(unicode));
782 assert(!PyUnicode_IS_READY(unicode));
783 /* Actually, it should neither be interned nor be anything else: */
784 assert(_PyUnicode_STATE(unicode).interned == 0);
785 assert(unicode->_base.utf8 == NULL);
786
787#ifdef Py_DEBUG
788 ++unicode_ready_calls;
789#endif
790
791 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
792 if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(_PyUnicode_WSTR(unicode), end,
793 &maxchar,
794 &num_surrogates) == -1) {
795 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
796 return -1;
797 }
798
799 if (maxchar < 256) {
800 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
801 if (!unicode->data.any) {
802 PyErr_NoMemory();
803 return -1;
804 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200805 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 _PyUnicode_WSTR(unicode), end,
807 PyUnicode_1BYTE_DATA(unicode));
808 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
809 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
810 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
811 if (maxchar < 128) {
812 unicode->_base.utf8 = unicode->data.any;
813 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
814 }
815 else {
816 unicode->_base.utf8 = NULL;
817 unicode->_base.utf8_length = 0;
818 }
819 PyObject_FREE(_PyUnicode_WSTR(unicode));
820 _PyUnicode_WSTR(unicode) = NULL;
821 _PyUnicode_WSTR_LENGTH(unicode) = 0;
822 }
823 /* In this case we might have to convert down from 4-byte native
824 wchar_t to 2-byte unicode. */
825 else if (maxchar < 65536) {
826 assert(num_surrogates == 0 &&
827 "FindMaxCharAndNumSurrogatePairs() messed up");
828
829 if (sizeof(wchar_t) == 2) {
830 /* We can share representations and are done. */
831 unicode->data.any = _PyUnicode_WSTR(unicode);
832 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
834 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
835 unicode->_base.utf8 = NULL;
836 unicode->_base.utf8_length = 0;
837 }
838 else {
839 assert(sizeof(wchar_t) == 4);
840
841 unicode->data.any = PyObject_MALLOC(
842 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
843 if (!unicode->data.any) {
844 PyErr_NoMemory();
845 return -1;
846 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200847 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode), end,
849 PyUnicode_2BYTE_DATA(unicode));
850 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
851 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
852 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
853 unicode->_base.utf8 = NULL;
854 unicode->_base.utf8_length = 0;
855 PyObject_FREE(_PyUnicode_WSTR(unicode));
856 _PyUnicode_WSTR(unicode) = NULL;
857 _PyUnicode_WSTR_LENGTH(unicode) = 0;
858 }
859 }
860 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
861 else {
862#if SIZEOF_WCHAR_T == 2
863 /* in case the native representation is 2-bytes, we need to allocate a
864 new normalized 4-byte version. */
865 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
866 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
867 if (!unicode->data.any) {
868 PyErr_NoMemory();
869 return -1;
870 }
871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
873 unicode->_base.utf8 = NULL;
874 unicode->_base.utf8_length = 0;
875 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
876 unicode) < 0) {
877 assert(0 && "ConvertWideCharToUCS4 failed");
878 return -1;
879 }
880 PyObject_FREE(_PyUnicode_WSTR(unicode));
881 _PyUnicode_WSTR(unicode) = NULL;
882 _PyUnicode_WSTR_LENGTH(unicode) = 0;
883#else
884 assert(num_surrogates == 0);
885
886 unicode->data.any = _PyUnicode_WSTR(unicode);
887 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
888 unicode->_base.utf8 = NULL;
889 unicode->_base.utf8_length = 0;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891#endif
892 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
893 }
894 _PyUnicode_STATE(unicode).ready = 1;
895 return 0;
896}
897
Alexander Belopolsky40018472011-02-26 01:02:56 +0000898static void
899unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000900{
Walter Dörwald16807132007-05-25 13:52:07 +0000901 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000902 case SSTATE_NOT_INTERNED:
903 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000904
Benjamin Peterson29060642009-01-31 22:14:21 +0000905 case SSTATE_INTERNED_MORTAL:
906 /* revive dead object temporarily for DelItem */
907 Py_REFCNT(unicode) = 3;
908 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
909 Py_FatalError(
910 "deletion of interned string failed");
911 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000912
Benjamin Peterson29060642009-01-31 22:14:21 +0000913 case SSTATE_INTERNED_IMMORTAL:
914 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000915
Benjamin Peterson29060642009-01-31 22:14:21 +0000916 default:
917 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000918 }
919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 if (_PyUnicode_WSTR(unicode) &&
921 (!PyUnicode_IS_READY(unicode) ||
922 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
923 PyObject_DEL(_PyUnicode_WSTR(unicode));
924 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
925 PyObject_DEL(unicode->_base.utf8);
926
927 if (PyUnicode_IS_COMPACT(unicode)) {
928 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000929 }
930 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 if (unicode->data.any)
932 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000934 }
935}
936
Alexander Belopolsky40018472011-02-26 01:02:56 +0000937static int
938_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000939{
940 register PyUnicodeObject *v;
941
942 /* Argument checks */
943 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 PyErr_BadInternalCall();
945 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000946 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000947 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200948 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
949 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000950 PyErr_BadInternalCall();
951 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000952 }
953
954 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 possible since these are being shared.
956 The same goes for new-representation unicode objects or objects which
957 have already been readied.
958 For these, we simply return a fresh copy with the same Unicode content.
959 */
960 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
961 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
962 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000963 PyUnicodeObject *w = _PyUnicode_New(length);
964 if (w == NULL)
965 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
967 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 Py_DECREF(*unicode);
969 *unicode = w;
970 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000971 }
972
973 /* Note that we don't have to modify *unicode for unshared Unicode
974 objects, since we can modify them in-place. */
975 return unicode_resize(v, length);
976}
977
Alexander Belopolsky40018472011-02-26 01:02:56 +0000978int
979PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000980{
981 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
982}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984static PyObject*
985get_latin1_char(unsigned char ch)
986{
987 PyUnicodeObject *unicode = unicode_latin1[ch];
988 if (!unicode) {
989 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
990 if (!unicode)
991 return NULL;
992 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
993 unicode_latin1[ch] = unicode;
994 }
995 Py_INCREF(unicode);
996 return (PyObject *)unicode;
997}
998
Alexander Belopolsky40018472011-02-26 01:02:56 +0000999PyObject *
1000PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001{
1002 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 Py_UCS4 maxchar = 0;
1004 Py_ssize_t num_surrogates;
1005
1006 if (u == NULL)
1007 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001009 /* If the Unicode data is known at construction time, we can apply
1010 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012 /* Optimization for empty strings */
1013 if (size == 0 && unicode_empty != NULL) {
1014 Py_INCREF(unicode_empty);
1015 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001016 }
Tim Petersced69f82003-09-16 20:30:58 +00001017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 /* Single character Unicode objects in the Latin-1 range are
1019 shared when using this constructor */
1020 if (size == 1 && *u < 256)
1021 return get_latin1_char((unsigned char)*u);
1022
1023 /* If not empty and not single character, copy the Unicode data
1024 into the new object */
1025 if (_PyUnicode_FindMaxCharAndNumSurrogatePairs(u, u + size, &maxchar,
1026 &num_surrogates) == -1)
1027 return NULL;
1028
1029 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1030 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031 if (!unicode)
1032 return NULL;
1033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 switch (PyUnicode_KIND(unicode)) {
1035 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001036 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1038 break;
1039 case PyUnicode_2BYTE_KIND:
1040#if Py_UNICODE_SIZE == 2
1041 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1042#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001043 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1045#endif
1046 break;
1047 case PyUnicode_4BYTE_KIND:
1048#if SIZEOF_WCHAR_T == 2
1049 /* This is the only case which has to process surrogates, thus
1050 a simple copy loop is not enough and we need a function. */
1051 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1052 Py_DECREF(unicode);
1053 return NULL;
1054 }
1055#else
1056 assert(num_surrogates == 0);
1057 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1058#endif
1059 break;
1060 default:
1061 assert(0 && "Impossible state");
1062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063
1064 return (PyObject *)unicode;
1065}
1066
Alexander Belopolsky40018472011-02-26 01:02:56 +00001067PyObject *
1068PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001069{
1070 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001071
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 if (size < 0) {
1073 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001074 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001075 return NULL;
1076 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001077
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001078 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001079 some optimizations which share commonly used objects.
1080 Also, this means the input must be UTF-8, so fall back to the
1081 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001082 if (u != NULL) {
1083
Benjamin Peterson29060642009-01-31 22:14:21 +00001084 /* Optimization for empty strings */
1085 if (size == 0 && unicode_empty != NULL) {
1086 Py_INCREF(unicode_empty);
1087 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001088 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001089
1090 /* Single characters are shared when using this constructor.
1091 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001092 if (size == 1 && Py_CHARMASK(*u) < 128)
1093 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001094
1095 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 }
1097
Walter Dörwald55507312007-05-18 13:12:10 +00001098 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001099 if (!unicode)
1100 return NULL;
1101
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 return (PyObject *)unicode;
1103}
1104
Alexander Belopolsky40018472011-02-26 01:02:56 +00001105PyObject *
1106PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001107{
1108 size_t size = strlen(u);
1109 if (size > PY_SSIZE_T_MAX) {
1110 PyErr_SetString(PyExc_OverflowError, "input too long");
1111 return NULL;
1112 }
1113
1114 return PyUnicode_FromStringAndSize(u, size);
1115}
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117PyObject*
1118PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 PyObject *res;
1121 unsigned char max = 127;
1122 Py_ssize_t i;
1123 for (i = 0; i < size; i++) {
1124 if (u[i] & 0x80) {
1125 max = 255;
1126 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001127 }
1128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 res = PyUnicode_New(size, max);
1130 if (!res)
1131 return NULL;
1132 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1133 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001134}
1135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136PyObject*
1137PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1138{
1139 PyObject *res;
1140 Py_UCS2 max = 0;
1141 Py_ssize_t i;
1142 for (i = 0; i < size; i++)
1143 if (u[i] > max)
1144 max = u[i];
1145 res = PyUnicode_New(size, max);
1146 if (!res)
1147 return NULL;
1148 if (max >= 256)
1149 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1150 else
1151 for (i = 0; i < size; i++)
1152 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1153 return res;
1154}
1155
1156PyObject*
1157PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1158{
1159 PyObject *res;
1160 Py_UCS4 max = 0;
1161 Py_ssize_t i;
1162 for (i = 0; i < size; i++)
1163 if (u[i] > max)
1164 max = u[i];
1165 res = PyUnicode_New(size, max);
1166 if (!res)
1167 return NULL;
1168 if (max >= 0x10000)
1169 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1170 else {
1171 int kind = PyUnicode_KIND(res);
1172 void *data = PyUnicode_DATA(res);
1173 for (i = 0; i < size; i++)
1174 PyUnicode_WRITE(kind, data, i, u[i]);
1175 }
1176 return res;
1177}
1178
1179PyObject*
1180PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1181{
1182 switch(kind) {
1183 case PyUnicode_1BYTE_KIND:
1184 return PyUnicode_FromUCS1(buffer, size);
1185 case PyUnicode_2BYTE_KIND:
1186 return PyUnicode_FromUCS2(buffer, size);
1187 case PyUnicode_4BYTE_KIND:
1188 return PyUnicode_FromUCS4(buffer, size);
1189 }
1190 assert(0);
1191 return NULL;
1192}
1193
1194
1195/* Widen Unicode objects to larger buffers.
1196 Return NULL if the string is too wide already. */
1197
1198void*
1199_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1200{
1201 Py_ssize_t i;
1202 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1203 void *d = PyUnicode_DATA(s);
1204 unsigned int skind = PyUnicode_KIND(s);
1205 if (PyUnicode_KIND(s) >= kind) {
1206 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1207 return NULL;
1208 }
1209 switch(kind) {
1210 case PyUnicode_2BYTE_KIND: {
1211 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1212 if (!result) {
1213 PyErr_NoMemory();
1214 return 0;
1215 }
1216 for (i = 0; i < len; i++)
1217 result[i] = ((Py_UCS1*)d)[i];
1218 return result;
1219 }
1220 case PyUnicode_4BYTE_KIND: {
1221 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1222 if (!result) {
1223 PyErr_NoMemory();
1224 return 0;
1225 }
1226 for (i = 0; i < len; i++)
1227 result[i] = PyUnicode_READ(skind, d, i);
1228 return result;
1229 }
1230 }
1231 Py_FatalError("invalid kind");
1232 return NULL;
1233}
1234
1235static Py_UCS4*
1236as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1237 int copy_null)
1238{
1239 int kind;
1240 void *data;
1241 Py_ssize_t len, targetlen;
1242 if (PyUnicode_READY(string) == -1)
1243 return NULL;
1244 kind = PyUnicode_KIND(string);
1245 data = PyUnicode_DATA(string);
1246 len = PyUnicode_GET_LENGTH(string);
1247 targetlen = len;
1248 if (copy_null)
1249 targetlen++;
1250 if (!target) {
1251 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1252 PyErr_NoMemory();
1253 return NULL;
1254 }
1255 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1256 if (!target) {
1257 PyErr_NoMemory();
1258 return NULL;
1259 }
1260 }
1261 else {
1262 if (targetsize < targetlen) {
1263 PyErr_Format(PyExc_SystemError,
1264 "string is longer than the buffer");
1265 if (copy_null && 0 < targetsize)
1266 target[0] = 0;
1267 return NULL;
1268 }
1269 }
1270 if (kind != PyUnicode_4BYTE_KIND) {
1271 Py_ssize_t i;
1272 for (i = 0; i < len; i++)
1273 target[i] = PyUnicode_READ(kind, data, i);
1274 }
1275 else
1276 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1277 if (copy_null)
1278 target[len] = 0;
1279 return target;
1280}
1281
1282Py_UCS4*
1283PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1284 int copy_null)
1285{
1286 if (target == NULL || targetsize < 1) {
1287 PyErr_BadInternalCall();
1288 return NULL;
1289 }
1290 return as_ucs4(string, target, targetsize, copy_null);
1291}
1292
1293Py_UCS4*
1294PyUnicode_AsUCS4Copy(PyObject *string)
1295{
1296 return as_ucs4(string, NULL, 0, 1);
1297}
1298
#ifdef HAVE_WCHAR_H

/* Build a unicode object from `size` wchar_t units at `w`.
   A size of -1 means "measure the string with wcslen()".  With
   w == NULL, a size of 0 yields PyUnicode_New(0, 0) and any other size is
   rejected with PyErr_BadInternalCall() and NULL.  The actual
   construction is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nunits = (size == -1) ? (Py_ssize_t)wcslen(w) : size;

        return PyUnicode_FromUnicode(w, nunits);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001319
Walter Dörwald346737f2007-05-31 10:44:43 +00001320static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001321makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1322 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001323{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001324 *fmt++ = '%';
1325 if (width) {
1326 if (zeropad)
1327 *fmt++ = '0';
1328 fmt += sprintf(fmt, "%d", width);
1329 }
1330 if (precision)
1331 fmt += sprintf(fmt, ".%d", precision);
1332 if (longflag)
1333 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001334 else if (longlongflag) {
1335 /* longlongflag should only ever be nonzero on machines with
1336 HAVE_LONG_LONG defined */
1337#ifdef HAVE_LONG_LONG
1338 char *f = PY_FORMAT_LONG_LONG;
1339 while (*f)
1340 *fmt++ = *f++;
1341#else
1342 /* we shouldn't ever get here */
1343 assert(0);
1344 *fmt++ = 'l';
1345#endif
1346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001347 else if (size_tflag) {
1348 char *f = PY_FORMAT_SIZE_T;
1349 while (*f)
1350 *fmt++ = *f++;
1351 }
1352 *fmt++ = c;
1353 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001354}
1355
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a conversion specification.  The
   optional width, ".precision" and length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z") are parsed, and the results are
   stored through every output pointer that is not NULL.  A length
   modifier is only recognised, and skipped, when it is directly followed
   by 'd', 'u' or 'i'.

   The returned pointer addresses the character at which parsing stopped,
   normally the conversion character.  It is moved back by one in two
   cases: the precision digits are directly followed by '%', or the
   format string ends right after the width/precision. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1420
/* Buffer space reserved for one number formatted with %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Buffer space reserved for one number formatted with %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1428
Walter Dörwaldd2034312007-05-18 16:29:38 +00001429PyObject *
1430PyUnicode_FromFormatV(const char *format, va_list vargs)
1431{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 va_list count;
1433 Py_ssize_t callcount = 0;
1434 PyObject **callresults = NULL;
1435 PyObject **callresult = NULL;
1436 Py_ssize_t n = 0;
1437 int width = 0;
1438 int precision = 0;
1439 int zeropad;
1440 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001442 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001443 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1445 Py_UCS4 argmaxchar;
1446 Py_ssize_t numbersize = 0;
1447 char *numberresults = NULL;
1448 char *numberresult = NULL;
1449 Py_ssize_t i;
1450 int kind;
1451 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001452
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001453 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001454 /* step 1: count the number of %S/%R/%A/%s format specifications
1455 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1456 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 * result in an array)
1458 * also esimate a upper bound for all the number formats in the string,
1459 * numbers will be formated in step 3 and be keept in a '\0'-separated
1460 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001461 for (f = format; *f; f++) {
1462 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001463 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1465 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1466 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1467 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001470#ifdef HAVE_LONG_LONG
1471 if (longlongflag) {
1472 if (width < MAX_LONG_LONG_CHARS)
1473 width = MAX_LONG_LONG_CHARS;
1474 }
1475 else
1476#endif
1477 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1478 including sign. Decimal takes the most space. This
1479 isn't enough for octal. If a width is specified we
1480 need more (which we allocate later). */
1481 if (width < MAX_LONG_CHARS)
1482 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483
1484 /* account for the size + '\0' to separate numbers
1485 inside of the numberresults buffer */
1486 numbersize += (width + 1);
1487 }
1488 }
1489 else if ((unsigned char)*f > 127) {
1490 PyErr_Format(PyExc_ValueError,
1491 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1492 "string, got a non-ASCII byte: 0x%02x",
1493 (unsigned char)*f);
1494 return NULL;
1495 }
1496 }
1497 /* step 2: allocate memory for the results of
1498 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1499 if (callcount) {
1500 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1501 if (!callresults) {
1502 PyErr_NoMemory();
1503 return NULL;
1504 }
1505 callresult = callresults;
1506 }
1507 /* step 2.5: allocate memory for the results of formating numbers */
1508 if (numbersize) {
1509 numberresults = PyObject_Malloc(numbersize);
1510 if (!numberresults) {
1511 PyErr_NoMemory();
1512 goto fail;
1513 }
1514 numberresult = numberresults;
1515 }
1516
1517 /* step 3: format numbers and figure out how large a buffer we need */
1518 for (f = format; *f; f++) {
1519 if (*f == '%') {
1520 const char* p;
1521 int longflag;
1522 int longlongflag;
1523 int size_tflag;
1524 int numprinted;
1525
1526 p = f;
1527 zeropad = (f[1] == '0');
1528 f = parse_format_flags(f, &width, &precision,
1529 &longflag, &longlongflag, &size_tflag);
1530 switch (*f) {
1531 case 'c':
1532 {
1533 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001534 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 n++;
1536 break;
1537 }
1538 case '%':
1539 n++;
1540 break;
1541 case 'i':
1542 case 'd':
1543 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1544 width, precision, *f);
1545 if (longflag)
1546 numprinted = sprintf(numberresult, fmt,
1547 va_arg(count, long));
1548#ifdef HAVE_LONG_LONG
1549 else if (longlongflag)
1550 numprinted = sprintf(numberresult, fmt,
1551 va_arg(count, PY_LONG_LONG));
1552#endif
1553 else if (size_tflag)
1554 numprinted = sprintf(numberresult, fmt,
1555 va_arg(count, Py_ssize_t));
1556 else
1557 numprinted = sprintf(numberresult, fmt,
1558 va_arg(count, int));
1559 n += numprinted;
1560 /* advance by +1 to skip over the '\0' */
1561 numberresult += (numprinted + 1);
1562 assert(*(numberresult - 1) == '\0');
1563 assert(*(numberresult - 2) != '\0');
1564 assert(numprinted >= 0);
1565 assert(numberresult <= numberresults + numbersize);
1566 break;
1567 case 'u':
1568 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1569 width, precision, 'u');
1570 if (longflag)
1571 numprinted = sprintf(numberresult, fmt,
1572 va_arg(count, unsigned long));
1573#ifdef HAVE_LONG_LONG
1574 else if (longlongflag)
1575 numprinted = sprintf(numberresult, fmt,
1576 va_arg(count, unsigned PY_LONG_LONG));
1577#endif
1578 else if (size_tflag)
1579 numprinted = sprintf(numberresult, fmt,
1580 va_arg(count, size_t));
1581 else
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, unsigned int));
1584 n += numprinted;
1585 numberresult += (numprinted + 1);
1586 assert(*(numberresult - 1) == '\0');
1587 assert(*(numberresult - 2) != '\0');
1588 assert(numprinted >= 0);
1589 assert(numberresult <= numberresults + numbersize);
1590 break;
1591 case 'x':
1592 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1593 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1594 n += numprinted;
1595 numberresult += (numprinted + 1);
1596 assert(*(numberresult - 1) == '\0');
1597 assert(*(numberresult - 2) != '\0');
1598 assert(numprinted >= 0);
1599 assert(numberresult <= numberresults + numbersize);
1600 break;
1601 case 'p':
1602 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1603 /* %p is ill-defined: ensure leading 0x. */
1604 if (numberresult[1] == 'X')
1605 numberresult[1] = 'x';
1606 else if (numberresult[1] != 'x') {
1607 memmove(numberresult + 2, numberresult,
1608 strlen(numberresult) + 1);
1609 numberresult[0] = '0';
1610 numberresult[1] = 'x';
1611 numprinted += 2;
1612 }
1613 n += numprinted;
1614 numberresult += (numprinted + 1);
1615 assert(*(numberresult - 1) == '\0');
1616 assert(*(numberresult - 2) != '\0');
1617 assert(numprinted >= 0);
1618 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001619 break;
1620 case 's':
1621 {
1622 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001623 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001624 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1625 if (!str)
1626 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 /* since PyUnicode_DecodeUTF8 returns already flexible
1628 unicode objects, there is no need to call ready on them */
1629 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001630 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001632 /* Remember the str and switch to the next slot */
1633 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001634 break;
1635 }
1636 case 'U':
1637 {
1638 PyObject *obj = va_arg(count, PyObject *);
1639 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (PyUnicode_READY(obj) == -1)
1641 goto fail;
1642 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001643 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001645 break;
1646 }
1647 case 'V':
1648 {
1649 PyObject *obj = va_arg(count, PyObject *);
1650 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001651 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 assert(obj || str);
1653 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001654 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 if (PyUnicode_READY(obj) == -1)
1656 goto fail;
1657 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001658 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001660 *callresult++ = NULL;
1661 }
1662 else {
1663 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1664 if (!str_obj)
1665 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001667 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001669 *callresult++ = str_obj;
1670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001671 break;
1672 }
1673 case 'S':
1674 {
1675 PyObject *obj = va_arg(count, PyObject *);
1676 PyObject *str;
1677 assert(obj);
1678 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001682 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 /* Remember the str and switch to the next slot */
1685 *callresult++ = str;
1686 break;
1687 }
1688 case 'R':
1689 {
1690 PyObject *obj = va_arg(count, PyObject *);
1691 PyObject *repr;
1692 assert(obj);
1693 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001697 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001699 /* Remember the repr and switch to the next slot */
1700 *callresult++ = repr;
1701 break;
1702 }
1703 case 'A':
1704 {
1705 PyObject *obj = va_arg(count, PyObject *);
1706 PyObject *ascii;
1707 assert(obj);
1708 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001710 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001712 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001714 /* Remember the repr and switch to the next slot */
1715 *callresult++ = ascii;
1716 break;
1717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 default:
1719 /* if we stumble upon an unknown
1720 formatting code, copy the rest of
1721 the format string to the output
1722 string. (we cannot just skip the
1723 code, since there's no way to know
1724 what's in the argument list) */
1725 n += strlen(p);
1726 goto expand;
1727 }
1728 } else
1729 n++;
1730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001731 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001732 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 we don't have to resize the string.
1735 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 if (!string)
1738 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 kind = PyUnicode_KIND(string);
1740 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001741 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001745 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001746 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001747
1748 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1750 /* checking for == because the last argument could be a empty
1751 string, which causes i to point to end, the assert at the end of
1752 the loop */
1753 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001754
Benjamin Peterson14339b62009-01-31 16:36:08 +00001755 switch (*f) {
1756 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001757 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 const int ordinal = va_arg(vargs, int);
1759 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001760 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001761 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001762 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001763 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001765 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 case 'p':
1767 /* unused, since we already have the result */
1768 if (*f == 'p')
1769 (void) va_arg(vargs, void *);
1770 else
1771 (void) va_arg(vargs, int);
1772 /* extract the result from numberresults and append. */
1773 for (; *numberresult; ++i, ++numberresult)
1774 PyUnicode_WRITE(kind, data, i, *numberresult);
1775 /* skip over the separating '\0' */
1776 assert(*numberresult == '\0');
1777 numberresult++;
1778 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 break;
1780 case 's':
1781 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001782 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001784 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 size = PyUnicode_GET_LENGTH(*callresult);
1786 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
1787 PyUnicode_CopyCharacters((PyObject*)string, i,
1788 *callresult, 0,
1789 size);
1790 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001791 /* We're done with the unicode()/repr() => forget it */
1792 Py_DECREF(*callresult);
1793 /* switch to next unicode()/repr() result */
1794 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 break;
1796 }
1797 case 'U':
1798 {
1799 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_ssize_t size;
1801 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1802 size = PyUnicode_GET_LENGTH(obj);
1803 PyUnicode_CopyCharacters((PyObject*)string, i,
1804 obj, 0,
1805 size);
1806 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001807 break;
1808 }
1809 case 'V':
1810 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001812 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001813 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 size = PyUnicode_GET_LENGTH(obj);
1816 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1817 PyUnicode_CopyCharacters((PyObject*)string, i,
1818 obj, 0,
1819 size);
1820 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001821 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 size = PyUnicode_GET_LENGTH(*callresult);
1823 assert(PyUnicode_KIND(*callresult) <=
1824 PyUnicode_KIND(string));
1825 PyUnicode_CopyCharacters((PyObject*)string, i,
1826 *callresult,
1827 0, size);
1828 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001829 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001830 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001831 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 break;
1833 }
1834 case 'S':
1835 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001836 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001837 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001838 /* unused, since we already have the result */
1839 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
1841 PyUnicode_CopyCharacters((PyObject*)string, i,
1842 *callresult, 0,
1843 PyUnicode_GET_LENGTH(*callresult));
1844 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 /* We're done with the unicode()/repr() => forget it */
1846 Py_DECREF(*callresult);
1847 /* switch to next unicode()/repr() result */
1848 ++callresult;
1849 break;
1850 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001851 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 break;
1854 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 for (; *p; ++p, ++i)
1856 PyUnicode_WRITE(kind, data, i, *p);
1857 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 goto end;
1859 }
Victor Stinner1205f272010-09-11 00:54:47 +00001860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 else {
1862 assert(i < PyUnicode_GET_LENGTH(string));
1863 PyUnicode_WRITE(kind, data, i++, *f);
1864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001867
Benjamin Peterson29060642009-01-31 22:14:21 +00001868 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001869 if (callresults)
1870 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 if (numberresults)
1872 PyObject_Free(numberresults);
1873 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001874 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001875 if (callresults) {
1876 PyObject **callresult2 = callresults;
1877 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001878 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001879 ++callresult2;
1880 }
1881 PyObject_Free(callresults);
1882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 if (numberresults)
1884 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001885 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001886}
1887
Walter Dörwaldd2034312007-05-18 16:29:38 +00001888PyObject *
1889PyUnicode_FromFormat(const char *format, ...)
1890{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001891 PyObject* ret;
1892 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001893
1894#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001896#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001897 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001898#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 ret = PyUnicode_FromFormatV(format, vargs);
1900 va_end(vargs);
1901 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001902}
1903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904#ifdef HAVE_WCHAR_H
1905
Victor Stinner5593d8a2010-10-02 11:11:27 +00001906/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1907 convert a Unicode object to a wide character string.
1908
Victor Stinnerd88d9832011-09-06 02:00:05 +02001909 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001910 character) required to convert the unicode object. Ignore size argument.
1911
Victor Stinnerd88d9832011-09-06 02:00:05 +02001912 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001913 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001914 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001915static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001916unicode_aswidechar(PyUnicodeObject *unicode,
1917 wchar_t *w,
1918 Py_ssize_t size)
1919{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001920 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 const wchar_t *wstr;
1922
1923 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1924 if (wstr == NULL)
1925 return -1;
1926
Victor Stinner5593d8a2010-10-02 11:11:27 +00001927 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001928 if (size > res)
1929 size = res + 1;
1930 else
1931 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001933 return res;
1934 }
1935 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001937}
1938
1939Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001940PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001941 wchar_t *w,
1942 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943{
1944 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 PyErr_BadInternalCall();
1946 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001948 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949}
1950
Victor Stinner137c34c2010-09-29 10:25:54 +00001951wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001952PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001953 Py_ssize_t *size)
1954{
1955 wchar_t* buffer;
1956 Py_ssize_t buflen;
1957
1958 if (unicode == NULL) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001963 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 if (buflen == -1)
1965 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001966 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001967 PyErr_NoMemory();
1968 return NULL;
1969 }
1970
Victor Stinner137c34c2010-09-29 10:25:54 +00001971 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1972 if (buffer == NULL) {
1973 PyErr_NoMemory();
1974 return NULL;
1975 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001976 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (buflen == -1)
1978 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001979 if (size != NULL)
1980 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001981 return buffer;
1982}
1983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985
Alexander Belopolsky40018472011-02-26 01:02:56 +00001986PyObject *
1987PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001990 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 PyErr_SetString(PyExc_ValueError,
1992 "chr() arg not in range(0x110000)");
1993 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001994 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 if (ordinal < 256)
1997 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 v = PyUnicode_New(1, ordinal);
2000 if (v == NULL)
2001 return NULL;
2002 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2003 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002004}
2005
Alexander Belopolsky40018472011-02-26 01:02:56 +00002006PyObject *
2007PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002009 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002011 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002012 Py_INCREF(obj);
2013 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002014 }
2015 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002016 /* For a Unicode subtype that's not a Unicode object,
2017 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 if (PyUnicode_READY(obj) == -1)
2019 return NULL;
2020 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002021 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002022 PyErr_Format(PyExc_TypeError,
2023 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002024 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002025 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002026}
2027
Alexander Belopolsky40018472011-02-26 01:02:56 +00002028PyObject *
2029PyUnicode_FromEncodedObject(register PyObject *obj,
2030 const char *encoding,
2031 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002032{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002033 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002034 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002035
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 PyErr_BadInternalCall();
2038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002040
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002041 /* Decoding bytes objects is the most common case and should be fast */
2042 if (PyBytes_Check(obj)) {
2043 if (PyBytes_GET_SIZE(obj) == 0) {
2044 Py_INCREF(unicode_empty);
2045 v = (PyObject *) unicode_empty;
2046 }
2047 else {
2048 v = PyUnicode_Decode(
2049 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2050 encoding, errors);
2051 }
2052 return v;
2053 }
2054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002055 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 PyErr_SetString(PyExc_TypeError,
2057 "decoding str is not supported");
2058 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002059 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002060
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002061 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2062 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2063 PyErr_Format(PyExc_TypeError,
2064 "coercing to str: need bytes, bytearray "
2065 "or buffer-like object, %.80s found",
2066 Py_TYPE(obj)->tp_name);
2067 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002068 }
Tim Petersced69f82003-09-16 20:30:58 +00002069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002070 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002072 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 }
Tim Petersced69f82003-09-16 20:30:58 +00002074 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002075 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002077 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002078 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079}
2080
/* Write a normalized copy of `encoding` into `lower`: upper case letters are
   converted to lower case and every '_' becomes '-', so that e.g. UTF_8 is
   recognized.  `lower_len` is the capacity of `lower` in bytes.

   Returns 1 on success, with `lower` null terminated.  Returns 0 when
   `encoding` is longer than lower_len-1 characters; in that case `lower`
   is left without a terminating null character. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const last = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2113
Alexander Belopolsky40018472011-02-26 01:02:56 +00002114PyObject *
2115PyUnicode_Decode(const char *s,
2116 Py_ssize_t size,
2117 const char *encoding,
2118 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002119{
2120 PyObject *buffer = NULL, *unicode;
2121 Py_buffer info;
2122 char lower[11]; /* Enough for any encoding shortcut */
2123
2124 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002125 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002126
2127 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002128 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002129 if ((strcmp(lower, "utf-8") == 0) ||
2130 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002131 return PyUnicode_DecodeUTF8(s, size, errors);
2132 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002133 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002134 (strcmp(lower, "iso-8859-1") == 0))
2135 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002136#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002137 else if (strcmp(lower, "mbcs") == 0)
2138 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002139#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002140 else if (strcmp(lower, "ascii") == 0)
2141 return PyUnicode_DecodeASCII(s, size, errors);
2142 else if (strcmp(lower, "utf-16") == 0)
2143 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2144 else if (strcmp(lower, "utf-32") == 0)
2145 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
2148 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002149 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002150 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002151 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002152 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153 if (buffer == NULL)
2154 goto onError;
2155 unicode = PyCodec_Decode(buffer, encoding, errors);
2156 if (unicode == NULL)
2157 goto onError;
2158 if (!PyUnicode_Check(unicode)) {
2159 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002160 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002161 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 Py_DECREF(unicode);
2163 goto onError;
2164 }
2165 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (PyUnicode_READY(unicode)) {
2167 Py_DECREF(unicode);
2168 return NULL;
2169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002171
Benjamin Peterson29060642009-01-31 22:14:21 +00002172 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 Py_XDECREF(buffer);
2174 return NULL;
2175}
2176
Alexander Belopolsky40018472011-02-26 01:02:56 +00002177PyObject *
2178PyUnicode_AsDecodedObject(PyObject *unicode,
2179 const char *encoding,
2180 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002181{
2182 PyObject *v;
2183
2184 if (!PyUnicode_Check(unicode)) {
2185 PyErr_BadArgument();
2186 goto onError;
2187 }
2188
2189 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002190 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002191
2192 /* Decode via the codec registry */
2193 v = PyCodec_Decode(unicode, encoding, errors);
2194 if (v == NULL)
2195 goto onError;
2196 return v;
2197
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002199 return NULL;
2200}
2201
Alexander Belopolsky40018472011-02-26 01:02:56 +00002202PyObject *
2203PyUnicode_AsDecodedUnicode(PyObject *unicode,
2204 const char *encoding,
2205 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002206{
2207 PyObject *v;
2208
2209 if (!PyUnicode_Check(unicode)) {
2210 PyErr_BadArgument();
2211 goto onError;
2212 }
2213
2214 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002215 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002216
2217 /* Decode via the codec registry */
2218 v = PyCodec_Decode(unicode, encoding, errors);
2219 if (v == NULL)
2220 goto onError;
2221 if (!PyUnicode_Check(v)) {
2222 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002223 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002224 Py_TYPE(v)->tp_name);
2225 Py_DECREF(v);
2226 goto onError;
2227 }
2228 return v;
2229
Benjamin Peterson29060642009-01-31 22:14:21 +00002230 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002231 return NULL;
2232}
2233
Alexander Belopolsky40018472011-02-26 01:02:56 +00002234PyObject *
2235PyUnicode_Encode(const Py_UNICODE *s,
2236 Py_ssize_t size,
2237 const char *encoding,
2238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239{
2240 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002241
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 unicode = PyUnicode_FromUnicode(s, size);
2243 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2246 Py_DECREF(unicode);
2247 return v;
2248}
2249
Alexander Belopolsky40018472011-02-26 01:02:56 +00002250PyObject *
2251PyUnicode_AsEncodedObject(PyObject *unicode,
2252 const char *encoding,
2253 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002254{
2255 PyObject *v;
2256
2257 if (!PyUnicode_Check(unicode)) {
2258 PyErr_BadArgument();
2259 goto onError;
2260 }
2261
2262 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002263 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002264
2265 /* Encode via the codec registry */
2266 v = PyCodec_Encode(unicode, encoding, errors);
2267 if (v == NULL)
2268 goto onError;
2269 return v;
2270
Benjamin Peterson29060642009-01-31 22:14:21 +00002271 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002272 return NULL;
2273}
2274
Victor Stinnerad158722010-10-27 00:25:46 +00002275PyObject *
2276PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002277{
Victor Stinner99b95382011-07-04 14:23:54 +02002278#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002279 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2280 PyUnicode_GET_SIZE(unicode),
2281 NULL);
2282#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002284#else
Victor Stinner793b5312011-04-27 00:24:21 +02002285 PyInterpreterState *interp = PyThreadState_GET()->interp;
2286 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2287 cannot use it to encode and decode filenames before it is loaded. Load
2288 the Python codec requires to encode at least its own filename. Use the C
2289 version of the locale codec until the codec registry is initialized and
2290 the Python codec is loaded.
2291
2292 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2293 cannot only rely on it: check also interp->fscodec_initialized for
2294 subinterpreters. */
2295 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002296 return PyUnicode_AsEncodedString(unicode,
2297 Py_FileSystemDefaultEncoding,
2298 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002299 }
2300 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002301 /* locale encoding with surrogateescape */
2302 wchar_t *wchar;
2303 char *bytes;
2304 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002305 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002306
2307 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2308 if (wchar == NULL)
2309 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002310 bytes = _Py_wchar2char(wchar, &error_pos);
2311 if (bytes == NULL) {
2312 if (error_pos != (size_t)-1) {
2313 char *errmsg = strerror(errno);
2314 PyObject *exc = NULL;
2315 if (errmsg == NULL)
2316 errmsg = "Py_wchar2char() failed";
2317 raise_encode_exception(&exc,
2318 "filesystemencoding",
2319 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2320 error_pos, error_pos+1,
2321 errmsg);
2322 Py_XDECREF(exc);
2323 }
2324 else
2325 PyErr_NoMemory();
2326 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002327 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002328 }
2329 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002330
2331 bytes_obj = PyBytes_FromString(bytes);
2332 PyMem_Free(bytes);
2333 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002334 }
Victor Stinnerad158722010-10-27 00:25:46 +00002335#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002336}
2337
Alexander Belopolsky40018472011-02-26 01:02:56 +00002338PyObject *
2339PyUnicode_AsEncodedString(PyObject *unicode,
2340 const char *encoding,
2341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342{
2343 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002344 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002345
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346 if (!PyUnicode_Check(unicode)) {
2347 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349 }
Fred Drakee4315f52000-05-09 19:53:39 +00002350
Victor Stinner2f283c22011-03-02 01:21:46 +00002351 if (encoding == NULL) {
2352 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002353 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002354 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002356 }
Fred Drakee4315f52000-05-09 19:53:39 +00002357
2358 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002359 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002360 if ((strcmp(lower, "utf-8") == 0) ||
2361 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002362 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002363 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002365 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002367 }
Victor Stinner37296e82010-06-10 13:36:23 +00002368 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002369 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002370 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002372#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002373 else if (strcmp(lower, "mbcs") == 0)
2374 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2375 PyUnicode_GET_SIZE(unicode),
2376 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002377#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002378 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381
2382 /* Encode via the codec registry */
2383 v = PyCodec_Encode(unicode, encoding, errors);
2384 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002385 return NULL;
2386
2387 /* The normal path */
2388 if (PyBytes_Check(v))
2389 return v;
2390
2391 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002392 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002393 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002394 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002395
2396 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2397 "encoder %s returned bytearray instead of bytes",
2398 encoding);
2399 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002400 Py_DECREF(v);
2401 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002402 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002403
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002404 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2405 Py_DECREF(v);
2406 return b;
2407 }
2408
2409 PyErr_Format(PyExc_TypeError,
2410 "encoder did not return a bytes object (type=%.400s)",
2411 Py_TYPE(v)->tp_name);
2412 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002413 return NULL;
2414}
2415
Alexander Belopolsky40018472011-02-26 01:02:56 +00002416PyObject *
2417PyUnicode_AsEncodedUnicode(PyObject *unicode,
2418 const char *encoding,
2419 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002420{
2421 PyObject *v;
2422
2423 if (!PyUnicode_Check(unicode)) {
2424 PyErr_BadArgument();
2425 goto onError;
2426 }
2427
2428 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002430
2431 /* Encode via the codec registry */
2432 v = PyCodec_Encode(unicode, encoding, errors);
2433 if (v == NULL)
2434 goto onError;
2435 if (!PyUnicode_Check(v)) {
2436 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002437 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002438 Py_TYPE(v)->tp_name);
2439 Py_DECREF(v);
2440 goto onError;
2441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002443
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 return NULL;
2446}
2447
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002448PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002449PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002450 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002451 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2452}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002453
Christian Heimes5894ba72007-11-04 11:43:14 +00002454PyObject*
2455PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2456{
Victor Stinner99b95382011-07-04 14:23:54 +02002457#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002458 return PyUnicode_DecodeMBCS(s, size, NULL);
2459#elif defined(__APPLE__)
2460 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2461#else
Victor Stinner793b5312011-04-27 00:24:21 +02002462 PyInterpreterState *interp = PyThreadState_GET()->interp;
2463 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2464 cannot use it to encode and decode filenames before it is loaded. Load
2465 the Python codec requires to encode at least its own filename. Use the C
2466 version of the locale codec until the codec registry is initialized and
2467 the Python codec is loaded.
2468
2469 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2470 cannot only rely on it: check also interp->fscodec_initialized for
2471 subinterpreters. */
2472 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002473 return PyUnicode_Decode(s, size,
2474 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002475 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002476 }
2477 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002478 /* locale encoding with surrogateescape */
2479 wchar_t *wchar;
2480 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002481 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002482
2483 if (s[size] != '\0' || size != strlen(s)) {
2484 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2485 return NULL;
2486 }
2487
Victor Stinner168e1172010-10-16 23:16:16 +00002488 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002489 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002490 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002491
Victor Stinner168e1172010-10-16 23:16:16 +00002492 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002493 PyMem_Free(wchar);
2494 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002495 }
Victor Stinnerad158722010-10-27 00:25:46 +00002496#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002497}
2498
Martin v. Löwis011e8422009-05-05 04:43:17 +00002499
2500int
2501PyUnicode_FSConverter(PyObject* arg, void* addr)
2502{
2503 PyObject *output = NULL;
2504 Py_ssize_t size;
2505 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002506 if (arg == NULL) {
2507 Py_DECREF(*(PyObject**)addr);
2508 return 1;
2509 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002510 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002511 output = arg;
2512 Py_INCREF(output);
2513 }
2514 else {
2515 arg = PyUnicode_FromObject(arg);
2516 if (!arg)
2517 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002518 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002519 Py_DECREF(arg);
2520 if (!output)
2521 return 0;
2522 if (!PyBytes_Check(output)) {
2523 Py_DECREF(output);
2524 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2525 return 0;
2526 }
2527 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002528 size = PyBytes_GET_SIZE(output);
2529 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002530 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002531 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002532 Py_DECREF(output);
2533 return 0;
2534 }
2535 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002536 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002537}
2538
2539
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002540int
2541PyUnicode_FSDecoder(PyObject* arg, void* addr)
2542{
2543 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002544 if (arg == NULL) {
2545 Py_DECREF(*(PyObject**)addr);
2546 return 1;
2547 }
2548 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 if (PyUnicode_READY(arg))
2550 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002551 output = arg;
2552 Py_INCREF(output);
2553 }
2554 else {
2555 arg = PyBytes_FromObject(arg);
2556 if (!arg)
2557 return 0;
2558 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2559 PyBytes_GET_SIZE(arg));
2560 Py_DECREF(arg);
2561 if (!output)
2562 return 0;
2563 if (!PyUnicode_Check(output)) {
2564 Py_DECREF(output);
2565 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2566 return 0;
2567 }
2568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2570 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002571 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2572 Py_DECREF(output);
2573 return 0;
2574 }
2575 *(PyObject**)addr = output;
2576 return Py_CLEANUP_SUPPORTED;
2577}
2578
2579
Martin v. Löwis5b222132007-06-10 09:51:05 +00002580char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002582{
Christian Heimesf3863112007-11-22 07:46:41 +00002583 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2585
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002586 if (!PyUnicode_Check(unicode)) {
2587 PyErr_BadArgument();
2588 return NULL;
2589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592
2593 if (_PyUnicode_UTF8(unicode) == NULL) {
2594 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2595 if (bytes == NULL)
2596 return NULL;
2597 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2598 if (u->_base.utf8 == NULL) {
2599 Py_DECREF(bytes);
2600 return NULL;
2601 }
2602 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2603 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2604 Py_DECREF(bytes);
2605 }
2606
2607 if (psize)
2608 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2609 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002610}
2611
2612char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2616}
2617
2618#ifdef Py_DEBUG
2619int unicode_as_unicode_calls = 0;
2620#endif
2621
2622
2623Py_UNICODE *
2624PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2625{
2626 PyUnicodeObject *u;
2627 const unsigned char *one_byte;
2628#if SIZEOF_WCHAR_T == 4
2629 const Py_UCS2 *two_bytes;
2630#else
2631 const Py_UCS4 *four_bytes;
2632 const Py_UCS4 *ucs4_end;
2633 Py_ssize_t num_surrogates;
2634#endif
2635 wchar_t *w;
2636 wchar_t *wchar_end;
2637
2638 if (!PyUnicode_Check(unicode)) {
2639 PyErr_BadArgument();
2640 return NULL;
2641 }
2642 u = (PyUnicodeObject*)unicode;
2643 if (_PyUnicode_WSTR(u) == NULL) {
2644 /* Non-ASCII compact unicode object */
2645 assert(_PyUnicode_KIND(u) != 0);
2646 assert(PyUnicode_IS_READY(u));
2647
2648#ifdef Py_DEBUG
2649 ++unicode_as_unicode_calls;
2650#endif
2651
2652 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2653#if SIZEOF_WCHAR_T == 2
2654 four_bytes = PyUnicode_4BYTE_DATA(u);
2655 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2656 num_surrogates = 0;
2657
2658 for (; four_bytes < ucs4_end; ++four_bytes) {
2659 if (*four_bytes > 0xFFFF)
2660 ++num_surrogates;
2661 }
2662
2663 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2664 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2665 if (!_PyUnicode_WSTR(u)) {
2666 PyErr_NoMemory();
2667 return NULL;
2668 }
2669 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2670
2671 w = _PyUnicode_WSTR(u);
2672 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2673 four_bytes = PyUnicode_4BYTE_DATA(u);
2674 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2675 if (*four_bytes > 0xFFFF) {
2676 /* encode surrogate pair in this case */
2677 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2678 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2679 }
2680 else
2681 *w = *four_bytes;
2682
2683 if (w > wchar_end) {
2684 assert(0 && "Miscalculated string end");
2685 }
2686 }
2687 *w = 0;
2688#else
2689 /* sizeof(wchar_t) == 4 */
2690 Py_FatalError("Impossible unicode object state, wstr and str "
2691 "should share memory already.");
2692 return NULL;
2693#endif
2694 }
2695 else {
2696 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2697 (_PyUnicode_LENGTH(u) + 1));
2698 if (!_PyUnicode_WSTR(u)) {
2699 PyErr_NoMemory();
2700 return NULL;
2701 }
2702 if (!PyUnicode_IS_COMPACT_ASCII(u))
2703 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2704 w = _PyUnicode_WSTR(u);
2705 wchar_end = w + _PyUnicode_LENGTH(u);
2706
2707 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2708 one_byte = PyUnicode_1BYTE_DATA(u);
2709 for (; w < wchar_end; ++one_byte, ++w)
2710 *w = *one_byte;
2711 /* null-terminate the wstr */
2712 *w = 0;
2713 }
2714 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2715#if SIZEOF_WCHAR_T == 4
2716 two_bytes = PyUnicode_2BYTE_DATA(u);
2717 for (; w < wchar_end; ++two_bytes, ++w)
2718 *w = *two_bytes;
2719 /* null-terminate the wstr */
2720 *w = 0;
2721#else
2722 /* sizeof(wchar_t) == 2 */
2723 PyObject_FREE(_PyUnicode_WSTR(u));
2724 _PyUnicode_WSTR(u) = NULL;
2725 Py_FatalError("Impossible unicode object state, wstr "
2726 "and str should share memory already.");
2727 return NULL;
2728#endif
2729 }
2730 else {
2731 assert(0 && "This should never happen.");
2732 }
2733 }
2734 }
2735 if (size != NULL)
2736 *size = PyUnicode_WSTR_LENGTH(u);
2737 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002738}
2739
Alexander Belopolsky40018472011-02-26 01:02:56 +00002740Py_UNICODE *
2741PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744}
2745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746
Alexander Belopolsky40018472011-02-26 01:02:56 +00002747Py_ssize_t
2748PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749{
2750 if (!PyUnicode_Check(unicode)) {
2751 PyErr_BadArgument();
2752 goto onError;
2753 }
2754 return PyUnicode_GET_SIZE(unicode);
2755
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 return -1;
2758}
2759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760Py_ssize_t
2761PyUnicode_GetLength(PyObject *unicode)
2762{
2763 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2764 PyErr_BadArgument();
2765 return -1;
2766 }
2767
2768 return PyUnicode_GET_LENGTH(unicode);
2769}
2770
2771Py_UCS4
2772PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2773{
2774 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2775 return PyErr_BadArgument();
2776 return (Py_UCS4)-1;
2777 }
2778 return PyUnicode_READ_CHAR(unicode, index);
2779}
2780
2781int
2782PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2783{
2784 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2785 return PyErr_BadArgument();
2786 return -1;
2787 }
2788
2789 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2790 index, ch);
2791 return 0;
2792}
2793
/* Name of the default encoding; always "utf-8".  The returned string has
   static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2799
Victor Stinner554f3f02010-06-16 23:33:54 +00002800/* create or adjust a UnicodeDecodeError */
2801static void
2802make_decode_exception(PyObject **exceptionObject,
2803 const char *encoding,
2804 const char *input, Py_ssize_t length,
2805 Py_ssize_t startpos, Py_ssize_t endpos,
2806 const char *reason)
2807{
2808 if (*exceptionObject == NULL) {
2809 *exceptionObject = PyUnicodeDecodeError_Create(
2810 encoding, input, length, startpos, endpos, reason);
2811 }
2812 else {
2813 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2814 goto onError;
2815 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2816 goto onError;
2817 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2818 goto onError;
2819 }
2820 return;
2821
2822onError:
2823 Py_DECREF(*exceptionObject);
2824 *exceptionObject = NULL;
2825}
2826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827/* error handling callback helper:
2828 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002829 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002830 and adjust various state variables.
2831 return 0 on success, -1 on error
2832*/
2833
Alexander Belopolsky40018472011-02-26 01:02:56 +00002834static int
2835unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2836 const char *encoding, const char *reason,
2837 const char **input, const char **inend, Py_ssize_t *startinpos,
2838 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2839 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002840{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002841 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842
2843 PyObject *restuple = NULL;
2844 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002845 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002846 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002847 Py_ssize_t requiredsize;
2848 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002850 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002851 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 int res = -1;
2853
2854 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 *errorHandler = PyCodec_LookupError(errors);
2856 if (*errorHandler == NULL)
2857 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858 }
2859
Victor Stinner554f3f02010-06-16 23:33:54 +00002860 make_decode_exception(exceptionObject,
2861 encoding,
2862 *input, *inend - *input,
2863 *startinpos, *endinpos,
2864 reason);
2865 if (*exceptionObject == NULL)
2866 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867
2868 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2869 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002870 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002872 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002874 }
2875 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002877
2878 /* Copy back the bytes variables, which might have been modified by the
2879 callback */
2880 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2881 if (!inputobj)
2882 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002883 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002885 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002886 *input = PyBytes_AS_STRING(inputobj);
2887 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002888 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002889 /* we can DECREF safely, as the exception has another reference,
2890 so the object won't go away. */
2891 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002894 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002895 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2897 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002898 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899
2900 /* need more space? (at least enough for what we
2901 have+the replacement+the rest of the string (starting
2902 at the new input position), so we won't have to check space
2903 when there are no errors in the rest of the string) */
2904 repptr = PyUnicode_AS_UNICODE(repunicode);
2905 repsize = PyUnicode_GET_SIZE(repunicode);
2906 requiredsize = *outpos + repsize + insize-newpos;
2907 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 if (requiredsize<2*outsize)
2909 requiredsize = 2*outsize;
2910 if (_PyUnicode_Resize(output, requiredsize) < 0)
2911 goto onError;
2912 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 }
2914 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002915 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 Py_UNICODE_COPY(*outptr, repptr, repsize);
2917 *outptr += repsize;
2918 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920 /* we made it! */
2921 res = 0;
2922
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002924 Py_XDECREF(restuple);
2925 return res;
2926}
2927
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002928/* --- UTF-7 Codec -------------------------------------------------------- */
2929
Antoine Pitrou244651a2009-05-04 18:56:13 +00002930/* See RFC2152 for details. We encode conservatively and decode liberally. */
2931
2932/* Three simple macros defining base-64. */
2933
/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('a' <= (c) && (c) <= 'z') ||              \
     ('A' <= (c) && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (any character outside the listed ranges maps to 63, like '/') */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2962
2963/* The UTF-7 encoder treats ASCII characters differently according to
2964 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
2965 * the above). See RFC2152. This array identifies these different
2966 * sets:
2967 * 0 : "Set D"
2968 * alphanumeric and '(),-./:?
2969 * 1 : "Set O"
2970 * !"#$%&*;<=>@[]^_`{|}
2971 * 2 : "whitespace"
2972 * ht nl cr sp
2973 * 3 : special (must be base64 encoded)
2974 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
2975 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002976
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   ASCII code point, indexed by the code point.  Read-only lookup table. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times); the
 * range test comes first so the table is never indexed out of bounds.
 * directO and directWS are parenthesized so that arbitrary expressions
 * (e.g. "a || b") can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_DecodeUTF7(const char *s,
3011 Py_ssize_t size,
3012 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003013{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003014 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3015}
3016
Antoine Pitrou244651a2009-05-04 18:56:13 +00003017/* The decoder. The only state we preserve is our read position,
3018 * i.e. how many characters we have consumed. So if we end in the
3019 * middle of a shift sequence we have to back off the read position
3020 * and the output to the beginning of the sequence, otherwise we lose
3021 * all the shift state (seen bits, number of bits seen, high
3022 * surrogate). */
3023
Alexander Belopolsky40018472011-02-26 01:02:56 +00003024PyObject *
3025PyUnicode_DecodeUTF7Stateful(const char *s,
3026 Py_ssize_t size,
3027 const char *errors,
3028 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003029{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003031 Py_ssize_t startinpos;
3032 Py_ssize_t endinpos;
3033 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003034 const char *e;
3035 PyUnicodeObject *unicode;
3036 Py_UNICODE *p;
3037 const char *errmsg = "";
3038 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003039 Py_UNICODE *shiftOutStart;
3040 unsigned int base64bits = 0;
3041 unsigned long base64buffer = 0;
3042 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 PyObject *errorHandler = NULL;
3044 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003045
3046 unicode = _PyUnicode_New(size);
3047 if (!unicode)
3048 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003049 if (size == 0) {
3050 if (consumed)
3051 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003052 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003053 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003055 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003056 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003057 e = s + size;
3058
3059 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003062 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003063
Antoine Pitrou244651a2009-05-04 18:56:13 +00003064 if (inShift) { /* in a base-64 section */
3065 if (IS_BASE64(ch)) { /* consume a base-64 character */
3066 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3067 base64bits += 6;
3068 s++;
3069 if (base64bits >= 16) {
3070 /* we have enough bits for a UTF-16 value */
3071 Py_UNICODE outCh = (Py_UNICODE)
3072 (base64buffer >> (base64bits-16));
3073 base64bits -= 16;
3074 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3075 if (surrogate) {
3076 /* expecting a second surrogate */
3077 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3078#ifdef Py_UNICODE_WIDE
3079 *p++ = (((surrogate & 0x3FF)<<10)
3080 | (outCh & 0x3FF)) + 0x10000;
3081#else
3082 *p++ = surrogate;
3083 *p++ = outCh;
3084#endif
3085 surrogate = 0;
3086 }
3087 else {
3088 surrogate = 0;
3089 errmsg = "second surrogate missing";
3090 goto utf7Error;
3091 }
3092 }
3093 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3094 /* first surrogate */
3095 surrogate = outCh;
3096 }
3097 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3098 errmsg = "unexpected second surrogate";
3099 goto utf7Error;
3100 }
3101 else {
3102 *p++ = outCh;
3103 }
3104 }
3105 }
3106 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003107 inShift = 0;
3108 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003109 if (surrogate) {
3110 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003111 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003112 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003113 if (base64bits > 0) { /* left-over bits */
3114 if (base64bits >= 6) {
3115 /* We've seen at least one base-64 character */
3116 errmsg = "partial character in shift sequence";
3117 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003118 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003119 else {
3120 /* Some bits remain; they should be zero */
3121 if (base64buffer != 0) {
3122 errmsg = "non-zero padding bits in shift sequence";
3123 goto utf7Error;
3124 }
3125 }
3126 }
3127 if (ch != '-') {
3128 /* '-' is absorbed; other terminating
3129 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003130 *p++ = ch;
3131 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003132 }
3133 }
3134 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003135 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003136 s++; /* consume '+' */
3137 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003138 s++;
3139 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003140 }
3141 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003142 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003143 shiftOutStart = p;
3144 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003145 }
3146 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003147 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003148 *p++ = ch;
3149 s++;
3150 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003151 else {
3152 startinpos = s-starts;
3153 s++;
3154 errmsg = "unexpected special character";
3155 goto utf7Error;
3156 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003157 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003158utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 outpos = p-PyUnicode_AS_UNICODE(unicode);
3160 endinpos = s-starts;
3161 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 errors, &errorHandler,
3163 "utf7", errmsg,
3164 &starts, &e, &startinpos, &endinpos, &exc, &s,
3165 &unicode, &outpos, &p))
3166 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003167 }
3168
Antoine Pitrou244651a2009-05-04 18:56:13 +00003169 /* end of string */
3170
3171 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3172 /* if we're in an inconsistent state, that's an error */
3173 if (surrogate ||
3174 (base64bits >= 6) ||
3175 (base64bits > 0 && base64buffer != 0)) {
3176 outpos = p-PyUnicode_AS_UNICODE(unicode);
3177 endinpos = size;
3178 if (unicode_decode_call_errorhandler(
3179 errors, &errorHandler,
3180 "utf7", "unterminated shift sequence",
3181 &starts, &e, &startinpos, &endinpos, &exc, &s,
3182 &unicode, &outpos, &p))
3183 goto onError;
3184 if (s < e)
3185 goto restart;
3186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003187 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003188
3189 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003190 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003191 if (inShift) {
3192 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003193 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003194 }
3195 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003196 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003197 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003198 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003199
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003200 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003201 goto onError;
3202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203 Py_XDECREF(errorHandler);
3204 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205 if (PyUnicode_READY(unicode) == -1) {
3206 Py_DECREF(unicode);
3207 return NULL;
3208 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003209 return (PyObject *)unicode;
3210
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 Py_XDECREF(errorHandler);
3213 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003214 Py_DECREF(unicode);
3215 return NULL;
3216}
3217
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
3220PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3221 Py_ssize_t size,
3222 int base64SetO,
3223 int base64WhiteSpace,
3224 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003225{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003226 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003227 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003228 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003229 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003230 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003231 unsigned int base64bits = 0;
3232 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003233 char * out;
3234 char * start;
3235
3236 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003239 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003240 return PyErr_NoMemory();
3241
Antoine Pitrou244651a2009-05-04 18:56:13 +00003242 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243 if (v == NULL)
3244 return NULL;
3245
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003246 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247 for (;i < size; ++i) {
3248 Py_UNICODE ch = s[i];
3249
Antoine Pitrou244651a2009-05-04 18:56:13 +00003250 if (inShift) {
3251 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3252 /* shifting out */
3253 if (base64bits) { /* output remaining bits */
3254 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3255 base64buffer = 0;
3256 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003257 }
3258 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003259 /* Characters not in the BASE64 set implicitly unshift the sequence
3260 so no '-' is required, except if the character is itself a '-' */
3261 if (IS_BASE64(ch) || ch == '-') {
3262 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003263 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003264 *out++ = (char) ch;
3265 }
3266 else {
3267 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003268 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003269 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003270 else { /* not in a shift sequence */
3271 if (ch == '+') {
3272 *out++ = '+';
3273 *out++ = '-';
3274 }
3275 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3276 *out++ = (char) ch;
3277 }
3278 else {
3279 *out++ = '+';
3280 inShift = 1;
3281 goto encode_char;
3282 }
3283 }
3284 continue;
3285encode_char:
3286#ifdef Py_UNICODE_WIDE
3287 if (ch >= 0x10000) {
3288 /* code first surrogate */
3289 base64bits += 16;
3290 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3291 while (base64bits >= 6) {
3292 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3293 base64bits -= 6;
3294 }
3295 /* prepare second surrogate */
3296 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3297 }
3298#endif
3299 base64bits += 16;
3300 base64buffer = (base64buffer << 16) | ch;
3301 while (base64bits >= 6) {
3302 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3303 base64bits -= 6;
3304 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003305 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003306 if (base64bits)
3307 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3308 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003309 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003310 if (_PyBytes_Resize(&v, out - start) < 0)
3311 return NULL;
3312 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003313}
3314
Antoine Pitrou244651a2009-05-04 18:56:13 +00003315#undef IS_BASE64
3316#undef FROM_BASE64
3317#undef TO_BASE64
3318#undef DECODE_DIRECT
3319#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321/* --- UTF-8 Codec -------------------------------------------------------- */
3322
/* Total length in bytes of the UTF-8 sequence introduced by each possible
   first byte; 0 marks a byte that can never start a sequence.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: ASCII, single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, illegal as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
3344
Alexander Belopolsky40018472011-02-26 01:02:56 +00003345PyObject *
3346PyUnicode_DecodeUTF8(const char *s,
3347 Py_ssize_t size,
3348 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349{
Walter Dörwald69652032004-09-07 20:24:22 +00003350 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3351}
3352
Antoine Pitrouab868312009-01-10 15:40:25 +00003353/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3354#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3355
3356/* Mask to quickly check whether a C 'long' contains a
3357 non-ASCII, UTF8-encoded char. */
3358#if (SIZEOF_LONG == 8)
3359# define ASCII_CHAR_MASK 0x8080808080808080L
3360#elif (SIZEOF_LONG == 4)
3361# define ASCII_CHAR_MASK 0x80808080L
3362#else
3363# error C 'long' size should be either 4 or 8!
3364#endif
3365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366/* Scans a UTF-8 string and returns the maximum character to be expected,
3367 the size of the decoded unicode string and if any major errors were
3368 encountered.
3369
3370 This function does check basic UTF-8 sanity, it does however NOT CHECK
3371 if the string contains surrogates, and if all continuation bytes are
3372 within the correct ranges, these checks are performed in
3373 PyUnicode_DecodeUTF8Stateful.
3374
3375 If it sets has_errors to 1, it means the value of unicode_size and max_char
3376 will be bogus and you should not rely on useful information in them.
3377 */
3378static Py_UCS4
3379utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3380 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3381 int *has_errors)
3382{
3383 Py_ssize_t n;
3384 Py_ssize_t char_count = 0;
3385 Py_UCS4 max_char = 127, new_max;
3386 Py_UCS4 upper_bound;
3387 const unsigned char *p = (const unsigned char *)s;
3388 const unsigned char *end = p + string_size;
3389 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3390 int err = 0;
3391
3392 for (; p < end && !err; ++p, ++char_count) {
3393 /* Only check value if it's not a ASCII char... */
3394 if (*p < 0x80) {
3395 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3396 an explanation. */
3397 if (!((size_t) p & LONG_PTR_MASK)) {
3398 /* Help register allocation */
3399 register const unsigned char *_p = p;
3400 while (_p < aligned_end) {
3401 unsigned long value = *(unsigned long *) _p;
3402 if (value & ASCII_CHAR_MASK)
3403 break;
3404 _p += SIZEOF_LONG;
3405 char_count += SIZEOF_LONG;
3406 }
3407 p = _p;
3408 if (p == end)
3409 break;
3410 }
3411 }
3412 if (*p >= 0x80) {
3413 n = utf8_code_length[*p];
3414 new_max = max_char;
3415 switch (n) {
3416 /* invalid start byte */
3417 case 0:
3418 err = 1;
3419 break;
3420 case 2:
3421 /* Code points between 0x00FF and 0x07FF inclusive.
3422 Approximate the upper bound of the code point,
3423 if this flips over 255 we can be sure it will be more
3424 than 255 and the string will need 2 bytes per code coint,
3425 if it stays under or equal to 255, we can be sure 1 byte
3426 is enough.
3427 ((*p & 0b00011111) << 6) | 0b00111111 */
3428 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3429 if (max_char < upper_bound)
3430 new_max = upper_bound;
3431 /* Ensure we track at least that we left ASCII space. */
3432 if (new_max < 128)
3433 new_max = 128;
3434 break;
3435 case 3:
3436 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3437 always > 255 and <= 65535 and will always need 2 bytes. */
3438 if (max_char < 65535)
3439 new_max = 65535;
3440 break;
3441 case 4:
3442 /* Code point will be above 0xFFFF for sure in this case. */
3443 new_max = 65537;
3444 break;
3445 /* Internal error, this should be caught by the first if */
3446 case 1:
3447 default:
3448 assert(0 && "Impossible case in utf8_max_char_and_size");
3449 err = 1;
3450 }
3451 /* Instead of number of overall bytes for this code point,
3452 n containts the number of following bytes: */
3453 --n;
3454 /* Check if the follow up chars are all valid continuation bytes */
3455 if (n >= 1) {
3456 const unsigned char *cont;
3457 if ((p + n) >= end) {
3458 if (consumed == 0)
3459 /* incomplete data, non-incremental decoding */
3460 err = 1;
3461 break;
3462 }
3463 for (cont = p + 1; cont < (p + n); ++cont) {
3464 if ((*cont & 0xc0) != 0x80) {
3465 err = 1;
3466 break;
3467 }
3468 }
3469 p += n;
3470 }
3471 else
3472 err = 1;
3473 max_char = new_max;
3474 }
3475 }
3476
3477 if (unicode_size)
3478 *unicode_size = char_count;
3479 if (has_errors)
3480 *has_errors = err;
3481 return max_char;
3482}
3483
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  This is similar to PyUnicode_WRITE
   but additionally accepts PyUnicode_WCHAR_KIND, in which case `buf` is
   written as a Py_UNICODE array (the wstr field of the legacy unicode
   representation).  Any kind other than WCHAR, 1BYTE or 2BYTE is written
   as Py_UCS4.  Only one branch runs, so `buf`, `index` and `value` are
   each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int kind_ = (kind);                                           \
        if (kind_ == PyUnicode_WCHAR_KIND) {                                \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (kind_ == PyUnicode_1BYTE_KIND) {                           \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (kind_ == PyUnicode_2BYTE_KIND) {                           \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3498
Alexander Belopolsky40018472011-02-26 01:02:56 +00003499PyObject *
3500PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501 Py_ssize_t size,
3502 const char *errors,
3503 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003504{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003507 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003508 Py_ssize_t startinpos;
3509 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003510 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003512 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 PyObject *errorHandler = NULL;
3514 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515 Py_UCS4 maxchar = 0;
3516 Py_ssize_t unicode_size;
3517 Py_ssize_t i;
3518 int kind;
3519 void *data;
3520 int has_errors;
3521 Py_UNICODE *error_outptr;
3522#if SIZEOF_WCHAR_T == 2
3523 Py_ssize_t wchar_offset = 0;
3524#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525
Walter Dörwald69652032004-09-07 20:24:22 +00003526 if (size == 0) {
3527 if (consumed)
3528 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003529 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3532 consumed, &has_errors);
3533 if (has_errors) {
3534 unicode = _PyUnicode_New(size);
3535 if (!unicode)
3536 return NULL;
3537 kind = PyUnicode_WCHAR_KIND;
3538 data = PyUnicode_AS_UNICODE(unicode);
3539 assert(data != NULL);
3540 }
3541 else {
3542 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3543 if (!unicode)
3544 return NULL;
3545 /* When the string is ASCII only, just use memcpy and return.
3546 unicode_size may be != size if there is an incomplete UTF-8
3547 sequence at the end of the ASCII block. */
3548 if (maxchar < 128 && size == unicode_size) {
3549 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3550 return (PyObject *)unicode;
3551 }
3552 kind = PyUnicode_KIND(unicode);
3553 data = PyUnicode_DATA(unicode);
3554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003556 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003558 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559
3560 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003561 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562
3563 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003564 /* Fast path for runs of ASCII characters. Given that common UTF-8
3565 input will consist of an overwhelming majority of ASCII
3566 characters, we try to optimize for this case by checking
3567 as many characters as a C 'long' can contain.
3568 First, check if we can do an aligned read, as most CPUs have
3569 a penalty for unaligned reads.
3570 */
3571 if (!((size_t) s & LONG_PTR_MASK)) {
3572 /* Help register allocation */
3573 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003575 while (_s < aligned_end) {
3576 /* Read a whole long at a time (either 4 or 8 bytes),
3577 and do a fast unrolled copy if it only contains ASCII
3578 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 unsigned long value = *(unsigned long *) _s;
3580 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003581 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003582 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3583 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3584 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3585 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003586#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003587 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3588 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3589 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3590 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003591#endif
3592 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003593 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003594 }
3595 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003596 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003597 if (s == e)
3598 break;
3599 ch = (unsigned char)*s;
3600 }
3601 }
3602
3603 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605 s++;
3606 continue;
3607 }
3608
3609 n = utf8_code_length[ch];
3610
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003611 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 if (consumed)
3613 break;
3614 else {
3615 errmsg = "unexpected end of data";
3616 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003617 endinpos = startinpos+1;
3618 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3619 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 goto utf8Error;
3621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623
3624 switch (n) {
3625
3626 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003627 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 startinpos = s-starts;
3629 endinpos = startinpos+1;
3630 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
3632 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003633 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 startinpos = s-starts;
3635 endinpos = startinpos+1;
3636 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637
3638 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003639 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003640 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003642 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 goto utf8Error;
3644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003646 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 break;
3649
3650 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003651 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3652 will result in surrogates in range d800-dfff. Surrogates are
3653 not valid UTF-8 so they are rejected.
3654 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3655 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003656 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003657 (s[2] & 0xc0) != 0x80 ||
3658 ((unsigned char)s[0] == 0xE0 &&
3659 (unsigned char)s[1] < 0xA0) ||
3660 ((unsigned char)s[0] == 0xED &&
3661 (unsigned char)s[1] > 0x9F)) {
3662 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003664 endinpos = startinpos + 1;
3665
3666 /* if s[1] first two bits are 1 and 0, then the invalid
3667 continuation byte is s[2], so increment endinpos by 1,
3668 if not, s[1] is invalid and endinpos doesn't need to
3669 be incremented. */
3670 if ((s[1] & 0xC0) == 0x80)
3671 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 goto utf8Error;
3673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003675 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003676 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003677 break;
3678
3679 case 4:
3680 if ((s[1] & 0xc0) != 0x80 ||
3681 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003682 (s[3] & 0xc0) != 0x80 ||
3683 ((unsigned char)s[0] == 0xF0 &&
3684 (unsigned char)s[1] < 0x90) ||
3685 ((unsigned char)s[0] == 0xF4 &&
3686 (unsigned char)s[1] > 0x8F)) {
3687 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003689 endinpos = startinpos + 1;
3690 if ((s[1] & 0xC0) == 0x80) {
3691 endinpos++;
3692 if ((s[2] & 0xC0) == 0x80)
3693 endinpos++;
3694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 goto utf8Error;
3696 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003697 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003698 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3699 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701 /* If the string is flexible or we have native UCS-4, write
3702 directly.. */
3703 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3704 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 else {
3707 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709 /* translate from 10000..10FFFF to 0..FFFF */
3710 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003712 /* high surrogate = top 10 bits added to D800 */
3713 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3714 (Py_UNICODE)(0xD800 + (ch >> 10)));
3715
3716 /* low surrogate = bottom 10 bits added to DC00 */
3717 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3718 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3719 }
3720#if SIZEOF_WCHAR_T == 2
3721 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003722#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 }
3725 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003727
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 /* If this is not yet a resizable string, make it one.. */
3730 if (kind != PyUnicode_WCHAR_KIND) {
3731 const Py_UNICODE *u;
3732 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3733 if (!new_unicode)
3734 goto onError;
3735 u = PyUnicode_AsUnicode((PyObject *)unicode);
3736 if (!u)
3737 goto onError;
3738#if SIZEOF_WCHAR_T == 2
3739 i += wchar_offset;
3740#endif
3741 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3742 Py_DECREF(unicode);
3743 unicode = new_unicode;
3744 kind = 0;
3745 data = PyUnicode_AS_UNICODE(new_unicode);
3746 assert(data != NULL);
3747 }
3748 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 if (unicode_decode_call_errorhandler(
3750 errors, &errorHandler,
3751 "utf8", errmsg,
3752 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 /* Update data because unicode_decode_call_errorhandler might have
3756 re-created or resized the unicode object. */
3757 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003758 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760 /* Ensure the unicode_size calculation above was correct: */
3761 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3762
Walter Dörwald69652032004-09-07 20:24:22 +00003763 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 /* Adjust length and ready string when it contained errors and
3767 is of the old resizable kind. */
3768 if (kind == PyUnicode_WCHAR_KIND) {
3769 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3770 PyUnicode_READY(unicode) == -1)
3771 goto onError;
3772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_XDECREF(errorHandler);
3775 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 if (PyUnicode_READY(unicode) == -1) {
3777 Py_DECREF(unicode);
3778 return NULL;
3779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 return (PyObject *)unicode;
3781
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_XDECREF(errorHandler);
3784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 Py_DECREF(unicode);
3786 return NULL;
3787}
3788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003790
#ifdef __APPLE__

/* Stripped-down UTF-8 decoder with the surrogateescape policy built in,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t array obtained from PyMem_Malloc(), or
   NULL if the buffer size would overflow (an exception is set in that
   case) or if the allocation fails.  Each byte that does not belong to a
   valid UTF-8 sequence is emitted as the single unit 0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer, *out;
    int seqlen;

    /* The output never needs more units than there are input bytes, so
       size + 1 elements (terminator included) are always enough. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    stop = s + size;
    while (s < stop) {
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];
        int valid;
        int t;

        /* ASCII decodes as itself */
        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];

        /* Lengths 0 and 1 do not introduce a multi-byte sequence, and a
           longer sequence has to fit into what is left of the input. */
        valid = (seqlen >= 2) && (seqlen <= stop - s);

        /* Every trailing byte must have the form 10xxxxxx. */
        for (t = 1; valid && t < seqlen; t++) {
            if ((b[t] & 0xc0) != 0x80)
                valid = 0;
        }

        /* Restrictions on the second byte.  E0 and F0 would otherwise
           allow over-long forms, ED would decode to a surrogate in range
           d800-dfff (not valid UTF-8), and F4 would exceed 0x10ffff.
           See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
           (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
        if (valid &&
            ((b[0] == 0xE0 && b[1] < 0xA0) ||
             (b[0] == 0xED && b[1] > 0x9F) ||
             (b[0] == 0xF0 && b[1] < 0x90) ||
             (b[0] == 0xF4 && b[1] > 0x8F))) {
            valid = 0;
        }

        if (!valid) {
            /* surrogateescape: pass the offending byte through as
               0xDC00 + byte and resume with the byte after it */
            *out++ = 0xDC00 + ch;
            s++;
            continue;
        }

        /* Assemble the code point from the payload bits. */
        switch (seqlen) {
        case 2:
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            break;

        case 3:
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            break;

        case 4:
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
            break;
        }

#if SIZEOF_WCHAR_T == 4
        *out++ = (wchar_t)ch;
#else
        if (seqlen == 4) {
            /* does not fit into one unit: emit a surrogate pair */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
        }
        else {
            *out++ = (wchar_t)ch;
        }
#endif
        s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906/* Primary internal function which creates utf8 encoded bytes objects.
3907
3908 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003909 and allocate exactly as much space needed at the end. Else allocate the
3910 maximum possible needed (4 result bytes per Unicode character), and return
3911 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003912*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003913PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915{
Tim Peters602f7402002-04-27 18:03:26 +00003916#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003917
Guido van Rossum98297ee2007-11-06 21:34:58 +00003918 Py_ssize_t i; /* index into s of next input byte */
3919 PyObject *result; /* result string object */
3920 char *p; /* next free byte in output buffer */
3921 Py_ssize_t nallocated; /* number of result bytes allocated */
3922 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003923 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003924 PyObject *errorHandler = NULL;
3925 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 int kind;
3927 void *data;
3928 Py_ssize_t size;
3929 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3930#if SIZEOF_WCHAR_T == 2
3931 Py_ssize_t wchar_offset = 0;
3932#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 if (!PyUnicode_Check(unicode)) {
3935 PyErr_BadArgument();
3936 return NULL;
3937 }
3938
3939 if (PyUnicode_READY(unicode) == -1)
3940 return NULL;
3941
3942 if (_PyUnicode_UTF8(unicode))
3943 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3944 _PyUnicode_UTF8_LENGTH(unicode));
3945
3946 kind = PyUnicode_KIND(unicode);
3947 data = PyUnicode_DATA(unicode);
3948 size = PyUnicode_GET_LENGTH(unicode);
3949
Tim Peters602f7402002-04-27 18:03:26 +00003950 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951
Tim Peters602f7402002-04-27 18:03:26 +00003952 if (size <= MAX_SHORT_UNICHARS) {
3953 /* Write into the stack buffer; nallocated can't overflow.
3954 * At the end, we'll allocate exactly as much heap space as it
3955 * turns out we need.
3956 */
3957 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003958 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003959 p = stackbuf;
3960 }
3961 else {
3962 /* Overallocate on the heap, and give the excess back at the end. */
3963 nallocated = size * 4;
3964 if (nallocated / 4 != size) /* overflow! */
3965 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003966 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003967 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003968 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003969 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003970 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003971
Tim Peters602f7402002-04-27 18:03:26 +00003972 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003974
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003975 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003976 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003978
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00003980 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00003981 *p++ = (char)(0xc0 | (ch >> 6));
3982 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003983 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 Py_ssize_t newpos;
3985 PyObject *rep;
3986 Py_ssize_t repsize, k, startpos;
3987 startpos = i-1;
3988#if SIZEOF_WCHAR_T == 2
3989 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00003990#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 rep = unicode_encode_call_errorhandler(
3992 errors, &errorHandler, "utf-8", "surrogates not allowed",
3993 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3994 &exc, startpos, startpos+1, &newpos);
3995 if (!rep)
3996 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00003997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 if (PyBytes_Check(rep))
3999 repsize = PyBytes_GET_SIZE(rep);
4000 else
4001 repsize = PyUnicode_GET_SIZE(rep);
4002
4003 if (repsize > 4) {
4004 Py_ssize_t offset;
4005
4006 if (result == NULL)
4007 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004008 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4012 /* integer overflow */
4013 PyErr_NoMemory();
4014 goto error;
4015 }
4016 nallocated += repsize - 4;
4017 if (result != NULL) {
4018 if (_PyBytes_Resize(&result, nallocated) < 0)
4019 goto error;
4020 } else {
4021 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004022 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 goto error;
4024 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4025 }
4026 p = PyBytes_AS_STRING(result) + offset;
4027 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 if (PyBytes_Check(rep)) {
4030 char *prep = PyBytes_AS_STRING(rep);
4031 for(k = repsize; k > 0; k--)
4032 *p++ = *prep++;
4033 } else /* rep is unicode */ {
4034 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4035 Py_UNICODE c;
4036
4037 for(k=0; k<repsize; k++) {
4038 c = prep[k];
4039 if (0x80 <= c) {
4040 raise_encode_exception(&exc, "utf-8",
4041 PyUnicode_AS_UNICODE(unicode),
4042 size, i-1, i,
4043 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004044 goto error;
4045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004047 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004050 } else if (ch < 0x10000) {
4051 *p++ = (char)(0xe0 | (ch >> 12));
4052 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4053 *p++ = (char)(0x80 | (ch & 0x3f));
4054 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004055 /* Encode UCS4 Unicode ordinals */
4056 *p++ = (char)(0xf0 | (ch >> 18));
4057 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4058 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4059 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060#if SIZEOF_WCHAR_T == 2
4061 wchar_offset++;
4062#endif
Tim Peters602f7402002-04-27 18:03:26 +00004063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004065
Guido van Rossum98297ee2007-11-06 21:34:58 +00004066 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004067 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004068 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004069 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004070 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004071 }
4072 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004073 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004074 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004075 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004076 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004079 Py_XDECREF(errorHandler);
4080 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004081 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004082 error:
4083 Py_XDECREF(errorHandler);
4084 Py_XDECREF(exc);
4085 Py_XDECREF(result);
4086 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004087
Tim Peters602f7402002-04-27 18:03:26 +00004088#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089}
4090
Alexander Belopolsky40018472011-02-26 01:02:56 +00004091PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4093 Py_ssize_t size,
4094 const char *errors)
4095{
4096 PyObject *v, *unicode;
4097
4098 unicode = PyUnicode_FromUnicode(s, size);
4099 if (unicode == NULL)
4100 return NULL;
4101 v = _PyUnicode_AsUTF8String(unicode, errors);
4102 Py_DECREF(unicode);
4103 return v;
4104}
4105
4106PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004107PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110}
4111
Walter Dörwald41980ca2007-08-16 21:55:45 +00004112/* --- UTF-32 Codec ------------------------------------------------------- */
4113
4114PyObject *
4115PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 Py_ssize_t size,
4117 const char *errors,
4118 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004119{
4120 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4121}
4122
4123PyObject *
4124PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 Py_ssize_t size,
4126 const char *errors,
4127 int *byteorder,
4128 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004129{
4130 const char *starts = s;
4131 Py_ssize_t startinpos;
4132 Py_ssize_t endinpos;
4133 Py_ssize_t outpos;
4134 PyUnicodeObject *unicode;
4135 Py_UNICODE *p;
4136#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004137 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004138 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004139#else
4140 const int pairs = 0;
4141#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004142 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004143 int bo = 0; /* assume native ordering by default */
4144 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004145 /* Offsets from q for retrieving bytes in the right order. */
4146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4147 int iorder[] = {0, 1, 2, 3};
4148#else
4149 int iorder[] = {3, 2, 1, 0};
4150#endif
4151 PyObject *errorHandler = NULL;
4152 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004153
Walter Dörwald41980ca2007-08-16 21:55:45 +00004154 q = (unsigned char *)s;
4155 e = q + size;
4156
4157 if (byteorder)
4158 bo = *byteorder;
4159
4160 /* Check for BOM marks (U+FEFF) in the input and adjust current
4161 byte order setting accordingly. In native mode, the leading BOM
4162 mark is skipped, in all other modes, it is copied to the output
4163 stream as-is (giving a ZWNBSP character). */
4164 if (bo == 0) {
4165 if (size >= 4) {
4166 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004168#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 if (bom == 0x0000FEFF) {
4170 q += 4;
4171 bo = -1;
4172 }
4173 else if (bom == 0xFFFE0000) {
4174 q += 4;
4175 bo = 1;
4176 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004177#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 if (bom == 0x0000FEFF) {
4179 q += 4;
4180 bo = 1;
4181 }
4182 else if (bom == 0xFFFE0000) {
4183 q += 4;
4184 bo = -1;
4185 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004186#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004188 }
4189
4190 if (bo == -1) {
4191 /* force LE */
4192 iorder[0] = 0;
4193 iorder[1] = 1;
4194 iorder[2] = 2;
4195 iorder[3] = 3;
4196 }
4197 else if (bo == 1) {
4198 /* force BE */
4199 iorder[0] = 3;
4200 iorder[1] = 2;
4201 iorder[2] = 1;
4202 iorder[3] = 0;
4203 }
4204
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004205 /* On narrow builds we split characters outside the BMP into two
4206 codepoints => count how much extra space we need. */
4207#ifndef Py_UNICODE_WIDE
4208 for (qq = q; qq < e; qq += 4)
4209 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4210 pairs++;
4211#endif
4212
4213 /* This might be one to much, because of a BOM */
4214 unicode = _PyUnicode_New((size+3)/4+pairs);
4215 if (!unicode)
4216 return NULL;
4217 if (size == 0)
4218 return (PyObject *)unicode;
4219
4220 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004222
Walter Dörwald41980ca2007-08-16 21:55:45 +00004223 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 Py_UCS4 ch;
4225 /* remaining bytes at the end? (size should be divisible by 4) */
4226 if (e-q<4) {
4227 if (consumed)
4228 break;
4229 errmsg = "truncated data";
4230 startinpos = ((const char *)q)-starts;
4231 endinpos = ((const char *)e)-starts;
4232 goto utf32Error;
4233 /* The remaining input chars are ignored if the callback
4234 chooses to skip the input */
4235 }
4236 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4237 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004238
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 if (ch >= 0x110000)
4240 {
4241 errmsg = "codepoint not in range(0x110000)";
4242 startinpos = ((const char *)q)-starts;
4243 endinpos = startinpos+4;
4244 goto utf32Error;
4245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 if (ch >= 0x10000)
4248 {
4249 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4250 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4251 }
4252 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004253#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 *p++ = ch;
4255 q += 4;
4256 continue;
4257 utf32Error:
4258 outpos = p-PyUnicode_AS_UNICODE(unicode);
4259 if (unicode_decode_call_errorhandler(
4260 errors, &errorHandler,
4261 "utf32", errmsg,
4262 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4263 &unicode, &outpos, &p))
4264 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004265 }
4266
4267 if (byteorder)
4268 *byteorder = bo;
4269
4270 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004272
4273 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004275 goto onError;
4276
4277 Py_XDECREF(errorHandler);
4278 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 if (PyUnicode_READY(unicode) == -1) {
4280 Py_DECREF(unicode);
4281 return NULL;
4282 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004283 return (PyObject *)unicode;
4284
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004286 Py_DECREF(unicode);
4287 Py_XDECREF(errorHandler);
4288 Py_XDECREF(exc);
4289 return NULL;
4290}
4291
4292PyObject *
4293PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_ssize_t size,
4295 const char *errors,
4296 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004297{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004298 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004299 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004300 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004301#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004302 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004303#else
4304 const int pairs = 0;
4305#endif
4306 /* Offsets from p for storing byte pairs in the right order. */
4307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4308 int iorder[] = {0, 1, 2, 3};
4309#else
4310 int iorder[] = {3, 2, 1, 0};
4311#endif
4312
Benjamin Peterson29060642009-01-31 22:14:21 +00004313#define STORECHAR(CH) \
4314 do { \
4315 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4316 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4317 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4318 p[iorder[0]] = (CH) & 0xff; \
4319 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004320 } while(0)
4321
4322 /* In narrow builds we can output surrogate pairs as one codepoint,
4323 so we need less space. */
4324#ifndef Py_UNICODE_WIDE
4325 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4327 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4328 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004329#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004330 nsize = (size - pairs + (byteorder == 0));
4331 bytesize = nsize * 4;
4332 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004334 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004335 if (v == NULL)
4336 return NULL;
4337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004338 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004339 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004341 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004342 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343
4344 if (byteorder == -1) {
4345 /* force LE */
4346 iorder[0] = 0;
4347 iorder[1] = 1;
4348 iorder[2] = 2;
4349 iorder[3] = 3;
4350 }
4351 else if (byteorder == 1) {
4352 /* force BE */
4353 iorder[0] = 3;
4354 iorder[1] = 2;
4355 iorder[2] = 1;
4356 iorder[3] = 0;
4357 }
4358
4359 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004361#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4363 Py_UCS4 ch2 = *s;
4364 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4365 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4366 s++;
4367 size--;
4368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004369 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370#endif
4371 STORECHAR(ch);
4372 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004373
4374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004375 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004376#undef STORECHAR
4377}
4378
Alexander Belopolsky40018472011-02-26 01:02:56 +00004379PyObject *
4380PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004381{
4382 if (!PyUnicode_Check(unicode)) {
4383 PyErr_BadArgument();
4384 return NULL;
4385 }
4386 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 PyUnicode_GET_SIZE(unicode),
4388 NULL,
4389 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004390}
4391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392/* --- UTF-16 Codec ------------------------------------------------------- */
4393
Tim Peters772747b2001-08-09 22:21:55 +00004394PyObject *
4395PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 Py_ssize_t size,
4397 const char *errors,
4398 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399{
Walter Dörwald69652032004-09-07 20:24:22 +00004400 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4401}
4402
Antoine Pitrouab868312009-01-10 15:40:25 +00004403/* Two masks for fast checking of whether a C 'long' may contain
4404 UTF16-encoded surrogate characters. This is an efficient heuristic,
4405 assuming that non-surrogate characters with a code point >= 0x8000 are
4406 rare in most input.
4407 FAST_CHAR_MASK is used when the input is in native byte ordering,
4408 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004409*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004410#if (SIZEOF_LONG == 8)
4411# define FAST_CHAR_MASK 0x8000800080008000L
4412# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4413#elif (SIZEOF_LONG == 4)
4414# define FAST_CHAR_MASK 0x80008000L
4415# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4416#else
4417# error C 'long' size should be either 4 or 8!
4418#endif
4419
Walter Dörwald69652032004-09-07 20:24:22 +00004420PyObject *
4421PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 Py_ssize_t size,
4423 const char *errors,
4424 int *byteorder,
4425 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004426{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004428 Py_ssize_t startinpos;
4429 Py_ssize_t endinpos;
4430 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 PyUnicodeObject *unicode;
4432 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004433 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004434 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004435 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004436 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004437 /* Offsets from q for retrieving byte pairs in the right order. */
4438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4439 int ihi = 1, ilo = 0;
4440#else
4441 int ihi = 0, ilo = 1;
4442#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 PyObject *errorHandler = NULL;
4444 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
4446 /* Note: size will always be longer than the resulting Unicode
4447 character count */
4448 unicode = _PyUnicode_New(size);
4449 if (!unicode)
4450 return NULL;
4451 if (size == 0)
4452 return (PyObject *)unicode;
4453
4454 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004456 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004457 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458
4459 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004460 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004462 /* Check for BOM marks (U+FEFF) in the input and adjust current
4463 byte order setting accordingly. In native mode, the leading BOM
4464 mark is skipped, in all other modes, it is copied to the output
4465 stream as-is (giving a ZWNBSP character). */
4466 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004467 if (size >= 2) {
4468 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 if (bom == 0xFEFF) {
4471 q += 2;
4472 bo = -1;
4473 }
4474 else if (bom == 0xFFFE) {
4475 q += 2;
4476 bo = 1;
4477 }
Tim Petersced69f82003-09-16 20:30:58 +00004478#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 if (bom == 0xFEFF) {
4480 q += 2;
4481 bo = 1;
4482 }
4483 else if (bom == 0xFFFE) {
4484 q += 2;
4485 bo = -1;
4486 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
Tim Peters772747b2001-08-09 22:21:55 +00004491 if (bo == -1) {
4492 /* force LE */
4493 ihi = 1;
4494 ilo = 0;
4495 }
4496 else if (bo == 1) {
4497 /* force BE */
4498 ihi = 0;
4499 ilo = 1;
4500 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004501#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4502 native_ordering = ilo < ihi;
4503#else
4504 native_ordering = ilo > ihi;
4505#endif
Tim Peters772747b2001-08-09 22:21:55 +00004506
Antoine Pitrouab868312009-01-10 15:40:25 +00004507 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004508 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004510 /* First check for possible aligned read of a C 'long'. Unaligned
4511 reads are more expensive, better to defer to another iteration. */
4512 if (!((size_t) q & LONG_PTR_MASK)) {
4513 /* Fast path for runs of non-surrogate chars. */
4514 register const unsigned char *_q = q;
4515 Py_UNICODE *_p = p;
4516 if (native_ordering) {
4517 /* Native ordering is simple: as long as the input cannot
4518 possibly contain a surrogate char, do an unrolled copy
4519 of several 16-bit code points to the target object.
4520 The non-surrogate check is done on several input bytes
4521 at a time (as many as a C 'long' can contain). */
4522 while (_q < aligned_end) {
4523 unsigned long data = * (unsigned long *) _q;
4524 if (data & FAST_CHAR_MASK)
4525 break;
4526 _p[0] = ((unsigned short *) _q)[0];
4527 _p[1] = ((unsigned short *) _q)[1];
4528#if (SIZEOF_LONG == 8)
4529 _p[2] = ((unsigned short *) _q)[2];
4530 _p[3] = ((unsigned short *) _q)[3];
4531#endif
4532 _q += SIZEOF_LONG;
4533 _p += SIZEOF_LONG / 2;
4534 }
4535 }
4536 else {
4537 /* Byteswapped ordering is similar, but we must decompose
4538 the copy bytewise, and take care of zero'ing out the
4539 upper bytes if the target object is in 32-bit units
4540 (that is, in UCS-4 builds). */
4541 while (_q < aligned_end) {
4542 unsigned long data = * (unsigned long *) _q;
4543 if (data & SWAPPED_FAST_CHAR_MASK)
4544 break;
4545 /* Zero upper bytes in UCS-4 builds */
4546#if (Py_UNICODE_SIZE > 2)
4547 _p[0] = 0;
4548 _p[1] = 0;
4549#if (SIZEOF_LONG == 8)
4550 _p[2] = 0;
4551 _p[3] = 0;
4552#endif
4553#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004554 /* Issue #4916; UCS-4 builds on big endian machines must
4555 fill the two last bytes of each 4-byte unit. */
4556#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4557# define OFF 2
4558#else
4559# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004560#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004561 ((unsigned char *) _p)[OFF + 1] = _q[0];
4562 ((unsigned char *) _p)[OFF + 0] = _q[1];
4563 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4564 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4565#if (SIZEOF_LONG == 8)
4566 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4567 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4568 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4569 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4570#endif
4571#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004572 _q += SIZEOF_LONG;
4573 _p += SIZEOF_LONG / 2;
4574 }
4575 }
4576 p = _p;
4577 q = _q;
4578 if (q >= e)
4579 break;
4580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582
Benjamin Peterson14339b62009-01-31 16:36:08 +00004583 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004584
4585 if (ch < 0xD800 || ch > 0xDFFF) {
4586 *p++ = ch;
4587 continue;
4588 }
4589
4590 /* UTF-16 code pair: */
4591 if (q > e) {
4592 errmsg = "unexpected end of data";
4593 startinpos = (((const char *)q) - 2) - starts;
4594 endinpos = ((const char *)e) + 1 - starts;
4595 goto utf16Error;
4596 }
4597 if (0xD800 <= ch && ch <= 0xDBFF) {
4598 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4599 q += 2;
4600 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004601#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 *p++ = ch;
4603 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004604#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004606#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 continue;
4608 }
4609 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004610 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 startinpos = (((const char *)q)-4)-starts;
4612 endinpos = startinpos+2;
4613 goto utf16Error;
4614 }
4615
Benjamin Peterson14339b62009-01-31 16:36:08 +00004616 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 errmsg = "illegal encoding";
4618 startinpos = (((const char *)q)-2)-starts;
4619 endinpos = startinpos+2;
4620 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004621
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 utf16Error:
4623 outpos = p - PyUnicode_AS_UNICODE(unicode);
4624 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004625 errors,
4626 &errorHandler,
4627 "utf16", errmsg,
4628 &starts,
4629 (const char **)&e,
4630 &startinpos,
4631 &endinpos,
4632 &exc,
4633 (const char **)&q,
4634 &unicode,
4635 &outpos,
4636 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004639 /* remaining byte at the end? (size should be even) */
4640 if (e == q) {
4641 if (!consumed) {
4642 errmsg = "truncated data";
4643 startinpos = ((const char *)q) - starts;
4644 endinpos = ((const char *)e) + 1 - starts;
4645 outpos = p - PyUnicode_AS_UNICODE(unicode);
4646 if (unicode_decode_call_errorhandler(
4647 errors,
4648 &errorHandler,
4649 "utf16", errmsg,
4650 &starts,
4651 (const char **)&e,
4652 &startinpos,
4653 &endinpos,
4654 &exc,
4655 (const char **)&q,
4656 &unicode,
4657 &outpos,
4658 &p))
4659 goto onError;
4660 /* The remaining input chars are ignored if the callback
4661 chooses to skip the input */
4662 }
4663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664
4665 if (byteorder)
4666 *byteorder = bo;
4667
Walter Dörwald69652032004-09-07 20:24:22 +00004668 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004672 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 goto onError;
4674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675 Py_XDECREF(errorHandler);
4676 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 if (PyUnicode_READY(unicode) == -1) {
4678 Py_DECREF(unicode);
4679 return NULL;
4680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 return (PyObject *)unicode;
4682
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 Py_XDECREF(errorHandler);
4686 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 return NULL;
4688}
4689
Antoine Pitrouab868312009-01-10 15:40:25 +00004690#undef FAST_CHAR_MASK
4691#undef SWAPPED_FAST_CHAR_MASK
4692
Tim Peters772747b2001-08-09 22:21:55 +00004693PyObject *
4694PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 Py_ssize_t size,
4696 const char *errors,
4697 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004699 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004700 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004701 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004702#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004703 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004704#else
4705 const int pairs = 0;
4706#endif
Tim Peters772747b2001-08-09 22:21:55 +00004707 /* Offsets from p for storing byte pairs in the right order. */
4708#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4709 int ihi = 1, ilo = 0;
4710#else
4711 int ihi = 0, ilo = 1;
4712#endif
4713
Benjamin Peterson29060642009-01-31 22:14:21 +00004714#define STORECHAR(CH) \
4715 do { \
4716 p[ihi] = ((CH) >> 8) & 0xff; \
4717 p[ilo] = (CH) & 0xff; \
4718 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004719 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004721#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004722 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 if (s[i] >= 0x10000)
4724 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004725#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004726 /* 2 * (size + pairs + (byteorder == 0)) */
4727 if (size > PY_SSIZE_T_MAX ||
4728 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004730 nsize = size + pairs + (byteorder == 0);
4731 bytesize = nsize * 2;
4732 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004734 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 if (v == NULL)
4736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004741 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004742 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004743
4744 if (byteorder == -1) {
4745 /* force LE */
4746 ihi = 1;
4747 ilo = 0;
4748 }
4749 else if (byteorder == 1) {
4750 /* force BE */
4751 ihi = 0;
4752 ilo = 1;
4753 }
4754
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004755 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 Py_UNICODE ch = *s++;
4757 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004758#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 if (ch >= 0x10000) {
4760 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4761 ch = 0xD800 | ((ch-0x10000) >> 10);
4762 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004763#endif
Tim Peters772747b2001-08-09 22:21:55 +00004764 STORECHAR(ch);
4765 if (ch2)
4766 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004767 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004768
4769 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004770 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004771#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772}
4773
Alexander Belopolsky40018472011-02-26 01:02:56 +00004774PyObject *
4775PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776{
4777 if (!PyUnicode_Check(unicode)) {
4778 PyErr_BadArgument();
4779 return NULL;
4780 }
4781 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 PyUnicode_GET_SIZE(unicode),
4783 NULL,
4784 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785}
4786
4787/* --- Unicode Escape Codec ----------------------------------------------- */
4788
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s     - input bytes (need not be NUL-terminated; only `size` bytes are read)
   size  - number of bytes in `s`

   Returns -1 if the decoded result is not guaranteed to be pure ASCII:
   a byte > 127 is present, an escape that can produce arbitrary code
   points is used (octal, \x, \u, \U, \N), the input ends in a lone
   backslash, or `size` is negative.  Otherwise returns the number of
   characters the decoded string will contain.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before computing `s + size`; forming a
       pointer in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4843
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND the value is stored as a Py_UNICODE
   element of `buf`; for any other kind it is stored as a single byte.
   `index` and `value` are each evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE element of `buf`.

   NOTE: this macro reads a variable named `kind` from the scope where it
   is expanded (it is not a macro parameter) and asserts that it equals
   PyUnicode_WCHAR_KIND.  The whole expansion is parenthesized so that the
   comma expression stays a single operand wherever it is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4857
4858
Fredrik Lundh06d12682001-01-24 07:59:11 +00004859static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004860
Alexander Belopolsky40018472011-02-26 01:02:56 +00004861PyObject *
4862PyUnicode_DecodeUnicodeEscape(const char *s,
4863 Py_ssize_t size,
4864 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004867 Py_ssize_t startinpos;
4868 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004869 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004873 char* message;
4874 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 PyObject *errorHandler = NULL;
4876 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877 Py_ssize_t ascii_length;
4878 Py_ssize_t i;
4879 int kind;
4880 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 ascii_length = length_of_escaped_ascii_string(s, size);
4883
4884 /* After length_of_escaped_ascii_string() there are two alternatives,
4885 either the string is pure ASCII with named escapes like \n, etc.
4886 and we determined it's exact size (common case)
4887 or it contains \x, \u, ... escape sequences. then we create a
4888 legacy wchar string and resize it at the end of this function. */
4889 if (ascii_length >= 0) {
4890 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4891 if (!v)
4892 goto onError;
4893 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4894 kind = PyUnicode_1BYTE_KIND;
4895 data = PyUnicode_DATA(v);
4896 }
4897 else {
4898 /* Escaped strings will always be longer than the resulting
4899 Unicode string, so we start with size here and then reduce the
4900 length after conversion to the true value.
4901 (but if the error callback returns a long replacement string
4902 we'll have to allocate more space) */
4903 v = _PyUnicode_New(size);
4904 if (!v)
4905 goto onError;
4906 kind = PyUnicode_WCHAR_KIND;
4907 data = PyUnicode_AS_UNICODE(v);
4908 }
4909
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 if (size == 0)
4911 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004914
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 while (s < end) {
4916 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004917 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004920 if (kind == PyUnicode_WCHAR_KIND) {
4921 assert(i < _PyUnicode_WSTR_LENGTH(v));
4922 }
4923 else {
4924 /* The only case in which i == ascii_length is a backslash
4925 followed by a newline. */
4926 assert(i <= ascii_length);
4927 }
4928
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 /* Non-escape characters are interpreted as Unicode ordinals */
4930 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004931 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 continue;
4933 }
4934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 /* \ - Escapes */
4937 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004938 c = *s++;
4939 if (s > end)
4940 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004941
4942 if (kind == PyUnicode_WCHAR_KIND) {
4943 assert(i < _PyUnicode_WSTR_LENGTH(v));
4944 }
4945 else {
4946 /* The only case in which i == ascii_length is a backslash
4947 followed by a newline. */
4948 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4949 }
4950
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004951 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004955 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4956 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4957 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4958 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4959 /* FF */
4960 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4961 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4962 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4963 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4964 /* VT */
4965 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4966 /* BEL, not classic C */
4967 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 case '0': case '1': case '2': case '3':
4971 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004972 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004973 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004974 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004975 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004976 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004978 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 break;
4980
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 /* hex escapes */
4982 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004984 digits = 2;
4985 message = "truncated \\xXX escape";
4986 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004990 digits = 4;
4991 message = "truncated \\uXXXX escape";
4992 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00004995 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00004996 digits = 8;
4997 message = "truncated \\UXXXXXXXX escape";
4998 hexescape:
4999 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005000 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005001 if (s+digits>end) {
5002 endinpos = size;
5003 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 errors, &errorHandler,
5005 "unicodeescape", "end of string in escape sequence",
5006 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005009 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 goto nextByte;
5011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005012 for (j = 0; j < digits; ++j) {
5013 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005014 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005015 endinpos = (s+j+1)-starts;
5016 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 errors, &errorHandler,
5019 "unicodeescape", message,
5020 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005021 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005022 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005023 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005025 }
5026 chr = (chr<<4) & ~0xF;
5027 if (c >= '0' && c <= '9')
5028 chr += c - '0';
5029 else if (c >= 'a' && c <= 'f')
5030 chr += 10 + c - 'a';
5031 else
5032 chr += 10 + c - 'A';
5033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005034 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005035 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 /* _decoding_error will have already written into the
5037 target buffer. */
5038 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005039 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005040 /* when we get here, chr is a 32-bit unicode character */
5041 if (chr <= 0xffff)
5042 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005044 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005045 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005046 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005047#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005049#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005050 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5052 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005053#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005054 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005056 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 errors, &errorHandler,
5059 "unicodeescape", "illegal Unicode character",
5060 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005061 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005062 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005063 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005064 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005065 break;
5066
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005068 case 'N':
5069 message = "malformed \\N character escape";
5070 if (ucnhash_CAPI == NULL) {
5071 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5073 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005074 if (ucnhash_CAPI == NULL)
5075 goto ucnhashError;
5076 }
5077 if (*s == '{') {
5078 const char *start = s+1;
5079 /* look for the closing brace */
5080 while (*s != '}' && s < end)
5081 s++;
5082 if (s > start && s < end && *s == '}') {
5083 /* found a name. look it up in the unicode database */
5084 message = "unknown Unicode character name";
5085 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005086 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5087 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005088 goto store;
5089 }
5090 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005091 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 errors, &errorHandler,
5095 "unicodeescape", message,
5096 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005097 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005098 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005099 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005100 break;
5101
5102 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005103 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005104 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 message = "\\ at end of string";
5106 s--;
5107 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 errors, &errorHandler,
5111 "unicodeescape", message,
5112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005114 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005116 }
5117 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5119 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005120 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005121 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005124 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 /* Ensure the length prediction worked in case of ASCII strings */
5127 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5128
5129 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5130 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005132 Py_XDECREF(errorHandler);
5133 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005135
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005137 PyErr_SetString(
5138 PyExc_UnicodeError,
5139 "\\N escapes not supported (can't load unicodedata module)"
5140 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005141 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 Py_XDECREF(errorHandler);
5143 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005144 return NULL;
5145
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148 Py_XDECREF(errorHandler);
5149 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return NULL;
5151}
5152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153#undef WRITE_ASCII_OR_WSTR
5154#undef WRITE_WSTR
5155
/* Return a Unicode-Escape encoded version of the given Unicode data
   as a bytes object.

   Note: the encoder below takes no `quotes` argument and never wraps
   the result in u"" or u'' quotes; only the escaped text is produced.

*/
5162
Walter Dörwald79e913e2007-05-12 11:08:06 +00005163static const char *hexdigits = "0123456789abcdef";
5164
Alexander Belopolsky40018472011-02-26 01:02:56 +00005165PyObject *
5166PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5167 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005169 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005172#ifdef Py_UNICODE_WIDE
5173 const Py_ssize_t expandsize = 10;
5174#else
5175 const Py_ssize_t expandsize = 6;
5176#endif
5177
Thomas Wouters89f507f2006-12-13 04:49:30 +00005178 /* XXX(nnorwitz): rather than over-allocating, it would be
5179 better to choose a different scheme. Perhaps scan the
5180 first N-chars of the string and allocate based on that size.
5181 */
5182 /* Initial allocation is based on the longest-possible unichr
5183 escape.
5184
5185 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5186 unichr, so in this case it's the longest unichr escape. In
5187 narrow (UTF-16) builds this is five chars per source unichr
5188 since there are two unichrs in the surrogate pair, so in narrow
5189 (UTF-16) builds it's not the longest unichr escape.
5190
5191 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5192 so in the narrow (UTF-16) build case it's the longest unichr
5193 escape.
5194 */
5195
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005196 if (size == 0)
5197 return PyBytes_FromStringAndSize(NULL, 0);
5198
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005199 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005201
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005202 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 2
5204 + expandsize*size
5205 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 if (repr == NULL)
5207 return NULL;
5208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005209 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211 while (size-- > 0) {
5212 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005213
Walter Dörwald79e913e2007-05-12 11:08:06 +00005214 /* Escape backslashes */
5215 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 *p++ = '\\';
5217 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005218 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005219 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005220
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005221#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005222 /* Map 21-bit characters to '\U00xxxxxx' */
5223 else if (ch >= 0x10000) {
5224 *p++ = '\\';
5225 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005226 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5227 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5228 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5229 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5230 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5231 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5232 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5233 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005235 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005236#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5238 else if (ch >= 0xD800 && ch < 0xDC00) {
5239 Py_UNICODE ch2;
5240 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005241
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 ch2 = *s++;
5243 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005244 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5246 *p++ = '\\';
5247 *p++ = 'U';
5248 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5249 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5250 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5251 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5252 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5253 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5254 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5255 *p++ = hexdigits[ucs & 0x0000000F];
5256 continue;
5257 }
5258 /* Fall through: isolated surrogates are copied as-is */
5259 s--;
5260 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005261 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005262#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005263
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005265 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 *p++ = '\\';
5267 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005268 *p++ = hexdigits[(ch >> 12) & 0x000F];
5269 *p++ = hexdigits[(ch >> 8) & 0x000F];
5270 *p++ = hexdigits[(ch >> 4) & 0x000F];
5271 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005273
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005274 /* Map special whitespace to '\t', \n', '\r' */
5275 else if (ch == '\t') {
5276 *p++ = '\\';
5277 *p++ = 't';
5278 }
5279 else if (ch == '\n') {
5280 *p++ = '\\';
5281 *p++ = 'n';
5282 }
5283 else if (ch == '\r') {
5284 *p++ = '\\';
5285 *p++ = 'r';
5286 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005287
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005288 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005289 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005291 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005292 *p++ = hexdigits[(ch >> 4) & 0x000F];
5293 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005294 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 /* Copy everything else as-is */
5297 else
5298 *p++ = (char) ch;
5299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005301 assert(p - PyBytes_AS_STRING(repr) > 0);
5302 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5303 return NULL;
5304 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305}
5306
Alexander Belopolsky40018472011-02-26 01:02:56 +00005307PyObject *
5308PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005310 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 if (!PyUnicode_Check(unicode)) {
5312 PyErr_BadArgument();
5313 return NULL;
5314 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005315 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5316 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005317 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318}
5319
5320/* --- Raw Unicode Escape Codec ------------------------------------------- */
5321
Alexander Belopolsky40018472011-02-26 01:02:56 +00005322PyObject *
5323PyUnicode_DecodeRawUnicodeEscape(const char *s,
5324 Py_ssize_t size,
5325 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005328 Py_ssize_t startinpos;
5329 Py_ssize_t endinpos;
5330 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 const char *end;
5334 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335 PyObject *errorHandler = NULL;
5336 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 /* Escaped strings will always be longer than the resulting
5339 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 length after conversion to the true value. (But decoding error
5341 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 v = _PyUnicode_New(size);
5343 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 end = s + size;
5349 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 unsigned char c;
5351 Py_UCS4 x;
5352 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005353 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 /* Non-escape characters are interpreted as Unicode ordinals */
5356 if (*s != '\\') {
5357 *p++ = (unsigned char)*s++;
5358 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 startinpos = s-starts;
5361
5362 /* \u-escapes are only interpreted iff the number of leading
5363 backslashes if odd */
5364 bs = s;
5365 for (;s < end;) {
5366 if (*s != '\\')
5367 break;
5368 *p++ = (unsigned char)*s++;
5369 }
5370 if (((s - bs) & 1) == 0 ||
5371 s >= end ||
5372 (*s != 'u' && *s != 'U')) {
5373 continue;
5374 }
5375 p--;
5376 count = *s=='u' ? 4 : 8;
5377 s++;
5378
5379 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5380 outpos = p-PyUnicode_AS_UNICODE(v);
5381 for (x = 0, i = 0; i < count; ++i, ++s) {
5382 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005383 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 endinpos = s-starts;
5385 if (unicode_decode_call_errorhandler(
5386 errors, &errorHandler,
5387 "rawunicodeescape", "truncated \\uXXXX",
5388 &starts, &end, &startinpos, &endinpos, &exc, &s,
5389 &v, &outpos, &p))
5390 goto onError;
5391 goto nextByte;
5392 }
5393 x = (x<<4) & ~0xF;
5394 if (c >= '0' && c <= '9')
5395 x += c - '0';
5396 else if (c >= 'a' && c <= 'f')
5397 x += 10 + c - 'a';
5398 else
5399 x += 10 + c - 'A';
5400 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005401 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 /* UCS-2 character */
5403 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005404 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 /* UCS-4 character. Either store directly, or as
5406 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005407#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005409#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 x -= 0x10000L;
5411 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5412 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005413#endif
5414 } else {
5415 endinpos = s-starts;
5416 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005417 if (unicode_decode_call_errorhandler(
5418 errors, &errorHandler,
5419 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 &starts, &end, &startinpos, &endinpos, &exc, &s,
5421 &v, &outpos, &p))
5422 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 nextByte:
5425 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005427 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 Py_XDECREF(errorHandler);
5430 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 if (PyUnicode_READY(v) == -1) {
5432 Py_DECREF(v);
5433 return NULL;
5434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005436
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439 Py_XDECREF(errorHandler);
5440 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 return NULL;
5442}
5443
Alexander Belopolsky40018472011-02-26 01:02:56 +00005444PyObject *
5445PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5446 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005448 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 char *p;
5450 char *q;
5451
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005452#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005453 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005454#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005455 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005456#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005457
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005458 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005460
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005461 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 if (repr == NULL)
5463 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005464 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005465 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005467 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 while (size-- > 0) {
5469 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005470#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 /* Map 32-bit characters to '\Uxxxxxxxx' */
5472 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005473 *p++ = '\\';
5474 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005475 *p++ = hexdigits[(ch >> 28) & 0xf];
5476 *p++ = hexdigits[(ch >> 24) & 0xf];
5477 *p++ = hexdigits[(ch >> 20) & 0xf];
5478 *p++ = hexdigits[(ch >> 16) & 0xf];
5479 *p++ = hexdigits[(ch >> 12) & 0xf];
5480 *p++ = hexdigits[(ch >> 8) & 0xf];
5481 *p++ = hexdigits[(ch >> 4) & 0xf];
5482 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005483 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005484 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005485#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5487 if (ch >= 0xD800 && ch < 0xDC00) {
5488 Py_UNICODE ch2;
5489 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 ch2 = *s++;
5492 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005493 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5495 *p++ = '\\';
5496 *p++ = 'U';
5497 *p++ = hexdigits[(ucs >> 28) & 0xf];
5498 *p++ = hexdigits[(ucs >> 24) & 0xf];
5499 *p++ = hexdigits[(ucs >> 20) & 0xf];
5500 *p++ = hexdigits[(ucs >> 16) & 0xf];
5501 *p++ = hexdigits[(ucs >> 12) & 0xf];
5502 *p++ = hexdigits[(ucs >> 8) & 0xf];
5503 *p++ = hexdigits[(ucs >> 4) & 0xf];
5504 *p++ = hexdigits[ucs & 0xf];
5505 continue;
5506 }
5507 /* Fall through: isolated surrogates are copied as-is */
5508 s--;
5509 size++;
5510 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005511#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 /* Map 16-bit characters to '\uxxxx' */
5513 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 *p++ = '\\';
5515 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005516 *p++ = hexdigits[(ch >> 12) & 0xf];
5517 *p++ = hexdigits[(ch >> 8) & 0xf];
5518 *p++ = hexdigits[(ch >> 4) & 0xf];
5519 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* Copy everything else as-is */
5522 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 *p++ = (char) ch;
5524 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005525 size = p - q;
5526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005527 assert(size > 0);
5528 if (_PyBytes_Resize(&repr, size) < 0)
5529 return NULL;
5530 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531}
5532
Alexander Belopolsky40018472011-02-26 01:02:56 +00005533PyObject *
5534PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005536 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005538 PyErr_BadArgument();
5539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005541 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5542 PyUnicode_GET_SIZE(unicode));
5543
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005544 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005547/* --- Unicode Internal Codec ------------------------------------------- */
5548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
5550_PyUnicode_DecodeUnicodeInternal(const char *s,
5551 Py_ssize_t size,
5552 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005553{
5554 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005555 Py_ssize_t startinpos;
5556 Py_ssize_t endinpos;
5557 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005558 PyUnicodeObject *v;
5559 Py_UNICODE *p;
5560 const char *end;
5561 const char *reason;
5562 PyObject *errorHandler = NULL;
5563 PyObject *exc = NULL;
5564
Neal Norwitzd43069c2006-01-08 01:12:10 +00005565#ifdef Py_UNICODE_WIDE
5566 Py_UNICODE unimax = PyUnicode_GetMax();
5567#endif
5568
Thomas Wouters89f507f2006-12-13 04:49:30 +00005569 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005570 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5571 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5574 as string was created with the old API. */
5575 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005577 p = PyUnicode_AS_UNICODE(v);
5578 end = s + size;
5579
5580 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005581 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005582 /* We have to sanity check the raw data, otherwise doom looms for
5583 some malformed UCS-4 data. */
5584 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005585#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005586 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005587#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005588 end-s < Py_UNICODE_SIZE
5589 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005591 startinpos = s - starts;
5592 if (end-s < Py_UNICODE_SIZE) {
5593 endinpos = end-starts;
5594 reason = "truncated input";
5595 }
5596 else {
5597 endinpos = s - starts + Py_UNICODE_SIZE;
5598 reason = "illegal code point (> 0x10FFFF)";
5599 }
5600 outpos = p - PyUnicode_AS_UNICODE(v);
5601 if (unicode_decode_call_errorhandler(
5602 errors, &errorHandler,
5603 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005604 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005605 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005606 goto onError;
5607 }
5608 }
5609 else {
5610 p++;
5611 s += Py_UNICODE_SIZE;
5612 }
5613 }
5614
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005615 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005616 goto onError;
5617 Py_XDECREF(errorHandler);
5618 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 if (PyUnicode_READY(v) == -1) {
5620 Py_DECREF(v);
5621 return NULL;
5622 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005623 return (PyObject *)v;
5624
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005626 Py_XDECREF(v);
5627 Py_XDECREF(errorHandler);
5628 Py_XDECREF(exc);
5629 return NULL;
5630}
5631
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632/* --- Latin-1 Codec ------------------------------------------------------ */
5633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
5635PyUnicode_DecodeLatin1(const char *s,
5636 Py_ssize_t size,
5637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 return PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005644static void
5645make_encode_exception(PyObject **exceptionObject,
5646 const char *encoding,
5647 const Py_UNICODE *unicode, Py_ssize_t size,
5648 Py_ssize_t startpos, Py_ssize_t endpos,
5649 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 *exceptionObject = PyUnicodeEncodeError_Create(
5653 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 }
5655 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5657 goto onError;
5658 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5659 goto onError;
5660 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5661 goto onError;
5662 return;
5663 onError:
5664 Py_DECREF(*exceptionObject);
5665 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
5667}
5668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005670static void
5671raise_encode_exception(PyObject **exceptionObject,
5672 const char *encoding,
5673 const Py_UNICODE *unicode, Py_ssize_t size,
5674 Py_ssize_t startpos, Py_ssize_t endpos,
5675 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676{
5677 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681}
5682
5683/* error handling callback helper:
5684 build arguments, call the callback and check the arguments,
5685 put the result into newpos and return the replacement string, which
5686 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005687static PyObject *
5688unicode_encode_call_errorhandler(const char *errors,
5689 PyObject **errorHandler,
5690 const char *encoding, const char *reason,
5691 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5692 Py_ssize_t startpos, Py_ssize_t endpos,
5693 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005695 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696
5697 PyObject *restuple;
5698 PyObject *resunicode;
5699
5700 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 }
5705
5706 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710
5711 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005716 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 Py_DECREF(restuple);
5718 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005720 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 &resunicode, newpos)) {
5722 Py_DECREF(restuple);
5723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005725 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5726 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5727 Py_DECREF(restuple);
5728 return NULL;
5729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005732 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5734 Py_DECREF(restuple);
5735 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 Py_INCREF(resunicode);
5738 Py_DECREF(restuple);
5739 return resunicode;
5740}
5741
Alexander Belopolsky40018472011-02-26 01:02:56 +00005742static PyObject *
5743unicode_encode_ucs1(const Py_UNICODE *p,
5744 Py_ssize_t size,
5745 const char *errors,
5746 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747{
5748 /* output object */
5749 PyObject *res;
5750 /* pointers to the beginning and end+1 of input */
5751 const Py_UNICODE *startp = p;
5752 const Py_UNICODE *endp = p + size;
5753 /* pointer to the beginning of the unencodable characters */
5754 /* const Py_UNICODE *badp = NULL; */
5755 /* pointer into the output */
5756 char *str;
5757 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005759 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5760 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 PyObject *errorHandler = NULL;
5762 PyObject *exc = NULL;
5763 /* the following variable is used for caching string comparisons
5764 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5765 int known_errorHandler = -1;
5766
5767 /* allocate enough for a simple encoding without
5768 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005769 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005770 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005771 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005774 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 ressize = size;
5776
5777 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* can we encode this? */
5781 if (c<limit) {
5782 /* no overflow check, because we know that the space is enough */
5783 *str++ = (char)c;
5784 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005785 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 else {
5787 Py_ssize_t unicodepos = p-startp;
5788 Py_ssize_t requiredsize;
5789 PyObject *repunicode;
5790 Py_ssize_t repsize;
5791 Py_ssize_t newpos;
5792 Py_ssize_t respos;
5793 Py_UNICODE *uni2;
5794 /* startpos for collecting unencodable chars */
5795 const Py_UNICODE *collstart = p;
5796 const Py_UNICODE *collend = p;
5797 /* find all unecodable characters */
5798 while ((collend < endp) && ((*collend)>=limit))
5799 ++collend;
5800 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5801 if (known_errorHandler==-1) {
5802 if ((errors==NULL) || (!strcmp(errors, "strict")))
5803 known_errorHandler = 1;
5804 else if (!strcmp(errors, "replace"))
5805 known_errorHandler = 2;
5806 else if (!strcmp(errors, "ignore"))
5807 known_errorHandler = 3;
5808 else if (!strcmp(errors, "xmlcharrefreplace"))
5809 known_errorHandler = 4;
5810 else
5811 known_errorHandler = 0;
5812 }
5813 switch (known_errorHandler) {
5814 case 1: /* strict */
5815 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5816 goto onError;
5817 case 2: /* replace */
5818 while (collstart++<collend)
5819 *str++ = '?'; /* fall through */
5820 case 3: /* ignore */
5821 p = collend;
5822 break;
5823 case 4: /* xmlcharrefreplace */
5824 respos = str - PyBytes_AS_STRING(res);
5825 /* determine replacement size (temporarily (mis)uses p) */
5826 for (p = collstart, repsize = 0; p < collend; ++p) {
5827 if (*p<10)
5828 repsize += 2+1+1;
5829 else if (*p<100)
5830 repsize += 2+2+1;
5831 else if (*p<1000)
5832 repsize += 2+3+1;
5833 else if (*p<10000)
5834 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005835#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 else
5837 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005838#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 else if (*p<100000)
5840 repsize += 2+5+1;
5841 else if (*p<1000000)
5842 repsize += 2+6+1;
5843 else
5844 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005845#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 }
5847 requiredsize = respos+repsize+(endp-collend);
5848 if (requiredsize > ressize) {
5849 if (requiredsize<2*ressize)
5850 requiredsize = 2*ressize;
5851 if (_PyBytes_Resize(&res, requiredsize))
5852 goto onError;
5853 str = PyBytes_AS_STRING(res) + respos;
5854 ressize = requiredsize;
5855 }
5856 /* generate replacement (temporarily (mis)uses p) */
5857 for (p = collstart; p < collend; ++p) {
5858 str += sprintf(str, "&#%d;", (int)*p);
5859 }
5860 p = collend;
5861 break;
5862 default:
5863 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5864 encoding, reason, startp, size, &exc,
5865 collstart-startp, collend-startp, &newpos);
5866 if (repunicode == NULL)
5867 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005868 if (PyBytes_Check(repunicode)) {
5869 /* Directly copy bytes result to output. */
5870 repsize = PyBytes_Size(repunicode);
5871 if (repsize > 1) {
5872 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005873 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005874 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5875 Py_DECREF(repunicode);
5876 goto onError;
5877 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005878 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005879 ressize += repsize-1;
5880 }
5881 memcpy(str, PyBytes_AsString(repunicode), repsize);
5882 str += repsize;
5883 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005884 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005885 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 /* need more space? (at least enough for what we
5888 have+the replacement+the rest of the string, so
5889 we won't have to check space for encodable characters) */
5890 respos = str - PyBytes_AS_STRING(res);
5891 repsize = PyUnicode_GET_SIZE(repunicode);
5892 requiredsize = respos+repsize+(endp-collend);
5893 if (requiredsize > ressize) {
5894 if (requiredsize<2*ressize)
5895 requiredsize = 2*ressize;
5896 if (_PyBytes_Resize(&res, requiredsize)) {
5897 Py_DECREF(repunicode);
5898 goto onError;
5899 }
5900 str = PyBytes_AS_STRING(res) + respos;
5901 ressize = requiredsize;
5902 }
5903 /* check if there is anything unencodable in the replacement
5904 and copy it to the output */
5905 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5906 c = *uni2;
5907 if (c >= limit) {
5908 raise_encode_exception(&exc, encoding, startp, size,
5909 unicodepos, unicodepos+1, reason);
5910 Py_DECREF(repunicode);
5911 goto onError;
5912 }
5913 *str = (char)c;
5914 }
5915 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005916 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005918 }
5919 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005920 /* Resize if we allocated to much */
5921 size = str - PyBytes_AS_STRING(res);
5922 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005923 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005924 if (_PyBytes_Resize(&res, size) < 0)
5925 goto onError;
5926 }
5927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 Py_XDECREF(errorHandler);
5929 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005930 return res;
5931
5932 onError:
5933 Py_XDECREF(res);
5934 Py_XDECREF(errorHandler);
5935 Py_XDECREF(exc);
5936 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005937}
5938
Alexander Belopolsky40018472011-02-26 01:02:56 +00005939PyObject *
5940PyUnicode_EncodeLatin1(const Py_UNICODE *p,
5941 Py_ssize_t size,
5942 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
5950 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 PyErr_BadArgument();
5952 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 if (PyUnicode_READY(unicode) == -1)
5955 return NULL;
5956 /* Fast path: if it is a one-byte string, construct
5957 bytes object directly. */
5958 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5959 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5960 PyUnicode_GET_LENGTH(unicode));
5961 /* Non-Latin-1 characters present. Defer to above function to
5962 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005965 errors);
5966}
5967
5968PyObject*
5969PyUnicode_AsLatin1String(PyObject *unicode)
5970{
5971 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972}
5973
5974/* --- 7-bit ASCII Codec -------------------------------------------------- */
5975
Alexander Belopolsky40018472011-02-26 01:02:56 +00005976PyObject *
5977PyUnicode_DecodeASCII(const char *s,
5978 Py_ssize_t size,
5979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 PyUnicodeObject *v;
5983 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005984 Py_ssize_t startinpos;
5985 Py_ssize_t endinpos;
5986 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 PyObject *errorHandler = NULL;
5990 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005991 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00005992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005994 if (size == 1 && *(unsigned char*)s < 128)
5995 return PyUnicode_FromOrdinal(*(unsigned char*)s);
5996
5997 /* Fast path. Assume the input actually *is* ASCII, and allocate
5998 a single-block Unicode object with that assumption. If there is
5999 an error, drop the object and start over. */
6000 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6001 if (v == NULL)
6002 goto onError;
6003 d = PyUnicode_1BYTE_DATA(v);
6004 for (i = 0; i < size; i++) {
6005 unsigned char ch = ((unsigned char*)s)[i];
6006 if (ch < 128)
6007 d[i] = ch;
6008 else
6009 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 if (i == size)
6012 return (PyObject*)v;
6013 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006014
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 v = _PyUnicode_New(size);
6016 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 e = s + size;
6022 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 register unsigned char c = (unsigned char)*s;
6024 if (c < 128) {
6025 *p++ = c;
6026 ++s;
6027 }
6028 else {
6029 startinpos = s-starts;
6030 endinpos = startinpos + 1;
6031 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6032 if (unicode_decode_call_errorhandler(
6033 errors, &errorHandler,
6034 "ascii", "ordinal not in range(128)",
6035 &starts, &e, &startinpos, &endinpos, &exc, &s,
6036 &v, &outpos, &p))
6037 goto onError;
6038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006040 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6042 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 Py_XDECREF(errorHandler);
6044 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006045 if (PyUnicode_READY(v) == -1) {
6046 Py_DECREF(v);
6047 return NULL;
6048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006050
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 Py_XDECREF(errorHandler);
6054 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 return NULL;
6056}
6057
Alexander Belopolsky40018472011-02-26 01:02:56 +00006058PyObject *
6059PyUnicode_EncodeASCII(const Py_UNICODE *p,
6060 Py_ssize_t size,
6061 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Alexander Belopolsky40018472011-02-26 01:02:56 +00006066PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006067_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
6069 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 PyErr_BadArgument();
6071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006073 if (PyUnicode_READY(unicode) == -1)
6074 return NULL;
6075 /* Fast path: if it is an ASCII-only string, construct bytes object
6076 directly. Else defer to above function to raise the exception. */
6077 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6078 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6079 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082 errors);
6083}
6084
6085PyObject *
6086PyUnicode_AsASCIIString(PyObject *unicode)
6087{
6088 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089}
6090
Victor Stinner99b95382011-07-04 14:23:54 +02006091#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006092
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006093/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006094
/* decode_mbcs() and encode_mbcs() take an int length.  When size_t is
   wider than int, the public MBCS functions must feed them the input
   in pieces of at most INT_MAX units; NEED_RETRY enables that loop. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
6098
6099/* XXX This code is limited to "true" double-byte encodings, as
6100 a) it assumes an incomplete character consists of a single byte, and
6101 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006103
/* Return 1 if the byte at s[offset] is a DBCS lead byte that really
   starts a character, 0 otherwise.

   A byte that merely has a lead-byte value may in fact be the trail
   byte of the preceding character, so the previous character (as
   reported by CharPrev) is examined as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The previous character is a complete two-byte character. */
    return (pos - before == 2);
}
6115
6116/*
6117 * Decode MBCS string into unicode object. If 'final' is set, converts
6118 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6119 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006120static int
6121decode_mbcs(PyUnicodeObject **v,
6122 const char *s, /* MBCS string */
6123 int size, /* sizeof MBCS string */
6124 int final,
6125 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006126{
6127 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006128 Py_ssize_t n;
6129 DWORD usize;
6130 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006131
6132 assert(size >= 0);
6133
Victor Stinner554f3f02010-06-16 23:33:54 +00006134 /* check and handle 'errors' arg */
6135 if (errors==NULL || strcmp(errors, "strict")==0)
6136 flags = MB_ERR_INVALID_CHARS;
6137 else if (strcmp(errors, "ignore")==0)
6138 flags = 0;
6139 else {
6140 PyErr_Format(PyExc_ValueError,
6141 "mbcs encoding does not support errors='%s'",
6142 errors);
6143 return -1;
6144 }
6145
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006146 /* Skip trailing lead-byte unless 'final' is set */
6147 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006149
6150 /* First get the size of the result */
6151 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006152 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6153 if (usize==0)
6154 goto mbcs_decode_error;
6155 } else
6156 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006157
6158 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 /* Create unicode object */
6160 *v = _PyUnicode_New(usize);
6161 if (*v == NULL)
6162 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006163 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006164 }
6165 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 /* Extend unicode object */
6167 n = PyUnicode_GET_SIZE(*v);
6168 if (_PyUnicode_Resize(v, n + usize) < 0)
6169 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006170 }
6171
6172 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006173 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006175 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6176 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006178 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006179 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006180
6181mbcs_decode_error:
6182 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6183 we raise a UnicodeDecodeError - else it is a 'generic'
6184 windows error
6185 */
6186 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6187 /* Ideally, we should get reason from FormatMessage - this
6188 is the Windows 2000 English version of the message
6189 */
6190 PyObject *exc = NULL;
6191 const char *reason = "No mapping for the Unicode character exists "
6192 "in the target multi-byte code page.";
6193 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6194 if (exc != NULL) {
6195 PyCodec_StrictErrors(exc);
6196 Py_DECREF(exc);
6197 }
6198 } else {
6199 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6200 }
6201 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006202}
6203
Alexander Belopolsky40018472011-02-26 01:02:56 +00006204PyObject *
6205PyUnicode_DecodeMBCSStateful(const char *s,
6206 Py_ssize_t size,
6207 const char *errors,
6208 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006209{
6210 PyUnicodeObject *v = NULL;
6211 int done;
6212
6213 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006215
6216#ifdef NEED_RETRY
6217 retry:
6218 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006219 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006220 else
6221#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006222 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006223
6224 if (done < 0) {
6225 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006227 }
6228
6229 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231
6232#ifdef NEED_RETRY
6233 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 s += done;
6235 size -= done;
6236 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006237 }
6238#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006239 if (PyUnicode_READY(v) == -1) {
6240 Py_DECREF(v);
6241 return NULL;
6242 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006243 return (PyObject *)v;
6244}
6245
Alexander Belopolsky40018472011-02-26 01:02:56 +00006246PyObject *
6247PyUnicode_DecodeMBCS(const char *s,
6248 Py_ssize_t size,
6249 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006250{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006251 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6252}
6253
6254/*
6255 * Convert unicode into string object (MBCS).
6256 * Returns 0 if succeed, -1 otherwise.
6257 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006258static int
6259encode_mbcs(PyObject **repr,
6260 const Py_UNICODE *p, /* unicode */
6261 int size, /* size of unicode */
6262 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006263{
Victor Stinner554f3f02010-06-16 23:33:54 +00006264 BOOL usedDefaultChar = FALSE;
6265 BOOL *pusedDefaultChar;
6266 int mbcssize;
6267 Py_ssize_t n;
6268 PyObject *exc = NULL;
6269 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006270
6271 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006272
Victor Stinner554f3f02010-06-16 23:33:54 +00006273 /* check and handle 'errors' arg */
6274 if (errors==NULL || strcmp(errors, "strict")==0) {
6275 flags = WC_NO_BEST_FIT_CHARS;
6276 pusedDefaultChar = &usedDefaultChar;
6277 } else if (strcmp(errors, "replace")==0) {
6278 flags = 0;
6279 pusedDefaultChar = NULL;
6280 } else {
6281 PyErr_Format(PyExc_ValueError,
6282 "mbcs encoding does not support errors='%s'",
6283 errors);
6284 return -1;
6285 }
6286
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006287 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006288 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006289 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6290 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 if (mbcssize == 0) {
6292 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6293 return -1;
6294 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006295 /* If we used a default char, then we failed! */
6296 if (pusedDefaultChar && *pusedDefaultChar)
6297 goto mbcs_encode_error;
6298 } else {
6299 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006300 }
6301
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006302 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* Create string object */
6304 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6305 if (*repr == NULL)
6306 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006307 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006308 }
6309 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 /* Extend string object */
6311 n = PyBytes_Size(*repr);
6312 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6313 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006314 }
6315
6316 /* Do the conversion */
6317 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006319 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6320 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6322 return -1;
6323 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 if (pusedDefaultChar && *pusedDefaultChar)
6325 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006326 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006327 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006328
6329mbcs_encode_error:
6330 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6331 Py_XDECREF(exc);
6332 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006333}
6334
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335PyObject *
6336PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6337 Py_ssize_t size,
6338 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006339{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006340 PyObject *repr = NULL;
6341 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006342
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006343#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006345 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006346 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006347 else
6348#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006349 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006350
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006351 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 Py_XDECREF(repr);
6353 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006354 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006355
6356#ifdef NEED_RETRY
6357 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 p += INT_MAX;
6359 size -= INT_MAX;
6360 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 }
6362#endif
6363
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006364 return repr;
6365}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006366
Alexander Belopolsky40018472011-02-26 01:02:56 +00006367PyObject *
6368PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006369{
6370 if (!PyUnicode_Check(unicode)) {
6371 PyErr_BadArgument();
6372 return NULL;
6373 }
6374 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 PyUnicode_GET_SIZE(unicode),
6376 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006377}
6378
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006379#undef NEED_RETRY
6380
Victor Stinner99b95382011-07-04 14:23:54 +02006381#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383/* --- Character Mapping Codec -------------------------------------------- */
6384
Alexander Belopolsky40018472011-02-26 01:02:56 +00006385PyObject *
6386PyUnicode_DecodeCharmap(const char *s,
6387 Py_ssize_t size,
6388 PyObject *mapping,
6389 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392 Py_ssize_t startinpos;
6393 Py_ssize_t endinpos;
6394 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 PyUnicodeObject *v;
6397 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006398 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *errorHandler = NULL;
6400 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006401 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 /* Default to Latin-1 */
6405 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407
6408 v = _PyUnicode_New(size);
6409 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006415 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 mapstring = PyUnicode_AS_UNICODE(mapping);
6417 maplen = PyUnicode_GET_SIZE(mapping);
6418 while (s < e) {
6419 unsigned char ch = *s;
6420 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 if (ch < maplen)
6423 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 if (x == 0xfffe) {
6426 /* undefined mapping */
6427 outpos = p-PyUnicode_AS_UNICODE(v);
6428 startinpos = s-starts;
6429 endinpos = startinpos+1;
6430 if (unicode_decode_call_errorhandler(
6431 errors, &errorHandler,
6432 "charmap", "character maps to <undefined>",
6433 &starts, &e, &startinpos, &endinpos, &exc, &s,
6434 &v, &outpos, &p)) {
6435 goto onError;
6436 }
6437 continue;
6438 }
6439 *p++ = x;
6440 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006441 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006442 }
6443 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 while (s < e) {
6445 unsigned char ch = *s;
6446 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6449 w = PyLong_FromLong((long)ch);
6450 if (w == NULL)
6451 goto onError;
6452 x = PyObject_GetItem(mapping, w);
6453 Py_DECREF(w);
6454 if (x == NULL) {
6455 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6456 /* No mapping found means: mapping is undefined. */
6457 PyErr_Clear();
6458 x = Py_None;
6459 Py_INCREF(x);
6460 } else
6461 goto onError;
6462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* Apply mapping */
6465 if (PyLong_Check(x)) {
6466 long value = PyLong_AS_LONG(x);
6467 if (value < 0 || value > 65535) {
6468 PyErr_SetString(PyExc_TypeError,
6469 "character mapping must be in range(65536)");
6470 Py_DECREF(x);
6471 goto onError;
6472 }
6473 *p++ = (Py_UNICODE)value;
6474 }
6475 else if (x == Py_None) {
6476 /* undefined mapping */
6477 outpos = p-PyUnicode_AS_UNICODE(v);
6478 startinpos = s-starts;
6479 endinpos = startinpos+1;
6480 if (unicode_decode_call_errorhandler(
6481 errors, &errorHandler,
6482 "charmap", "character maps to <undefined>",
6483 &starts, &e, &startinpos, &endinpos, &exc, &s,
6484 &v, &outpos, &p)) {
6485 Py_DECREF(x);
6486 goto onError;
6487 }
6488 Py_DECREF(x);
6489 continue;
6490 }
6491 else if (PyUnicode_Check(x)) {
6492 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006493
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 if (targetsize == 1)
6495 /* 1-1 mapping */
6496 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006497
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 else if (targetsize > 1) {
6499 /* 1-n mapping */
6500 if (targetsize > extrachars) {
6501 /* resize first */
6502 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6503 Py_ssize_t needed = (targetsize - extrachars) + \
6504 (targetsize << 2);
6505 extrachars += needed;
6506 /* XXX overflow detection missing */
6507 if (_PyUnicode_Resize(&v,
6508 PyUnicode_GET_SIZE(v) + needed) < 0) {
6509 Py_DECREF(x);
6510 goto onError;
6511 }
6512 p = PyUnicode_AS_UNICODE(v) + oldpos;
6513 }
6514 Py_UNICODE_COPY(p,
6515 PyUnicode_AS_UNICODE(x),
6516 targetsize);
6517 p += targetsize;
6518 extrachars -= targetsize;
6519 }
6520 /* 1-0 mapping: skip the character */
6521 }
6522 else {
6523 /* wrong return value */
6524 PyErr_SetString(PyExc_TypeError,
6525 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006526 Py_DECREF(x);
6527 goto onError;
6528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 Py_DECREF(x);
6530 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
6533 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6535 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 Py_XDECREF(errorHandler);
6537 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006538 if (PyUnicode_READY(v) == -1) {
6539 Py_DECREF(v);
6540 return NULL;
6541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006543
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 Py_XDECREF(errorHandler);
6546 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 Py_XDECREF(v);
6548 return NULL;
6549}
6550
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006551/* Charmap encoding: the lookup table */
6552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 PyObject_HEAD
6555 unsigned char level1[32];
6556 int count2, count3;
6557 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006558};
6559
6560static PyObject*
6561encoding_map_size(PyObject *obj, PyObject* args)
6562{
6563 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006564 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006566}
6567
6568static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006569 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 PyDoc_STR("Return the size (in bytes) of this object") },
6571 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006572};
6573
6574static void
6575encoding_map_dealloc(PyObject* o)
6576{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006577 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006578}
6579
6580static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006581 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 "EncodingMap", /*tp_name*/
6583 sizeof(struct encoding_map), /*tp_basicsize*/
6584 0, /*tp_itemsize*/
6585 /* methods */
6586 encoding_map_dealloc, /*tp_dealloc*/
6587 0, /*tp_print*/
6588 0, /*tp_getattr*/
6589 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006590 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 0, /*tp_repr*/
6592 0, /*tp_as_number*/
6593 0, /*tp_as_sequence*/
6594 0, /*tp_as_mapping*/
6595 0, /*tp_hash*/
6596 0, /*tp_call*/
6597 0, /*tp_str*/
6598 0, /*tp_getattro*/
6599 0, /*tp_setattro*/
6600 0, /*tp_as_buffer*/
6601 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6602 0, /*tp_doc*/
6603 0, /*tp_traverse*/
6604 0, /*tp_clear*/
6605 0, /*tp_richcompare*/
6606 0, /*tp_weaklistoffset*/
6607 0, /*tp_iter*/
6608 0, /*tp_iternext*/
6609 encoding_map_methods, /*tp_methods*/
6610 0, /*tp_members*/
6611 0, /*tp_getset*/
6612 0, /*tp_base*/
6613 0, /*tp_dict*/
6614 0, /*tp_descr_get*/
6615 0, /*tp_descr_set*/
6616 0, /*tp_dictoffset*/
6617 0, /*tp_init*/
6618 0, /*tp_alloc*/
6619 0, /*tp_new*/
6620 0, /*tp_free*/
6621 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006622};
6623
6624PyObject*
6625PyUnicode_BuildEncodingMap(PyObject* string)
6626{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006627 PyObject *result;
6628 struct encoding_map *mresult;
6629 int i;
6630 int need_dict = 0;
6631 unsigned char level1[32];
6632 unsigned char level2[512];
6633 unsigned char *mlevel1, *mlevel2, *mlevel3;
6634 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006635 int kind;
6636 void *data;
6637 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006639 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006640 PyErr_BadArgument();
6641 return NULL;
6642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643 kind = PyUnicode_KIND(string);
6644 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645 memset(level1, 0xFF, sizeof level1);
6646 memset(level2, 0xFF, sizeof level2);
6647
6648 /* If there isn't a one-to-one mapping of NULL to \0,
6649 or if there are non-BMP characters, we need to use
6650 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006651 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006652 need_dict = 1;
6653 for (i = 1; i < 256; i++) {
6654 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006655 ch = PyUnicode_READ(kind, data, i);
6656 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006657 need_dict = 1;
6658 break;
6659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006661 /* unmapped character */
6662 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 l1 = ch >> 11;
6664 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006665 if (level1[l1] == 0xFF)
6666 level1[l1] = count2++;
6667 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006668 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006669 }
6670
6671 if (count2 >= 0xFF || count3 >= 0xFF)
6672 need_dict = 1;
6673
6674 if (need_dict) {
6675 PyObject *result = PyDict_New();
6676 PyObject *key, *value;
6677 if (!result)
6678 return NULL;
6679 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006680 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006681 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006682 if (!key || !value)
6683 goto failed1;
6684 if (PyDict_SetItem(result, key, value) == -1)
6685 goto failed1;
6686 Py_DECREF(key);
6687 Py_DECREF(value);
6688 }
6689 return result;
6690 failed1:
6691 Py_XDECREF(key);
6692 Py_XDECREF(value);
6693 Py_DECREF(result);
6694 return NULL;
6695 }
6696
6697 /* Create a three-level trie */
6698 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6699 16*count2 + 128*count3 - 1);
6700 if (!result)
6701 return PyErr_NoMemory();
6702 PyObject_Init(result, &EncodingMapType);
6703 mresult = (struct encoding_map*)result;
6704 mresult->count2 = count2;
6705 mresult->count3 = count3;
6706 mlevel1 = mresult->level1;
6707 mlevel2 = mresult->level23;
6708 mlevel3 = mresult->level23 + 16*count2;
6709 memcpy(mlevel1, level1, 32);
6710 memset(mlevel2, 0xFF, 16*count2);
6711 memset(mlevel3, 0, 128*count3);
6712 count3 = 0;
6713 for (i = 1; i < 256; i++) {
6714 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006715 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006716 /* unmapped character */
6717 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006718 o1 = PyUnicode_READ(kind, data, i)>>11;
6719 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006720 i2 = 16*mlevel1[o1] + o2;
6721 if (mlevel2[i2] == 0xFF)
6722 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006724 i3 = 128*mlevel2[i2] + o3;
6725 mlevel3[i3] = i;
6726 }
6727 return result;
6728}
6729
6730static int
6731encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6732{
6733 struct encoding_map *map = (struct encoding_map*)mapping;
6734 int l1 = c>>11;
6735 int l2 = (c>>7) & 0xF;
6736 int l3 = c & 0x7F;
6737 int i;
6738
6739#ifdef Py_UNICODE_WIDE
6740 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006742 }
6743#endif
6744 if (c == 0)
6745 return 0;
6746 /* level 1*/
6747 i = map->level1[l1];
6748 if (i == 0xFF) {
6749 return -1;
6750 }
6751 /* level 2*/
6752 i = map->level23[16*i+l2];
6753 if (i == 0xFF) {
6754 return -1;
6755 }
6756 /* level 3 */
6757 i = map->level23[16*map->count2 + 128*i + l3];
6758 if (i == 0) {
6759 return -1;
6760 }
6761 return i;
6762}
6763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764/* Lookup the character ch in the mapping. If the character
6765 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006766 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006767static PyObject *
6768charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
Christian Heimes217cfd12007-12-02 14:31:20 +00006770 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771 PyObject *x;
6772
6773 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006775 x = PyObject_GetItem(mapping, w);
6776 Py_DECREF(w);
6777 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6779 /* No mapping found means: mapping is undefined. */
6780 PyErr_Clear();
6781 x = Py_None;
6782 Py_INCREF(x);
6783 return x;
6784 } else
6785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006787 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006789 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 long value = PyLong_AS_LONG(x);
6791 if (value < 0 || value > 255) {
6792 PyErr_SetString(PyExc_TypeError,
6793 "character mapping must be in range(256)");
6794 Py_DECREF(x);
6795 return NULL;
6796 }
6797 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006799 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 /* wrong return value */
6803 PyErr_Format(PyExc_TypeError,
6804 "character mapping must return integer, bytes or None, not %.400s",
6805 x->ob_type->tp_name);
6806 Py_DECREF(x);
6807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 }
6809}
6810
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006811static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006812charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006813{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006814 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6815 /* exponentially overallocate to minimize reallocations */
6816 if (requiredsize < 2*outsize)
6817 requiredsize = 2*outsize;
6818 if (_PyBytes_Resize(outobj, requiredsize))
6819 return -1;
6820 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006821}
6822
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006826/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006827 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 space is available. Return a new reference to the object that
6829 was put in the output buffer, or Py_None, if the mapping was undefined
6830 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006831 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006832static charmapencode_result
6833charmapencode_output(Py_UNICODE c, PyObject *mapping,
6834 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006836 PyObject *rep;
6837 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006838 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839
Christian Heimes90aa7642007-12-19 02:45:37 +00006840 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006841 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006843 if (res == -1)
6844 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 if (outsize<requiredsize)
6846 if (charmapencode_resize(outobj, outpos, requiredsize))
6847 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006848 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 outstart[(*outpos)++] = (char)res;
6850 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006851 }
6852
6853 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006854 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006856 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 Py_DECREF(rep);
6858 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006859 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 if (PyLong_Check(rep)) {
6861 Py_ssize_t requiredsize = *outpos+1;
6862 if (outsize<requiredsize)
6863 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6864 Py_DECREF(rep);
6865 return enc_EXCEPTION;
6866 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006867 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006869 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 else {
6871 const char *repchars = PyBytes_AS_STRING(rep);
6872 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6873 Py_ssize_t requiredsize = *outpos+repsize;
6874 if (outsize<requiredsize)
6875 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6876 Py_DECREF(rep);
6877 return enc_EXCEPTION;
6878 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006879 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 memcpy(outstart + *outpos, repchars, repsize);
6881 *outpos += repsize;
6882 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006884 Py_DECREF(rep);
6885 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886}
6887
6888/* handle an error in PyUnicode_EncodeCharmap
6889 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890static int
6891charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006892 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006894 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006895 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896{
6897 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006898 Py_ssize_t repsize;
6899 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 Py_UNICODE *uni2;
6901 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006902 Py_ssize_t collstartpos = *inpos;
6903 Py_ssize_t collendpos = *inpos+1;
6904 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 char *encoding = "charmap";
6906 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006907 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 /* find all unencodable characters */
6910 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006911 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006912 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 int res = encoding_map_lookup(p[collendpos], mapping);
6914 if (res != -1)
6915 break;
6916 ++collendpos;
6917 continue;
6918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006919
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 rep = charmapencode_lookup(p[collendpos], mapping);
6921 if (rep==NULL)
6922 return -1;
6923 else if (rep!=Py_None) {
6924 Py_DECREF(rep);
6925 break;
6926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 }
6930 /* cache callback name lookup
6931 * (if not done yet, i.e. it's the first error) */
6932 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 if ((errors==NULL) || (!strcmp(errors, "strict")))
6934 *known_errorHandler = 1;
6935 else if (!strcmp(errors, "replace"))
6936 *known_errorHandler = 2;
6937 else if (!strcmp(errors, "ignore"))
6938 *known_errorHandler = 3;
6939 else if (!strcmp(errors, "xmlcharrefreplace"))
6940 *known_errorHandler = 4;
6941 else
6942 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 }
6944 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006945 case 1: /* strict */
6946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6947 return -1;
6948 case 2: /* replace */
6949 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 x = charmapencode_output('?', mapping, res, respos);
6951 if (x==enc_EXCEPTION) {
6952 return -1;
6953 }
6954 else if (x==enc_FAILED) {
6955 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6956 return -1;
6957 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006958 }
6959 /* fall through */
6960 case 3: /* ignore */
6961 *inpos = collendpos;
6962 break;
6963 case 4: /* xmlcharrefreplace */
6964 /* generate replacement (temporarily (mis)uses p) */
6965 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 char buffer[2+29+1+1];
6967 char *cp;
6968 sprintf(buffer, "&#%d;", (int)p[collpos]);
6969 for (cp = buffer; *cp; ++cp) {
6970 x = charmapencode_output(*cp, mapping, res, respos);
6971 if (x==enc_EXCEPTION)
6972 return -1;
6973 else if (x==enc_FAILED) {
6974 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6975 return -1;
6976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006977 }
6978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006979 *inpos = collendpos;
6980 break;
6981 default:
6982 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 encoding, reason, p, size, exceptionObject,
6984 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006987 if (PyBytes_Check(repunicode)) {
6988 /* Directly copy bytes result to output. */
6989 Py_ssize_t outsize = PyBytes_Size(*res);
6990 Py_ssize_t requiredsize;
6991 repsize = PyBytes_Size(repunicode);
6992 requiredsize = *respos + repsize;
6993 if (requiredsize > outsize)
6994 /* Make room for all additional bytes. */
6995 if (charmapencode_resize(res, respos, requiredsize)) {
6996 Py_DECREF(repunicode);
6997 return -1;
6998 }
6999 memcpy(PyBytes_AsString(*res) + *respos,
7000 PyBytes_AsString(repunicode), repsize);
7001 *respos += repsize;
7002 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007003 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007004 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007005 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007006 /* generate replacement */
7007 repsize = PyUnicode_GET_SIZE(repunicode);
7008 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 x = charmapencode_output(*uni2, mapping, res, respos);
7010 if (x==enc_EXCEPTION) {
7011 return -1;
7012 }
7013 else if (x==enc_FAILED) {
7014 Py_DECREF(repunicode);
7015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7016 return -1;
7017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007018 }
7019 *inpos = newpos;
7020 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 }
7022 return 0;
7023}
7024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
7026PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 PyObject *mapping,
7029 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007031 /* output object */
7032 PyObject *res = NULL;
7033 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007034 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007035 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007036 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007037 PyObject *errorHandler = NULL;
7038 PyObject *exc = NULL;
7039 /* the following variable is used for caching string comparisons
7040 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7041 * 3=ignore, 4=xmlcharrefreplace */
7042 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043
7044 /* Default to Latin-1 */
7045 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 /* allocate enough for a simple encoding without
7049 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007050 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 if (res == NULL)
7052 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007053 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 /* try to encode it */
7058 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7059 if (x==enc_EXCEPTION) /* error */
7060 goto onError;
7061 if (x==enc_FAILED) { /* unencodable character */
7062 if (charmap_encoding_error(p, size, &inpos, mapping,
7063 &exc,
7064 &known_errorHandler, &errorHandler, errors,
7065 &res, &respos)) {
7066 goto onError;
7067 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 else
7070 /* done with this character => adjust input position */
7071 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007075 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007076 if (_PyBytes_Resize(&res, respos) < 0)
7077 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 Py_XDECREF(exc);
7080 Py_XDECREF(errorHandler);
7081 return res;
7082
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 Py_XDECREF(res);
7085 Py_XDECREF(exc);
7086 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 return NULL;
7088}
7089
Alexander Belopolsky40018472011-02-26 01:02:56 +00007090PyObject *
7091PyUnicode_AsCharmapString(PyObject *unicode,
7092 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093{
7094 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 PyErr_BadArgument();
7096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 }
7098 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 PyUnicode_GET_SIZE(unicode),
7100 mapping,
7101 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102}
7103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007105static void
7106make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007107 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108 Py_ssize_t startpos, Py_ssize_t endpos,
7109 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112 *exceptionObject = _PyUnicodeTranslateError_Create(
7113 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 }
7115 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7117 goto onError;
7118 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7119 goto onError;
7120 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7121 goto onError;
7122 return;
7123 onError:
7124 Py_DECREF(*exceptionObject);
7125 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 }
7127}
7128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007129/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007130static void
7131raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007132 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007133 Py_ssize_t startpos, Py_ssize_t endpos,
7134 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135{
7136 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007137 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140}
7141
7142/* error handling callback helper:
7143 build arguments, call the callback and check the arguments,
7144 put the result into newpos and return the replacement string, which
7145 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146static PyObject *
7147unicode_translate_call_errorhandler(const char *errors,
7148 PyObject **errorHandler,
7149 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007150 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151 Py_ssize_t startpos, Py_ssize_t endpos,
7152 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007153{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007154 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007155
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007156 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007157 PyObject *restuple;
7158 PyObject *resunicode;
7159
7160 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164 }
7165
7166 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007167 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007168 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007170
7171 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007176 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 Py_DECREF(restuple);
7178 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179 }
7180 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 &resunicode, &i_newpos)) {
7182 Py_DECREF(restuple);
7183 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007185 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007186 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007187 else
7188 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007189 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7191 Py_DECREF(restuple);
7192 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007193 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 Py_INCREF(resunicode);
7195 Py_DECREF(restuple);
7196 return resunicode;
7197}
7198
7199/* Lookup the character ch in the mapping and put the result in result,
7200 which must be decrefed by the caller.
7201 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007202static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007203charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204{
Christian Heimes217cfd12007-12-02 14:31:20 +00007205 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007206 PyObject *x;
7207
7208 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 x = PyObject_GetItem(mapping, w);
7211 Py_DECREF(w);
7212 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7214 /* No mapping found means: use 1:1 mapping. */
7215 PyErr_Clear();
7216 *result = NULL;
7217 return 0;
7218 } else
7219 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220 }
7221 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 *result = x;
7223 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007224 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007225 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 long value = PyLong_AS_LONG(x);
7227 long max = PyUnicode_GetMax();
7228 if (value < 0 || value > max) {
7229 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007230 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 Py_DECREF(x);
7232 return -1;
7233 }
7234 *result = x;
7235 return 0;
7236 }
7237 else if (PyUnicode_Check(x)) {
7238 *result = x;
7239 return 0;
7240 }
7241 else {
7242 /* wrong return value */
7243 PyErr_SetString(PyExc_TypeError,
7244 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007245 Py_DECREF(x);
7246 return -1;
7247 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007248}
7249/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 if not reallocate and adjust various state variables.
7251 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007256 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007257 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 /* exponentially overallocate to minimize reallocations */
7259 if (requiredsize < 2 * oldsize)
7260 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7262 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 }
7266 return 0;
7267}
7268/* lookup the character, put the result in the output string and adjust
7269 various state variables. Return a new reference to the object that
7270 was put in the output buffer in *result, or Py_None, if the mapping was
7271 undefined (in which case no character was written).
7272 The called must decref result.
7273 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007274static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007275charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7276 PyObject *mapping, Py_UCS4 **output,
7277 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007278 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7281 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 }
7287 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007289 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292 }
7293 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007294 Py_ssize_t repsize;
7295 if (PyUnicode_READY(*res) == -1)
7296 return -1;
7297 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 if (repsize==1) {
7299 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007300 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 }
7302 else if (repsize!=0) {
7303 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007304 Py_ssize_t requiredsize = *opos +
7305 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007307 Py_ssize_t i;
7308 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310 for(i = 0; i < repsize; i++)
7311 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007313 }
7314 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316 return 0;
7317}
7318
Alexander Belopolsky40018472011-02-26 01:02:56 +00007319PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320_PyUnicode_TranslateCharmap(PyObject *input,
7321 PyObject *mapping,
7322 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007324 /* input object */
7325 char *idata;
7326 Py_ssize_t size, i;
7327 int kind;
7328 /* output buffer */
7329 Py_UCS4 *output = NULL;
7330 Py_ssize_t osize;
7331 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007334 char *reason = "character maps to <undefined>";
7335 PyObject *errorHandler = NULL;
7336 PyObject *exc = NULL;
7337 /* the following variable is used for caching string comparisons
7338 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7339 * 3=ignore, 4=xmlcharrefreplace */
7340 int known_errorHandler = -1;
7341
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 PyErr_BadArgument();
7344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007347 if (PyUnicode_READY(input) == -1)
7348 return NULL;
7349 idata = (char*)PyUnicode_DATA(input);
7350 kind = PyUnicode_KIND(input);
7351 size = PyUnicode_GET_LENGTH(input);
7352 i = 0;
7353
7354 if (size == 0) {
7355 Py_INCREF(input);
7356 return input;
7357 }
7358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007359 /* allocate enough for a simple 1:1 translation without
7360 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 osize = size;
7362 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7363 opos = 0;
7364 if (output == NULL) {
7365 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 /* try to encode it */
7371 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007372 if (charmaptranslate_output(input, i, mapping,
7373 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 Py_XDECREF(x);
7375 goto onError;
7376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007377 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 else { /* untranslatable character */
7381 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7382 Py_ssize_t repsize;
7383 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007386 Py_ssize_t collstart = i;
7387 Py_ssize_t collend = i+1;
7388 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 while (collend < size) {
7392 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 goto onError;
7394 Py_XDECREF(x);
7395 if (x!=Py_None)
7396 break;
7397 ++collend;
7398 }
7399 /* cache callback name lookup
7400 * (if not done yet, i.e. it's the first error) */
7401 if (known_errorHandler==-1) {
7402 if ((errors==NULL) || (!strcmp(errors, "strict")))
7403 known_errorHandler = 1;
7404 else if (!strcmp(errors, "replace"))
7405 known_errorHandler = 2;
7406 else if (!strcmp(errors, "ignore"))
7407 known_errorHandler = 3;
7408 else if (!strcmp(errors, "xmlcharrefreplace"))
7409 known_errorHandler = 4;
7410 else
7411 known_errorHandler = 0;
7412 }
7413 switch (known_errorHandler) {
7414 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007415 raise_translate_exception(&exc, input, collstart,
7416 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007417 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 case 2: /* replace */
7419 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 for (coll = collstart; coll<collend; coll++)
7421 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 /* fall through */
7423 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007424 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 break;
7426 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007427 /* generate replacement (temporarily (mis)uses i) */
7428 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 char buffer[2+29+1+1];
7430 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007431 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7432 if (charmaptranslate_makespace(&output, &osize,
7433 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 goto onError;
7435 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 break;
7440 default:
7441 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007442 reason, input, &exc,
7443 collstart, collend, &newpos);
7444 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 goto onError;
7446 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 repsize = PyUnicode_GET_LENGTH(repunicode);
7448 if (charmaptranslate_makespace(&output, &osize,
7449 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 Py_DECREF(repunicode);
7451 goto onError;
7452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007453 for (uni2 = 0; repsize-->0; ++uni2)
7454 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7455 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007458 }
7459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7461 if (!res)
7462 goto onError;
7463 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 Py_XDECREF(exc);
7465 Py_XDECREF(errorHandler);
7466 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 Py_XDECREF(exc);
7471 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 return NULL;
7473}
7474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475/* Deprecated. Use PyUnicode_Translate instead. */
7476PyObject *
7477PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7478 Py_ssize_t size,
7479 PyObject *mapping,
7480 const char *errors)
7481{
7482 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7483 if (!unicode)
7484 return NULL;
7485 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7486}
7487
Alexander Belopolsky40018472011-02-26 01:02:56 +00007488PyObject *
7489PyUnicode_Translate(PyObject *str,
7490 PyObject *mapping,
7491 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492{
7493 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007494
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 str = PyUnicode_FromObject(str);
7496 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 Py_DECREF(str);
7500 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007501
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 Py_XDECREF(str);
7504 return NULL;
7505}
Tim Petersced69f82003-09-16 20:30:58 +00007506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507static Py_UCS4
7508fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7509{
7510 /* No need to call PyUnicode_READY(self) because this function is only
7511 called as a callback from fixup() which does it already. */
7512 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7513 const int kind = PyUnicode_KIND(self);
7514 void *data = PyUnicode_DATA(self);
7515 Py_UCS4 maxchar = 0, ch, fixed;
7516 Py_ssize_t i;
7517
7518 for (i = 0; i < len; ++i) {
7519 ch = PyUnicode_READ(kind, data, i);
7520 fixed = 0;
7521 if (ch > 127) {
7522 if (Py_UNICODE_ISSPACE(ch))
7523 fixed = ' ';
7524 else {
7525 const int decimal = Py_UNICODE_TODECIMAL(ch);
7526 if (decimal >= 0)
7527 fixed = '0' + decimal;
7528 }
7529 if (fixed != 0) {
7530 if (fixed > maxchar)
7531 maxchar = fixed;
7532 PyUnicode_WRITE(kind, data, i, fixed);
7533 }
7534 else if (ch > maxchar)
7535 maxchar = ch;
7536 }
7537 else if (ch > maxchar)
7538 maxchar = ch;
7539 }
7540
7541 return maxchar;
7542}
7543
7544PyObject *
7545_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7546{
7547 if (!PyUnicode_Check(unicode)) {
7548 PyErr_BadInternalCall();
7549 return NULL;
7550 }
7551 if (PyUnicode_READY(unicode) == -1)
7552 return NULL;
7553 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7554 /* If the string is already ASCII, just return the same string */
7555 Py_INCREF(unicode);
7556 return unicode;
7557 }
7558 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7559}
7560
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007561PyObject *
7562PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7563 Py_ssize_t length)
7564{
7565 PyObject *result;
7566 Py_UNICODE *p; /* write pointer into result */
7567 Py_ssize_t i;
7568 /* Copy to a new string */
7569 result = (PyObject *)_PyUnicode_New(length);
7570 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7571 if (result == NULL)
7572 return result;
7573 p = PyUnicode_AS_UNICODE(result);
7574 /* Iterate over code points */
7575 for (i = 0; i < length; i++) {
7576 Py_UNICODE ch =s[i];
7577 if (ch > 127) {
7578 int decimal = Py_UNICODE_TODECIMAL(ch);
7579 if (decimal >= 0)
7580 p[i] = '0' + decimal;
7581 }
7582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007583 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7584 Py_DECREF(result);
7585 return NULL;
7586 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007587 return result;
7588}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007589/* --- Decimal Encoder ---------------------------------------------------- */
7590
Alexander Belopolsky40018472011-02-26 01:02:56 +00007591int
7592PyUnicode_EncodeDecimal(Py_UNICODE *s,
7593 Py_ssize_t length,
7594 char *output,
7595 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007596{
7597 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 PyObject *errorHandler = NULL;
7599 PyObject *exc = NULL;
7600 const char *encoding = "decimal";
7601 const char *reason = "invalid decimal Unicode string";
7602 /* the following variable is used for caching string comparisons
7603 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7604 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007605
7606 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 PyErr_BadArgument();
7608 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007609 }
7610
7611 p = s;
7612 end = s + length;
7613 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 register Py_UNICODE ch = *p;
7615 int decimal;
7616 PyObject *repunicode;
7617 Py_ssize_t repsize;
7618 Py_ssize_t newpos;
7619 Py_UNICODE *uni2;
7620 Py_UNICODE *collstart;
7621 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007622
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 ++p;
7626 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007627 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 decimal = Py_UNICODE_TODECIMAL(ch);
7629 if (decimal >= 0) {
7630 *output++ = '0' + decimal;
7631 ++p;
7632 continue;
7633 }
7634 if (0 < ch && ch < 256) {
7635 *output++ = (char)ch;
7636 ++p;
7637 continue;
7638 }
7639 /* All other characters are considered unencodable */
7640 collstart = p;
7641 collend = p+1;
7642 while (collend < end) {
7643 if ((0 < *collend && *collend < 256) ||
7644 !Py_UNICODE_ISSPACE(*collend) ||
7645 Py_UNICODE_TODECIMAL(*collend))
7646 break;
7647 }
7648 /* cache callback name lookup
7649 * (if not done yet, i.e. it's the first error) */
7650 if (known_errorHandler==-1) {
7651 if ((errors==NULL) || (!strcmp(errors, "strict")))
7652 known_errorHandler = 1;
7653 else if (!strcmp(errors, "replace"))
7654 known_errorHandler = 2;
7655 else if (!strcmp(errors, "ignore"))
7656 known_errorHandler = 3;
7657 else if (!strcmp(errors, "xmlcharrefreplace"))
7658 known_errorHandler = 4;
7659 else
7660 known_errorHandler = 0;
7661 }
7662 switch (known_errorHandler) {
7663 case 1: /* strict */
7664 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7665 goto onError;
7666 case 2: /* replace */
7667 for (p = collstart; p < collend; ++p)
7668 *output++ = '?';
7669 /* fall through */
7670 case 3: /* ignore */
7671 p = collend;
7672 break;
7673 case 4: /* xmlcharrefreplace */
7674 /* generate replacement (temporarily (mis)uses p) */
7675 for (p = collstart; p < collend; ++p)
7676 output += sprintf(output, "&#%d;", (int)*p);
7677 p = collend;
7678 break;
7679 default:
7680 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7681 encoding, reason, s, length, &exc,
7682 collstart-s, collend-s, &newpos);
7683 if (repunicode == NULL)
7684 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007685 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007686 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007687 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7688 Py_DECREF(repunicode);
7689 goto onError;
7690 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 /* generate replacement */
7692 repsize = PyUnicode_GET_SIZE(repunicode);
7693 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7694 Py_UNICODE ch = *uni2;
7695 if (Py_UNICODE_ISSPACE(ch))
7696 *output++ = ' ';
7697 else {
7698 decimal = Py_UNICODE_TODECIMAL(ch);
7699 if (decimal >= 0)
7700 *output++ = '0' + decimal;
7701 else if (0 < ch && ch < 256)
7702 *output++ = (char)ch;
7703 else {
7704 Py_DECREF(repunicode);
7705 raise_encode_exception(&exc, encoding,
7706 s, length, collstart-s, collend-s, reason);
7707 goto onError;
7708 }
7709 }
7710 }
7711 p = s + newpos;
7712 Py_DECREF(repunicode);
7713 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007714 }
7715 /* 0-terminate the output string */
7716 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007717 Py_XDECREF(exc);
7718 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007719 return 0;
7720
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 Py_XDECREF(exc);
7723 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007724 return -1;
7725}
7726
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727/* --- Helpers ------------------------------------------------------------ */
7728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007729#include "stringlib/ucs1lib.h"
7730#include "stringlib/fastsearch.h"
7731#include "stringlib/partition.h"
7732#include "stringlib/split.h"
7733#include "stringlib/count.h"
7734#include "stringlib/find.h"
7735#include "stringlib/localeutil.h"
7736#include "stringlib/undef.h"
7737
7738#include "stringlib/ucs2lib.h"
7739#include "stringlib/fastsearch.h"
7740#include "stringlib/partition.h"
7741#include "stringlib/split.h"
7742#include "stringlib/count.h"
7743#include "stringlib/find.h"
7744#include "stringlib/localeutil.h"
7745#include "stringlib/undef.h"
7746
7747#include "stringlib/ucs4lib.h"
7748#include "stringlib/fastsearch.h"
7749#include "stringlib/partition.h"
7750#include "stringlib/split.h"
7751#include "stringlib/count.h"
7752#include "stringlib/find.h"
7753#include "stringlib/localeutil.h"
7754#include "stringlib/undef.h"
7755
7756static Py_ssize_t
7757any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7758 const Py_UCS1*, Py_ssize_t,
7759 Py_ssize_t, Py_ssize_t),
7760 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7761 const Py_UCS2*, Py_ssize_t,
7762 Py_ssize_t, Py_ssize_t),
7763 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7764 const Py_UCS4*, Py_ssize_t,
7765 Py_ssize_t, Py_ssize_t),
7766 PyObject* s1, PyObject* s2,
7767 Py_ssize_t start,
7768 Py_ssize_t end)
7769{
7770 int kind1, kind2, kind;
7771 void *buf1, *buf2;
7772 Py_ssize_t len1, len2, result;
7773
7774 kind1 = PyUnicode_KIND(s1);
7775 kind2 = PyUnicode_KIND(s2);
7776 kind = kind1 > kind2 ? kind1 : kind2;
7777 buf1 = PyUnicode_DATA(s1);
7778 buf2 = PyUnicode_DATA(s2);
7779 if (kind1 != kind)
7780 buf1 = _PyUnicode_AsKind(s1, kind);
7781 if (!buf1)
7782 return -2;
7783 if (kind2 != kind)
7784 buf2 = _PyUnicode_AsKind(s2, kind);
7785 if (!buf2) {
7786 if (kind1 != kind) PyMem_Free(buf1);
7787 return -2;
7788 }
7789 len1 = PyUnicode_GET_LENGTH(s1);
7790 len2 = PyUnicode_GET_LENGTH(s2);
7791
7792 switch(kind) {
7793 case PyUnicode_1BYTE_KIND:
7794 result = ucs1(buf1, len1, buf2, len2, start, end);
7795 break;
7796 case PyUnicode_2BYTE_KIND:
7797 result = ucs2(buf1, len1, buf2, len2, start, end);
7798 break;
7799 case PyUnicode_4BYTE_KIND:
7800 result = ucs4(buf1, len1, buf2, len2, start, end);
7801 break;
7802 default:
7803 assert(0); result = -2;
7804 }
7805
7806 if (kind1 != kind)
7807 PyMem_Free(buf1);
7808 if (kind2 != kind)
7809 PyMem_Free(buf2);
7810
7811 return result;
7812}
7813
7814Py_ssize_t
7815_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7816 Py_ssize_t n_buffer,
7817 void *digits, Py_ssize_t n_digits,
7818 Py_ssize_t min_width,
7819 const char *grouping,
7820 const char *thousands_sep)
7821{
7822 switch(kind) {
7823 case PyUnicode_1BYTE_KIND:
7824 return _PyUnicode_ucs1_InsertThousandsGrouping(
7825 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7826 min_width, grouping, thousands_sep);
7827 case PyUnicode_2BYTE_KIND:
7828 return _PyUnicode_ucs2_InsertThousandsGrouping(
7829 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7830 min_width, grouping, thousands_sep);
7831 case PyUnicode_4BYTE_KIND:
7832 return _PyUnicode_ucs4_InsertThousandsGrouping(
7833 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7834 min_width, grouping, thousands_sep);
7835 }
7836 assert(0);
7837 return -1;
7838}
7839
7840
Eric Smith8c663262007-08-25 02:26:07 +00007841#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007842#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007843
Thomas Wouters477c8d52006-05-27 19:21:47 +00007844#include "stringlib/count.h"
7845#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007846
/* helper macro to fixup start/end slice values:
   - end is clipped to len; a negative end counts from the end of the
     sequence and is clamped at 0,
   - a negative start counts from the end of the sequence and is clamped
     at 0 (start is not clipped against len).
   start and end must be modifiable lvalues; len is evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else; invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007861
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862Py_ssize_t
7863PyUnicode_Count(PyObject *str,
7864 PyObject *substr,
7865 Py_ssize_t start,
7866 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007869 PyUnicodeObject* str_obj;
7870 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 int kind1, kind2, kind;
7872 void *buf1 = NULL, *buf2 = NULL;
7873 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007874
Thomas Wouters477c8d52006-05-27 19:21:47 +00007875 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007878 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 Py_DECREF(str_obj);
7881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 }
Tim Petersced69f82003-09-16 20:30:58 +00007883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 kind1 = PyUnicode_KIND(str_obj);
7885 kind2 = PyUnicode_KIND(sub_obj);
7886 kind = kind1 > kind2 ? kind1 : kind2;
7887 buf1 = PyUnicode_DATA(str_obj);
7888 if (kind1 != kind)
7889 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7890 if (!buf1)
7891 goto onError;
7892 buf2 = PyUnicode_DATA(sub_obj);
7893 if (kind2 != kind)
7894 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7895 if (!buf2)
7896 goto onError;
7897 len1 = PyUnicode_GET_LENGTH(str_obj);
7898 len2 = PyUnicode_GET_LENGTH(sub_obj);
7899
7900 ADJUST_INDICES(start, end, len1);
7901 switch(kind) {
7902 case PyUnicode_1BYTE_KIND:
7903 result = ucs1lib_count(
7904 ((Py_UCS1*)buf1) + start, end - start,
7905 buf2, len2, PY_SSIZE_T_MAX
7906 );
7907 break;
7908 case PyUnicode_2BYTE_KIND:
7909 result = ucs2lib_count(
7910 ((Py_UCS2*)buf1) + start, end - start,
7911 buf2, len2, PY_SSIZE_T_MAX
7912 );
7913 break;
7914 case PyUnicode_4BYTE_KIND:
7915 result = ucs4lib_count(
7916 ((Py_UCS4*)buf1) + start, end - start,
7917 buf2, len2, PY_SSIZE_T_MAX
7918 );
7919 break;
7920 default:
7921 assert(0); result = 0;
7922 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007923
7924 Py_DECREF(sub_obj);
7925 Py_DECREF(str_obj);
7926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 if (kind1 != kind)
7928 PyMem_Free(buf1);
7929 if (kind2 != kind)
7930 PyMem_Free(buf2);
7931
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007933 onError:
7934 Py_DECREF(sub_obj);
7935 Py_DECREF(str_obj);
7936 if (kind1 != kind && buf1)
7937 PyMem_Free(buf1);
7938 if (kind2 != kind && buf2)
7939 PyMem_Free(buf2);
7940 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941}
7942
Alexander Belopolsky40018472011-02-26 01:02:56 +00007943Py_ssize_t
7944PyUnicode_Find(PyObject *str,
7945 PyObject *sub,
7946 Py_ssize_t start,
7947 Py_ssize_t end,
7948 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007955 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 Py_DECREF(str);
7958 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
Tim Petersced69f82003-09-16 20:30:58 +00007960
Thomas Wouters477c8d52006-05-27 19:21:47 +00007961 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 result = any_find_slice(
7963 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7964 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007965 );
7966 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 result = any_find_slice(
7968 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7969 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007970 );
7971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007973 Py_DECREF(sub);
7974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 return result;
7976}
7977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007978Py_ssize_t
7979PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
7980 Py_ssize_t start, Py_ssize_t end,
7981 int direction)
7982{
7983 char *result;
7984 int kind;
7985 if (PyUnicode_READY(str) == -1)
7986 return -2;
7987 if (end > PyUnicode_GET_LENGTH(str))
7988 end = PyUnicode_GET_LENGTH(str);
7989 kind = PyUnicode_KIND(str);
7990 result = findchar(PyUnicode_1BYTE_DATA(str)
7991 + PyUnicode_KIND_SIZE(kind, start),
7992 kind,
7993 end-start, ch, direction);
7994 if (!result)
7995 return -1;
7996 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
7997}
7998
Alexander Belopolsky40018472011-02-26 01:02:56 +00007999static int
8000tailmatch(PyUnicodeObject *self,
8001 PyUnicodeObject *substring,
8002 Py_ssize_t start,
8003 Py_ssize_t end,
8004 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 int kind_self;
8007 int kind_sub;
8008 void *data_self;
8009 void *data_sub;
8010 Py_ssize_t offset;
8011 Py_ssize_t i;
8012 Py_ssize_t end_sub;
8013
8014 if (PyUnicode_READY(self) == -1 ||
8015 PyUnicode_READY(substring) == -1)
8016 return 0;
8017
8018 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 return 1;
8020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008021 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8022 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 kind_self = PyUnicode_KIND(self);
8027 data_self = PyUnicode_DATA(self);
8028 kind_sub = PyUnicode_KIND(substring);
8029 data_sub = PyUnicode_DATA(substring);
8030 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8031
8032 if (direction > 0)
8033 offset = end;
8034 else
8035 offset = start;
8036
8037 if (PyUnicode_READ(kind_self, data_self, offset) ==
8038 PyUnicode_READ(kind_sub, data_sub, 0) &&
8039 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8040 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8041 /* If both are of the same kind, memcmp is sufficient */
8042 if (kind_self == kind_sub) {
8043 return ! memcmp((char *)data_self +
8044 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8045 data_sub,
8046 PyUnicode_GET_LENGTH(substring) *
8047 PyUnicode_CHARACTER_SIZE(substring));
8048 }
8049 /* otherwise we have to compare each character by first accesing it */
8050 else {
8051 /* We do not need to compare 0 and len(substring)-1 because
8052 the if statement above ensured already that they are equal
8053 when we end up here. */
8054 // TODO: honor direction and do a forward or backwards search
8055 for (i = 1; i < end_sub; ++i) {
8056 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8057 PyUnicode_READ(kind_sub, data_sub, i))
8058 return 0;
8059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 }
8063
8064 return 0;
8065}
8066
Alexander Belopolsky40018472011-02-26 01:02:56 +00008067Py_ssize_t
8068PyUnicode_Tailmatch(PyObject *str,
8069 PyObject *substr,
8070 Py_ssize_t start,
8071 Py_ssize_t end,
8072 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008074 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008075
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 str = PyUnicode_FromObject(str);
8077 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 substr = PyUnicode_FromObject(substr);
8080 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 Py_DECREF(str);
8082 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
Tim Petersced69f82003-09-16 20:30:58 +00008084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 (PyUnicodeObject *)substr,
8087 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 Py_DECREF(str);
8089 Py_DECREF(substr);
8090 return result;
8091}
8092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093/* Apply fixfct filter to the Unicode object self and return a
8094 reference to the modified object */
8095
Alexander Belopolsky40018472011-02-26 01:02:56 +00008096static PyObject *
8097fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 PyObject *u;
8101 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008103 if (PyUnicode_READY(self) == -1)
8104 return NULL;
8105 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8106 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8107 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8112 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 /* fix functions return the new maximum character in a string,
8115 if the kind of the resulting unicode object does not change,
8116 everything is fine. Otherwise we need to change the string kind
8117 and re-run the fix function. */
8118 maxchar_new = fixfct((PyUnicodeObject*)u);
8119 if (maxchar_new == 0)
8120 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8121 else if (maxchar_new <= 127)
8122 maxchar_new = 127;
8123 else if (maxchar_new <= 255)
8124 maxchar_new = 255;
8125 else if (maxchar_new <= 65535)
8126 maxchar_new = 65535;
8127 else
8128 maxchar_new = 1114111; /* 0x10ffff */
8129
8130 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 /* fixfct should return TRUE if it modified the buffer. If
8132 FALSE, return a reference to the original buffer instead
8133 (to save space, not time) */
8134 Py_INCREF(self);
8135 Py_DECREF(u);
8136 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 else if (maxchar_new == maxchar_old) {
8139 return u;
8140 }
8141 else {
8142 /* In case the maximum character changed, we need to
8143 convert the string to the new category. */
8144 PyObject *v = PyUnicode_New(
8145 PyUnicode_GET_LENGTH(self), maxchar_new);
8146 if (v == NULL) {
8147 Py_DECREF(u);
8148 return NULL;
8149 }
8150 if (maxchar_new > maxchar_old) {
8151 /* If the maxchar increased so that the kind changed, not all
8152 characters are representable anymore and we need to fix the
8153 string again. This only happens in very few cases. */
8154 PyUnicode_CopyCharacters(v, 0, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
8155 maxchar_old = fixfct((PyUnicodeObject*)v);
8156 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8157 }
8158 else
8159 PyUnicode_CopyCharacters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
8160
8161 Py_DECREF(u);
8162 return v;
8163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164}
8165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008167fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 /* No need to call PyUnicode_READY(self) because this function is only
8170 called as a callback from fixup() which does it already. */
8171 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8172 const int kind = PyUnicode_KIND(self);
8173 void *data = PyUnicode_DATA(self);
8174 int touched = 0;
8175 Py_UCS4 maxchar = 0;
8176 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 for (i = 0; i < len; ++i) {
8179 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8180 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8181 if (up != ch) {
8182 if (up > maxchar)
8183 maxchar = up;
8184 PyUnicode_WRITE(kind, data, i, up);
8185 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 else if (ch > maxchar)
8188 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 }
8190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 if (touched)
8192 return maxchar;
8193 else
8194 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195}
8196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008198fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008200 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8201 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8202 const int kind = PyUnicode_KIND(self);
8203 void *data = PyUnicode_DATA(self);
8204 int touched = 0;
8205 Py_UCS4 maxchar = 0;
8206 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 for(i = 0; i < len; ++i) {
8209 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8210 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8211 if (lo != ch) {
8212 if (lo > maxchar)
8213 maxchar = lo;
8214 PyUnicode_WRITE(kind, data, i, lo);
8215 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 else if (ch > maxchar)
8218 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219 }
8220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 if (touched)
8222 return maxchar;
8223 else
8224 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225}
8226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008228fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8231 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8232 const int kind = PyUnicode_KIND(self);
8233 void *data = PyUnicode_DATA(self);
8234 int touched = 0;
8235 Py_UCS4 maxchar = 0;
8236 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 for(i = 0; i < len; ++i) {
8239 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8240 Py_UCS4 nu = 0;
8241
8242 if (Py_UNICODE_ISUPPER(ch))
8243 nu = Py_UNICODE_TOLOWER(ch);
8244 else if (Py_UNICODE_ISLOWER(ch))
8245 nu = Py_UNICODE_TOUPPER(ch);
8246
8247 if (nu != 0) {
8248 if (nu > maxchar)
8249 maxchar = nu;
8250 PyUnicode_WRITE(kind, data, i, nu);
8251 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 else if (ch > maxchar)
8254 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 }
8256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 if (touched)
8258 return maxchar;
8259 else
8260 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261}
8262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008264fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8267 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8268 const int kind = PyUnicode_KIND(self);
8269 void *data = PyUnicode_DATA(self);
8270 int touched = 0;
8271 Py_UCS4 maxchar = 0;
8272 Py_ssize_t i = 0;
8273 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008274
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008275 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277
8278 ch = PyUnicode_READ(kind, data, i);
8279 if (!Py_UNICODE_ISUPPER(ch)) {
8280 maxchar = Py_UNICODE_TOUPPER(ch);
8281 PyUnicode_WRITE(kind, data, i, maxchar);
8282 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 ++i;
8285 for(; i < len; ++i) {
8286 ch = PyUnicode_READ(kind, data, i);
8287 if (!Py_UNICODE_ISLOWER(ch)) {
8288 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8289 if (lo > maxchar)
8290 maxchar = lo;
8291 PyUnicode_WRITE(kind, data, i, lo);
8292 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 else if (ch > maxchar)
8295 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297
8298 if (touched)
8299 return maxchar;
8300 else
8301 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302}
8303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008305fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8308 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8309 const int kind = PyUnicode_KIND(self);
8310 void *data = PyUnicode_DATA(self);
8311 Py_UCS4 maxchar = 0;
8312 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 int previous_is_cased;
8314
8315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 if (len == 1) {
8317 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8318 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8319 if (ti != ch) {
8320 PyUnicode_WRITE(kind, data, i, ti);
8321 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 }
8323 else
8324 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 for(; i < len; ++i) {
8328 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8329 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008330
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 nu = Py_UNICODE_TOTITLE(ch);
8335
8336 if (nu > maxchar)
8337 maxchar = nu;
8338 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008339
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 if (Py_UNICODE_ISLOWER(ch) ||
8341 Py_UNICODE_ISUPPER(ch) ||
8342 Py_UNICODE_ISTITLE(ch))
8343 previous_is_cased = 1;
8344 else
8345 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348}
8349
Tim Peters8ce9f162004-08-27 01:49:32 +00008350PyObject *
8351PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008354 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008356 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008357 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8358 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008359 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 Py_ssize_t sz, i, res_offset;
8361 Py_UCS4 maxchar = 0;
8362 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363
Tim Peters05eba1f2004-08-27 21:32:02 +00008364 fseq = PySequence_Fast(seq, "");
8365 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008366 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008367 }
8368
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008369 /* NOTE: the following code can't call back into Python code,
8370 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008371 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008372
Tim Peters05eba1f2004-08-27 21:32:02 +00008373 seqlen = PySequence_Fast_GET_SIZE(fseq);
8374 /* If empty sequence, return u"". */
8375 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008377 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008378 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008379 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008380 /* If singleton sequence with an exact Unicode, return that. */
8381 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 item = items[0];
8383 if (PyUnicode_CheckExact(item)) {
8384 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 goto Done;
8387 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008388 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008389 else {
8390 /* Set up sep and seplen */
8391 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 /* fall back to a blank space separator */
8393 sep = PyUnicode_FromOrdinal(' ');
8394 if (!sep || PyUnicode_READY(sep) == -1)
8395 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008396 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008397 else {
8398 if (!PyUnicode_Check(separator)) {
8399 PyErr_Format(PyExc_TypeError,
8400 "separator: expected str instance,"
8401 " %.80s found",
8402 Py_TYPE(separator)->tp_name);
8403 goto onError;
8404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 if (PyUnicode_READY(separator) == -1)
8406 goto onError;
8407 sep = separator;
8408 seplen = PyUnicode_GET_LENGTH(separator);
8409 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8410 /* inc refcount to keep this code path symetric with the
8411 above case of a blank separator */
8412 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008413 }
8414 }
8415
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008416 /* There are at least two things to join, or else we have a subclass
8417 * of str in the sequence.
8418 * Do a pre-pass to figure out the total amount of space we'll
8419 * need (sz), and see whether all argument are strings.
8420 */
8421 sz = 0;
8422 for (i = 0; i < seqlen; i++) {
8423 const Py_ssize_t old_sz = sz;
8424 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 if (!PyUnicode_Check(item)) {
8426 PyErr_Format(PyExc_TypeError,
8427 "sequence item %zd: expected str instance,"
8428 " %.80s found",
8429 i, Py_TYPE(item)->tp_name);
8430 goto onError;
8431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 if (PyUnicode_READY(item) == -1)
8433 goto onError;
8434 sz += PyUnicode_GET_LENGTH(item);
8435 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8436 if (item_maxchar > maxchar)
8437 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008438 if (i != 0)
8439 sz += seplen;
8440 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8441 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008443 goto onError;
8444 }
8445 }
Tim Petersced69f82003-09-16 20:30:58 +00008446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008448 if (res == NULL)
8449 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008450
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008451 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008453 Py_ssize_t itemlen;
8454 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 /* Copy item, and maybe the separator. */
8457 if (i) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 PyUnicode_CopyCharacters(res, res_offset,
8459 sep, 0, seplen);
8460 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 PyUnicode_CopyCharacters(res, res_offset,
8463 item, 0, itemlen);
8464 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008467
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008469 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 Py_XDECREF(sep);
8471 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008474 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008476 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 return NULL;
8478}
8479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480#define FILL(kind, data, value, start, length) \
8481 do { \
8482 Py_ssize_t i_ = 0; \
8483 assert(kind != PyUnicode_WCHAR_KIND); \
8484 switch ((kind)) { \
8485 case PyUnicode_1BYTE_KIND: { \
8486 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8487 memset(to_, (unsigned char)value, length); \
8488 break; \
8489 } \
8490 case PyUnicode_2BYTE_KIND: { \
8491 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8492 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8493 break; \
8494 } \
8495 default: { \
8496 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8497 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8498 break; \
8499 } \
8500 } \
8501 } while (0)
8502
Alexander Belopolsky40018472011-02-26 01:02:56 +00008503static PyUnicodeObject *
8504pad(PyUnicodeObject *self,
8505 Py_ssize_t left,
8506 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 PyObject *u;
8510 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511
8512 if (left < 0)
8513 left = 0;
8514 if (right < 0)
8515 right = 0;
8516
Tim Peters7a29bd52001-09-12 03:03:31 +00008517 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 Py_INCREF(self);
8519 return self;
8520 }
8521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8523 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008524 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8525 return NULL;
8526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8528 if (fill > maxchar)
8529 maxchar = fill;
8530 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 if (u) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 int kind = PyUnicode_KIND(u);
8533 void *data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 if (left)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 FILL(kind, data, fill, 0, left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (right)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
8538 PyUnicode_CopyCharacters(u, left, (PyObject*)self, 0, _PyUnicode_LENGTH(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 }
8540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Alexander Belopolsky40018472011-02-26 01:02:56 +00008545PyObject *
8546PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
8550 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 switch(PyUnicode_KIND(string)) {
8555 case PyUnicode_1BYTE_KIND:
8556 list = ucs1lib_splitlines(
8557 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8558 PyUnicode_GET_LENGTH(string), keepends);
8559 break;
8560 case PyUnicode_2BYTE_KIND:
8561 list = ucs2lib_splitlines(
8562 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8563 PyUnicode_GET_LENGTH(string), keepends);
8564 break;
8565 case PyUnicode_4BYTE_KIND:
8566 list = ucs4lib_splitlines(
8567 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8568 PyUnicode_GET_LENGTH(string), keepends);
8569 break;
8570 default:
8571 assert(0);
8572 list = 0;
8573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 Py_DECREF(string);
8575 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576}
8577
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578static PyObject *
8579split(PyUnicodeObject *self,
8580 PyUnicodeObject *substring,
8581 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 int kind1, kind2, kind;
8584 void *buf1, *buf2;
8585 Py_ssize_t len1, len2;
8586 PyObject* out;
8587
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008589 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 if (PyUnicode_READY(self) == -1)
8592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 if (substring == NULL)
8595 switch(PyUnicode_KIND(self)) {
8596 case PyUnicode_1BYTE_KIND:
8597 return ucs1lib_split_whitespace(
8598 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8599 PyUnicode_GET_LENGTH(self), maxcount
8600 );
8601 case PyUnicode_2BYTE_KIND:
8602 return ucs2lib_split_whitespace(
8603 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8604 PyUnicode_GET_LENGTH(self), maxcount
8605 );
8606 case PyUnicode_4BYTE_KIND:
8607 return ucs4lib_split_whitespace(
8608 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8609 PyUnicode_GET_LENGTH(self), maxcount
8610 );
8611 default:
8612 assert(0);
8613 return NULL;
8614 }
8615
8616 if (PyUnicode_READY(substring) == -1)
8617 return NULL;
8618
8619 kind1 = PyUnicode_KIND(self);
8620 kind2 = PyUnicode_KIND(substring);
8621 kind = kind1 > kind2 ? kind1 : kind2;
8622 buf1 = PyUnicode_DATA(self);
8623 buf2 = PyUnicode_DATA(substring);
8624 if (kind1 != kind)
8625 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8626 if (!buf1)
8627 return NULL;
8628 if (kind2 != kind)
8629 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8630 if (!buf2) {
8631 if (kind1 != kind) PyMem_Free(buf1);
8632 return NULL;
8633 }
8634 len1 = PyUnicode_GET_LENGTH(self);
8635 len2 = PyUnicode_GET_LENGTH(substring);
8636
8637 switch(kind) {
8638 case PyUnicode_1BYTE_KIND:
8639 out = ucs1lib_split(
8640 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8641 break;
8642 case PyUnicode_2BYTE_KIND:
8643 out = ucs2lib_split(
8644 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8645 break;
8646 case PyUnicode_4BYTE_KIND:
8647 out = ucs4lib_split(
8648 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8649 break;
8650 default:
8651 out = NULL;
8652 }
8653 if (kind1 != kind)
8654 PyMem_Free(buf1);
8655 if (kind2 != kind)
8656 PyMem_Free(buf2);
8657 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658}
8659
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660static PyObject *
8661rsplit(PyUnicodeObject *self,
8662 PyUnicodeObject *substring,
8663 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 int kind1, kind2, kind;
8666 void *buf1, *buf2;
8667 Py_ssize_t len1, len2;
8668 PyObject* out;
8669
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008670 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008671 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 if (PyUnicode_READY(self) == -1)
8674 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 if (substring == NULL)
8677 switch(PyUnicode_KIND(self)) {
8678 case PyUnicode_1BYTE_KIND:
8679 return ucs1lib_rsplit_whitespace(
8680 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8681 PyUnicode_GET_LENGTH(self), maxcount
8682 );
8683 case PyUnicode_2BYTE_KIND:
8684 return ucs2lib_rsplit_whitespace(
8685 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8686 PyUnicode_GET_LENGTH(self), maxcount
8687 );
8688 case PyUnicode_4BYTE_KIND:
8689 return ucs4lib_rsplit_whitespace(
8690 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8691 PyUnicode_GET_LENGTH(self), maxcount
8692 );
8693 default:
8694 assert(0);
8695 return NULL;
8696 }
8697
8698 if (PyUnicode_READY(substring) == -1)
8699 return NULL;
8700
8701 kind1 = PyUnicode_KIND(self);
8702 kind2 = PyUnicode_KIND(substring);
8703 kind = kind1 > kind2 ? kind1 : kind2;
8704 buf1 = PyUnicode_DATA(self);
8705 buf2 = PyUnicode_DATA(substring);
8706 if (kind1 != kind)
8707 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8708 if (!buf1)
8709 return NULL;
8710 if (kind2 != kind)
8711 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8712 if (!buf2) {
8713 if (kind1 != kind) PyMem_Free(buf1);
8714 return NULL;
8715 }
8716 len1 = PyUnicode_GET_LENGTH(self);
8717 len2 = PyUnicode_GET_LENGTH(substring);
8718
8719 switch(kind) {
8720 case PyUnicode_1BYTE_KIND:
8721 out = ucs1lib_rsplit(
8722 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8723 break;
8724 case PyUnicode_2BYTE_KIND:
8725 out = ucs2lib_rsplit(
8726 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8727 break;
8728 case PyUnicode_4BYTE_KIND:
8729 out = ucs4lib_rsplit(
8730 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8731 break;
8732 default:
8733 out = NULL;
8734 }
8735 if (kind1 != kind)
8736 PyMem_Free(buf1);
8737 if (kind2 != kind)
8738 PyMem_Free(buf2);
8739 return out;
8740}
8741
8742static Py_ssize_t
8743anylib_find(int kind, void *buf1, Py_ssize_t len1,
8744 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8745{
8746 switch(kind) {
8747 case PyUnicode_1BYTE_KIND:
8748 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8749 case PyUnicode_2BYTE_KIND:
8750 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8751 case PyUnicode_4BYTE_KIND:
8752 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8753 }
8754 assert(0);
8755 return -1;
8756}
8757
8758static Py_ssize_t
8759anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8760 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8761{
8762 switch(kind) {
8763 case PyUnicode_1BYTE_KIND:
8764 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8765 case PyUnicode_2BYTE_KIND:
8766 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8767 case PyUnicode_4BYTE_KIND:
8768 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8769 }
8770 assert(0);
8771 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008772}
8773
Alexander Belopolsky40018472011-02-26 01:02:56 +00008774static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775replace(PyObject *self, PyObject *str1,
8776 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 PyObject *u;
8779 char *sbuf = PyUnicode_DATA(self);
8780 char *buf1 = PyUnicode_DATA(str1);
8781 char *buf2 = PyUnicode_DATA(str2);
8782 int srelease = 0, release1 = 0, release2 = 0;
8783 int skind = PyUnicode_KIND(self);
8784 int kind1 = PyUnicode_KIND(str1);
8785 int kind2 = PyUnicode_KIND(str2);
8786 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8787 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8788 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789
8790 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008793 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 if (skind < kind1)
8796 /* substring too wide to be present */
8797 goto nothing;
8798
8799 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008800 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008801 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008803 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008805 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_UCS4 u1, u2, maxchar;
8807 int mayshrink, rkind;
8808 u1 = PyUnicode_READ_CHAR(str1, 0);
8809 if (!findchar(sbuf, PyUnicode_KIND(self),
8810 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008811 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 u2 = PyUnicode_READ_CHAR(str2, 0);
8813 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8814 /* Replacing u1 with u2 may cause a maxchar reduction in the
8815 result string. */
8816 mayshrink = maxchar > 127;
8817 if (u2 > maxchar) {
8818 maxchar = u2;
8819 mayshrink = 0;
8820 }
8821 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008822 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 goto error;
8824 PyUnicode_CopyCharacters(u, 0,
8825 (PyObject*)self, 0, slen);
8826 rkind = PyUnicode_KIND(u);
8827 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8828 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008829 if (--maxcount < 0)
8830 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 if (mayshrink) {
8834 PyObject *tmp = u;
8835 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8836 PyUnicode_GET_LENGTH(tmp));
8837 Py_DECREF(tmp);
8838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 int rkind = skind;
8841 char *res;
8842 if (kind1 < rkind) {
8843 /* widen substring */
8844 buf1 = _PyUnicode_AsKind(str1, rkind);
8845 if (!buf1) goto error;
8846 release1 = 1;
8847 }
8848 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008849 if (i < 0)
8850 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (rkind > kind2) {
8852 /* widen replacement */
8853 buf2 = _PyUnicode_AsKind(str2, rkind);
8854 if (!buf2) goto error;
8855 release2 = 1;
8856 }
8857 else if (rkind < kind2) {
8858 /* widen self and buf1 */
8859 rkind = kind2;
8860 if (release1) PyMem_Free(buf1);
8861 sbuf = _PyUnicode_AsKind(self, rkind);
8862 if (!sbuf) goto error;
8863 srelease = 1;
8864 buf1 = _PyUnicode_AsKind(str1, rkind);
8865 if (!buf1) goto error;
8866 release1 = 1;
8867 }
8868 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8869 if (!res) {
8870 PyErr_NoMemory();
8871 goto error;
8872 }
8873 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008874 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8876 buf2,
8877 PyUnicode_KIND_SIZE(rkind, len2));
8878 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008879
8880 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8882 slen-i,
8883 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008884 if (i == -1)
8885 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8887 buf2,
8888 PyUnicode_KIND_SIZE(rkind, len2));
8889 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891
8892 u = PyUnicode_FromKindAndData(rkind, res, slen);
8893 PyMem_Free(res);
8894 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 Py_ssize_t n, i, j, ires;
8899 Py_ssize_t product, new_size;
8900 int rkind = skind;
8901 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 if (kind1 < rkind) {
8904 buf1 = _PyUnicode_AsKind(str1, rkind);
8905 if (!buf1) goto error;
8906 release1 = 1;
8907 }
8908 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008909 if (n == 0)
8910 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 if (kind2 < rkind) {
8912 buf2 = _PyUnicode_AsKind(str2, rkind);
8913 if (!buf2) goto error;
8914 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 else if (kind2 > rkind) {
8917 rkind = kind2;
8918 sbuf = _PyUnicode_AsKind(self, rkind);
8919 if (!sbuf) goto error;
8920 srelease = 1;
8921 if (release1) PyMem_Free(buf1);
8922 buf1 = _PyUnicode_AsKind(str1, rkind);
8923 if (!buf1) goto error;
8924 release1 = 1;
8925 }
8926 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8927 PyUnicode_GET_LENGTH(str1))); */
8928 product = n * (len2-len1);
8929 if ((product / (len2-len1)) != n) {
8930 PyErr_SetString(PyExc_OverflowError,
8931 "replace string is too long");
8932 goto error;
8933 }
8934 new_size = slen + product;
8935 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8936 PyErr_SetString(PyExc_OverflowError,
8937 "replace string is too long");
8938 goto error;
8939 }
8940 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8941 if (!res)
8942 goto error;
8943 ires = i = 0;
8944 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945 while (n-- > 0) {
8946 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 j = anylib_find(rkind,
8948 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8949 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008950 if (j == -1)
8951 break;
8952 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008953 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8955 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8956 PyUnicode_KIND_SIZE(rkind, j-i));
8957 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008958 }
8959 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 if (len2 > 0) {
8961 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8962 buf2,
8963 PyUnicode_KIND_SIZE(rkind, len2));
8964 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008969 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8971 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8972 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00008973 } else {
8974 /* interleave */
8975 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8977 buf2,
8978 PyUnicode_KIND_SIZE(rkind, len2));
8979 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008980 if (--n <= 0)
8981 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8983 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8984 PyUnicode_KIND_SIZE(rkind, 1));
8985 ires++;
8986 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
8989 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8990 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00008991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (srelease)
8995 PyMem_FREE(sbuf);
8996 if (release1)
8997 PyMem_FREE(buf1);
8998 if (release2)
8999 PyMem_FREE(buf2);
9000 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009003 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 if (srelease)
9005 PyMem_FREE(sbuf);
9006 if (release1)
9007 PyMem_FREE(buf1);
9008 if (release2)
9009 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009010 if (PyUnicode_CheckExact(self)) {
9011 Py_INCREF(self);
9012 return (PyObject *) self;
9013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9015 PyUnicode_DATA(self),
9016 PyUnicode_GET_LENGTH(self));
9017 error:
9018 if (srelease && sbuf)
9019 PyMem_FREE(sbuf);
9020 if (release1 && buf1)
9021 PyMem_FREE(buf1);
9022 if (release2 && buf2)
9023 PyMem_FREE(buf2);
9024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
9027/* --- Unicode Object Methods --------------------------------------------- */
9028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009029PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031\n\
9032Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009033characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034
9035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009036unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 return fixup(self, fixtitle);
9039}
9040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009041PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043\n\
9044Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009045have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046
9047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009048unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 return fixup(self, fixcapitalize);
9051}
9052
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word and join the
   words again with single blanks.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9090
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009091/* Argument converter. Coerces to a single unicode character */
9092
9093static int
9094convert_uc(PyObject *obj, void *addr)
9095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009097 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009098
Benjamin Peterson14339b62009-01-31 16:36:08 +00009099 uniobj = PyUnicode_FromObject(obj);
9100 if (uniobj == NULL) {
9101 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009103 return 0;
9104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009106 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009108 Py_DECREF(uniobj);
9109 return 0;
9110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 if (PyUnicode_READY(uniobj)) {
9112 Py_DECREF(uniobj);
9113 return 0;
9114 }
9115 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009116 Py_DECREF(uniobj);
9117 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009118}
9119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009120PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009123Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009124done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125
9126static PyObject *
9127unicode_center(PyUnicodeObject *self, PyObject *args)
9128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009129 Py_ssize_t marg, left;
9130 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 Py_UCS4 fillchar = ' ';
9132
9133 if (PyUnicode_READY(self) == -1)
9134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Thomas Woutersde017742006-02-16 19:34:37 +00009136 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 return NULL;
9138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 Py_INCREF(self);
9141 return (PyObject*) self;
9142 }
9143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 left = marg / 2 + (marg & width & 1);
9146
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009147 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148}
9149
Marc-André Lemburge5034372000-08-08 08:04:29 +00009150#if 0
9151
9152/* This code should go into some future Unicode collation support
9153 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009154 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009155
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009156/* speedy UTF-16 code point order comparison */
9157/* gleaned from: */
9158/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9159
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009160static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009161{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009162 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009163 0, 0, 0, 0, 0, 0, 0, 0,
9164 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009165 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009166};
9167
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168static int
9169unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9170{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009171 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009172
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 Py_UNICODE *s1 = str1->str;
9174 Py_UNICODE *s2 = str2->str;
9175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 len1 = str1->_base._base.length;
9177 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009178
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009180 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009181
9182 c1 = *s1++;
9183 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009184
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 if (c1 > (1<<11) * 26)
9186 c1 += utf16Fixup[c1>>11];
9187 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009188 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009189 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009190
9191 if (c1 != c2)
9192 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009193
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009194 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 }
9196
9197 return (len1 < len2) ? -1 : (len1 != len2);
9198}
9199
Marc-André Lemburge5034372000-08-08 08:04:29 +00009200#else
9201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202/* This function assumes that str1 and str2 are readied by the caller. */
9203
Marc-André Lemburge5034372000-08-08 08:04:29 +00009204static int
9205unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 int kind1, kind2;
9208 void *data1, *data2;
9209 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 kind1 = PyUnicode_KIND(str1);
9212 kind2 = PyUnicode_KIND(str2);
9213 data1 = PyUnicode_DATA(str1);
9214 data2 = PyUnicode_DATA(str2);
9215 len1 = PyUnicode_GET_LENGTH(str1);
9216 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 for (i = 0; i < len1 && i < len2; ++i) {
9219 Py_UCS4 c1, c2;
9220 c1 = PyUnicode_READ(kind1, data1, i);
9221 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009222
9223 if (c1 != c2)
9224 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009225 }
9226
9227 return (len1 < len2) ? -1 : (len1 != len2);
9228}
9229
9230#endif
9231
Alexander Belopolsky40018472011-02-26 01:02:56 +00009232int
9233PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9236 if (PyUnicode_READY(left) == -1 ||
9237 PyUnicode_READY(right) == -1)
9238 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009239 return unicode_compare((PyUnicodeObject *)left,
9240 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009242 PyErr_Format(PyExc_TypeError,
9243 "Can't compare %.100s and %.100s",
9244 left->ob_type->tp_name,
9245 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 return -1;
9247}
9248
Martin v. Löwis5b222132007-06-10 09:51:05 +00009249int
9250PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 Py_ssize_t i;
9253 int kind;
9254 void *data;
9255 Py_UCS4 chr;
9256
Martin v. Löwis5b222132007-06-10 09:51:05 +00009257 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 if (PyUnicode_READY(uni) == -1)
9259 return -1;
9260 kind = PyUnicode_KIND(uni);
9261 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009262 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9264 if (chr != str[i])
9265 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009266 /* This check keeps Python strings that end in '\0' from comparing equal
9267 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009270 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009272 return 0;
9273}
9274
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009275
Benjamin Peterson29060642009-01-31 22:14:21 +00009276#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009277 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009278
Alexander Belopolsky40018472011-02-26 01:02:56 +00009279PyObject *
9280PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009281{
9282 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009283
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009284 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9285 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 if (PyUnicode_READY(left) == -1 ||
9287 PyUnicode_READY(right) == -1)
9288 return NULL;
9289 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9290 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009291 if (op == Py_EQ) {
9292 Py_INCREF(Py_False);
9293 return Py_False;
9294 }
9295 if (op == Py_NE) {
9296 Py_INCREF(Py_True);
9297 return Py_True;
9298 }
9299 }
9300 if (left == right)
9301 result = 0;
9302 else
9303 result = unicode_compare((PyUnicodeObject *)left,
9304 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009305
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009306 /* Convert the return value to a Boolean */
9307 switch (op) {
9308 case Py_EQ:
9309 v = TEST_COND(result == 0);
9310 break;
9311 case Py_NE:
9312 v = TEST_COND(result != 0);
9313 break;
9314 case Py_LE:
9315 v = TEST_COND(result <= 0);
9316 break;
9317 case Py_GE:
9318 v = TEST_COND(result >= 0);
9319 break;
9320 case Py_LT:
9321 v = TEST_COND(result == -1);
9322 break;
9323 case Py_GT:
9324 v = TEST_COND(result == 1);
9325 break;
9326 default:
9327 PyErr_BadArgument();
9328 return NULL;
9329 }
9330 Py_INCREF(v);
9331 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333
Brian Curtindfc80e32011-08-10 20:28:54 -05009334 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009335}
9336
Alexander Belopolsky40018472011-02-26 01:02:56 +00009337int
9338PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009339{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009340 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 int kind1, kind2, kind;
9342 void *buf1, *buf2;
9343 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009344 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009345
9346 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009347 sub = PyUnicode_FromObject(element);
9348 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 PyErr_Format(PyExc_TypeError,
9350 "'in <string>' requires string as left operand, not %s",
9351 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 if (PyUnicode_READY(sub) == -1)
9355 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009356
Thomas Wouters477c8d52006-05-27 19:21:47 +00009357 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009359 Py_DECREF(sub);
9360 return -1;
9361 }
9362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 kind1 = PyUnicode_KIND(str);
9364 kind2 = PyUnicode_KIND(sub);
9365 kind = kind1 > kind2 ? kind1 : kind2;
9366 buf1 = PyUnicode_DATA(str);
9367 buf2 = PyUnicode_DATA(sub);
9368 if (kind1 != kind)
9369 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9370 if (!buf1) {
9371 Py_DECREF(sub);
9372 return -1;
9373 }
9374 if (kind2 != kind)
9375 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9376 if (!buf2) {
9377 Py_DECREF(sub);
9378 if (kind1 != kind) PyMem_Free(buf1);
9379 return -1;
9380 }
9381 len1 = PyUnicode_GET_LENGTH(str);
9382 len2 = PyUnicode_GET_LENGTH(sub);
9383
9384 switch(kind) {
9385 case PyUnicode_1BYTE_KIND:
9386 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9387 break;
9388 case PyUnicode_2BYTE_KIND:
9389 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9390 break;
9391 case PyUnicode_4BYTE_KIND:
9392 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9393 break;
9394 default:
9395 result = -1;
9396 assert(0);
9397 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009398
9399 Py_DECREF(str);
9400 Py_DECREF(sub);
9401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (kind1 != kind)
9403 PyMem_Free(buf1);
9404 if (kind2 != kind)
9405 PyMem_Free(buf2);
9406
Guido van Rossum403d68b2000-03-13 15:55:09 +00009407 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009408}
9409
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410/* Concat to string or Unicode object giving a new Unicode object. */
9411
Alexander Belopolsky40018472011-02-26 01:02:56 +00009412PyObject *
9413PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 PyObject *u = NULL, *v = NULL, *w;
9416 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417
9418 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425
9426 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 }
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9437 goto onError;
9438
9439 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
9440 if (PyUnicode_MAX_CHAR_VALUE(v) > maxchar)
9441 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
9442
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 w = PyUnicode_New(
9445 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9446 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
9450 PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), v, 0,
9451 PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 Py_DECREF(u);
9453 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 Py_XDECREF(u);
9458 Py_XDECREF(v);
9459 return NULL;
9460}
9461
Walter Dörwald1ab83302007-05-18 17:15:44 +00009462void
9463PyUnicode_Append(PyObject **pleft, PyObject *right)
9464{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009465 PyObject *new;
9466 if (*pleft == NULL)
9467 return;
9468 if (right == NULL || !PyUnicode_Check(*pleft)) {
9469 Py_DECREF(*pleft);
9470 *pleft = NULL;
9471 return;
9472 }
9473 new = PyUnicode_Concat(*pleft, right);
9474 Py_DECREF(*pleft);
9475 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009476}
9477
9478void
9479PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9480{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009481 PyUnicode_Append(pleft, right);
9482 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009483}
9484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009485PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009488Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009489string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009490interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491
9492static PyObject *
9493unicode_count(PyUnicodeObject *self, PyObject *args)
9494{
9495 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009496 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009497 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 int kind1, kind2, kind;
9500 void *buf1, *buf2;
9501 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502
Jesus Ceaac451502011-04-20 17:09:23 +02009503 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9504 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 kind1 = PyUnicode_KIND(self);
9508 kind2 = PyUnicode_KIND(substring);
9509 kind = kind1 > kind2 ? kind1 : kind2;
9510 buf1 = PyUnicode_DATA(self);
9511 buf2 = PyUnicode_DATA(substring);
9512 if (kind1 != kind)
9513 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9514 if (!buf1) {
9515 Py_DECREF(substring);
9516 return NULL;
9517 }
9518 if (kind2 != kind)
9519 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9520 if (!buf2) {
9521 Py_DECREF(substring);
9522 if (kind1 != kind) PyMem_Free(buf1);
9523 return NULL;
9524 }
9525 len1 = PyUnicode_GET_LENGTH(self);
9526 len2 = PyUnicode_GET_LENGTH(substring);
9527
9528 ADJUST_INDICES(start, end, len1);
9529 switch(kind) {
9530 case PyUnicode_1BYTE_KIND:
9531 iresult = ucs1lib_count(
9532 ((Py_UCS1*)buf1) + start, end - start,
9533 buf2, len2, PY_SSIZE_T_MAX
9534 );
9535 break;
9536 case PyUnicode_2BYTE_KIND:
9537 iresult = ucs2lib_count(
9538 ((Py_UCS2*)buf1) + start, end - start,
9539 buf2, len2, PY_SSIZE_T_MAX
9540 );
9541 break;
9542 case PyUnicode_4BYTE_KIND:
9543 iresult = ucs4lib_count(
9544 ((Py_UCS4*)buf1) + start, end - start,
9545 buf2, len2, PY_SSIZE_T_MAX
9546 );
9547 break;
9548 default:
9549 assert(0); iresult = 0;
9550 }
9551
9552 result = PyLong_FromSsize_t(iresult);
9553
9554 if (kind1 != kind)
9555 PyMem_Free(buf1);
9556 if (kind2 != kind)
9557 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558
9559 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 return result;
9562}
9563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009564PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009565 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009567Encode S using the codec registered for encoding. Default encoding\n\
9568is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009569handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009570a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9571'xmlcharrefreplace' as well as any other name registered with\n\
9572codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573
9574static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009575unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009577 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 char *encoding = NULL;
9579 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009580
Benjamin Peterson308d6372009-09-18 21:42:35 +00009581 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9582 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009584 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009585}
9586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009587PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589\n\
9590Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009591If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592
9593static PyObject*
9594unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9595{
9596 Py_UNICODE *e;
9597 Py_UNICODE *p;
9598 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009599 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601 PyUnicodeObject *u;
9602 int tabsize = 8;
9603
9604 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9608 return NULL;
9609
Thomas Wouters7e474022000-07-16 12:04:32 +00009610 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009611 i = 0; /* chars up to and including most recent \n or \r */
9612 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9614 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 if (tabsize > 0) {
9617 incr = tabsize - (j % tabsize); /* cannot overflow */
9618 if (j > PY_SSIZE_T_MAX - incr)
9619 goto overflow1;
9620 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009621 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 if (j > PY_SSIZE_T_MAX - 1)
9625 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 j++;
9627 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 if (i > PY_SSIZE_T_MAX - j)
9629 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009631 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632 }
9633 }
9634
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009635 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009637
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 /* Second pass: create output string and fill it */
9639 u = _PyUnicode_New(i + j);
9640 if (!u)
9641 return NULL;
9642
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009643 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 q = _PyUnicode_WSTR(u); /* next output char */
9645 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 if (tabsize > 0) {
9650 i = tabsize - (j % tabsize);
9651 j += i;
9652 while (i--) {
9653 if (q >= qe)
9654 goto overflow2;
9655 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009656 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 else {
9660 if (q >= qe)
9661 goto overflow2;
9662 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009663 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664 if (*p == '\n' || *p == '\r')
9665 j = 0;
9666 }
9667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 if (PyUnicode_READY(u) == -1) {
9669 Py_DECREF(u);
9670 return NULL;
9671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009673
9674 overflow2:
9675 Py_DECREF(u);
9676 overflow1:
9677 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679}
9680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009681PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683\n\
9684Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009685such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686arguments start and end are interpreted as in slice notation.\n\
9687\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009688Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689
9690static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Jesus Ceaac451502011-04-20 17:09:23 +02009693 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009694 Py_ssize_t start;
9695 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009696 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697
Jesus Ceaac451502011-04-20 17:09:23 +02009698 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9699 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (PyUnicode_READY(self) == -1)
9703 return NULL;
9704 if (PyUnicode_READY(substring) == -1)
9705 return NULL;
9706
9707 result = any_find_slice(
9708 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9709 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009710 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
9712 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 if (result == -2)
9715 return NULL;
9716
Christian Heimes217cfd12007-12-02 14:31:20 +00009717 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
9720static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009721unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 Py_UCS4 ch;
9724
9725 if (PyUnicode_READY(self) == -1)
9726 return NULL;
9727 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 PyErr_SetString(PyExc_IndexError, "string index out of range");
9729 return NULL;
9730 }
9731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9733 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734}
9735
Guido van Rossumc2504932007-09-18 19:42:40 +00009736/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009737 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009738static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009739unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Guido van Rossumc2504932007-09-18 19:42:40 +00009741 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009742 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 if (_PyUnicode_HASH(self) != -1)
9745 return _PyUnicode_HASH(self);
9746 if (PyUnicode_READY(self) == -1)
9747 return -1;
9748 len = PyUnicode_GET_LENGTH(self);
9749
9750 /* The hash function as a macro, gets expanded three times below. */
9751#define HASH(P) \
9752 x = (Py_uhash_t)*P << 7; \
9753 while (--len >= 0) \
9754 x = (1000003*x) ^ (Py_uhash_t)*P++;
9755
9756 switch (PyUnicode_KIND(self)) {
9757 case PyUnicode_1BYTE_KIND: {
9758 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9759 HASH(c);
9760 break;
9761 }
9762 case PyUnicode_2BYTE_KIND: {
9763 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9764 HASH(s);
9765 break;
9766 }
9767 default: {
9768 Py_UCS4 *l;
9769 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9770 "Impossible switch case in unicode_hash");
9771 l = PyUnicode_4BYTE_DATA(self);
9772 HASH(l);
9773 break;
9774 }
9775 }
9776 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9777
Guido van Rossumc2504932007-09-18 19:42:40 +00009778 if (x == -1)
9779 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009781 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009785PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009788Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789
9790static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009793 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009794 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009795 Py_ssize_t start;
9796 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797
Jesus Ceaac451502011-04-20 17:09:23 +02009798 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9799 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 if (PyUnicode_READY(self) == -1)
9803 return NULL;
9804 if (PyUnicode_READY(substring) == -1)
9805 return NULL;
9806
9807 result = any_find_slice(
9808 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9809 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009810 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
9812 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 if (result == -2)
9815 return NULL;
9816
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 if (result < 0) {
9818 PyErr_SetString(PyExc_ValueError, "substring not found");
9819 return NULL;
9820 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009821
Christian Heimes217cfd12007-12-02 14:31:20 +00009822 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823}
9824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009825PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009826 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009828Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009829at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830
9831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009832unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 Py_ssize_t i, length;
9835 int kind;
9836 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837 int cased;
9838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 if (PyUnicode_READY(self) == -1)
9840 return NULL;
9841 length = PyUnicode_GET_LENGTH(self);
9842 kind = PyUnicode_KIND(self);
9843 data = PyUnicode_DATA(self);
9844
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 if (length == 1)
9847 return PyBool_FromLong(
9848 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009850 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009853
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 for (i = 0; i < length; i++) {
9856 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009857
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9859 return PyBool_FromLong(0);
9860 else if (!cased && Py_UNICODE_ISLOWER(ch))
9861 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009863 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864}
9865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009866PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009867 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009869Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009870at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871
9872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009873unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 Py_ssize_t i, length;
9876 int kind;
9877 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878 int cased;
9879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 if (PyUnicode_READY(self) == -1)
9881 return NULL;
9882 length = PyUnicode_GET_LENGTH(self);
9883 kind = PyUnicode_KIND(self);
9884 data = PyUnicode_DATA(self);
9885
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 if (length == 1)
9888 return PyBool_FromLong(
9889 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009891 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009894
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 for (i = 0; i < length; i++) {
9897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009898
Benjamin Peterson29060642009-01-31 22:14:21 +00009899 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9900 return PyBool_FromLong(0);
9901 else if (!cased && Py_UNICODE_ISUPPER(ch))
9902 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009904 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905}
9906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009907PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009910Return True if S is a titlecased string and there is at least one\n\
9911character in S, i.e. upper- and titlecase characters may only\n\
9912follow uncased characters and lowercase characters only cased ones.\n\
9913Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914
9915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009916unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 Py_ssize_t i, length;
9919 int kind;
9920 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921 int cased, previous_is_cased;
9922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (PyUnicode_READY(self) == -1)
9924 return NULL;
9925 length = PyUnicode_GET_LENGTH(self);
9926 kind = PyUnicode_KIND(self);
9927 data = PyUnicode_DATA(self);
9928
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (length == 1) {
9931 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9932 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9933 (Py_UNICODE_ISUPPER(ch) != 0));
9934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009939
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 cased = 0;
9941 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 for (i = 0; i < length; i++) {
9943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009944
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9946 if (previous_is_cased)
9947 return PyBool_FromLong(0);
9948 previous_is_cased = 1;
9949 cased = 1;
9950 }
9951 else if (Py_UNICODE_ISLOWER(ch)) {
9952 if (!previous_is_cased)
9953 return PyBool_FromLong(0);
9954 previous_is_cased = 1;
9955 cased = 1;
9956 }
9957 else
9958 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009960 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961}
9962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009963PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009966Return True if all characters in S are whitespace\n\
9967and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
9969static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009970unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 Py_ssize_t i, length;
9973 int kind;
9974 void *data;
9975
9976 if (PyUnicode_READY(self) == -1)
9977 return NULL;
9978 length = PyUnicode_GET_LENGTH(self);
9979 kind = PyUnicode_KIND(self);
9980 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 if (length == 1)
9984 return PyBool_FromLong(
9985 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009987 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 for (i = 0; i < length; i++) {
9992 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03009993 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00009994 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997}
9998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009999PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010001\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010002Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010003and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010004
10005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010006unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 Py_ssize_t i, length;
10009 int kind;
10010 void *data;
10011
10012 if (PyUnicode_READY(self) == -1)
10013 return NULL;
10014 length = PyUnicode_GET_LENGTH(self);
10015 kind = PyUnicode_KIND(self);
10016 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010017
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010018 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 if (length == 1)
10020 return PyBool_FromLong(
10021 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010022
10023 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 for (i = 0; i < length; i++) {
10028 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010030 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010031 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010032}
10033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010034PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010035 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010036\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010037Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010038and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010039
10040static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010041unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 int kind;
10044 void *data;
10045 Py_ssize_t len, i;
10046
10047 if (PyUnicode_READY(self) == -1)
10048 return NULL;
10049
10050 kind = PyUnicode_KIND(self);
10051 data = PyUnicode_DATA(self);
10052 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010053
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010054 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 if (len == 1) {
10056 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10057 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10058 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010059
10060 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010062 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 for (i = 0; i < len; i++) {
10065 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010066 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010069 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010070}
10071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010072PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010075Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010076False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077
10078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010079unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 Py_ssize_t i, length;
10082 int kind;
10083 void *data;
10084
10085 if (PyUnicode_READY(self) == -1)
10086 return NULL;
10087 length = PyUnicode_GET_LENGTH(self);
10088 kind = PyUnicode_KIND(self);
10089 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (length == 1)
10093 return PyBool_FromLong(
10094 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010096 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 for (i = 0; i < length; i++) {
10101 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010104 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105}
10106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010107PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010110Return True if all characters in S are digits\n\
10111and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112
10113static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010114unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 Py_ssize_t i, length;
10117 int kind;
10118 void *data;
10119
10120 if (PyUnicode_READY(self) == -1)
10121 return NULL;
10122 length = PyUnicode_GET_LENGTH(self);
10123 kind = PyUnicode_KIND(self);
10124 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 if (length == 1) {
10128 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10129 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010132 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 for (i = 0; i < length; i++) {
10137 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010140 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141}
10142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010143PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010146Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010147False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
10149static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010150unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 Py_ssize_t i, length;
10153 int kind;
10154 void *data;
10155
10156 if (PyUnicode_READY(self) == -1)
10157 return NULL;
10158 length = PyUnicode_GET_LENGTH(self);
10159 kind = PyUnicode_KIND(self);
10160 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (length == 1)
10164 return PyBool_FromLong(
10165 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010167 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010169 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 for (i = 0; i < length; i++) {
10172 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010175 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176}
10177
Martin v. Löwis47383402007-08-15 07:32:56 +000010178int
10179PyUnicode_IsIdentifier(PyObject *self)
10180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 int kind;
10182 void *data;
10183 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010184 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (PyUnicode_READY(self) == -1) {
10187 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 }
10190
10191 /* Special case for empty strings */
10192 if (PyUnicode_GET_LENGTH(self) == 0)
10193 return 0;
10194 kind = PyUnicode_KIND(self);
10195 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010196
10197 /* PEP 3131 says that the first character must be in
10198 XID_Start and subsequent characters in XID_Continue,
10199 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010200 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010201 letters, digits, underscore). However, given the current
10202 definition of XID_Start and XID_Continue, it is sufficient
10203 to check just for these, except that _ must be allowed
10204 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010206 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010207 return 0;
10208
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010209 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010212 return 1;
10213}
10214
10215PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010217\n\
10218Return True if S is a valid identifier according\n\
10219to the language definition.");
10220
10221static PyObject*
10222unicode_isidentifier(PyObject *self)
10223{
10224 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10225}
10226
Georg Brandl559e5d72008-06-11 18:37:52 +000010227PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010229\n\
10230Return True if all characters in S are considered\n\
10231printable in repr() or S is empty, False otherwise.");
10232
10233static PyObject*
10234unicode_isprintable(PyObject *self)
10235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 Py_ssize_t i, length;
10237 int kind;
10238 void *data;
10239
10240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
10242 length = PyUnicode_GET_LENGTH(self);
10243 kind = PyUnicode_KIND(self);
10244 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010245
10246 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (length == 1)
10248 return PyBool_FromLong(
10249 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 for (i = 0; i < length; i++) {
10252 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010253 Py_RETURN_FALSE;
10254 }
10255 }
10256 Py_RETURN_TRUE;
10257}
10258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010259PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010260 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261\n\
10262Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010263iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
10265static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010266unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010268 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269}
10270
Martin v. Löwis18e16552006-02-15 17:27:45 +000010271static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272unicode_length(PyUnicodeObject *self)
10273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (PyUnicode_READY(self) == -1)
10275 return -1;
10276 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010279PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010282Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010283done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
10285static PyObject *
10286unicode_ljust(PyUnicodeObject *self, PyObject *args)
10287{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010288 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 Py_UCS4 fillchar = ' ';
10290
10291 if (PyUnicode_READY(self) == -1)
10292 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010294 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295 return NULL;
10296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 Py_INCREF(self);
10299 return (PyObject*) self;
10300 }
10301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010305PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
10310static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010311unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313 return fixup(self, fixlower);
10314}
10315
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* One "|O:<method name>" string per selector, indexed by the values above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the entry above minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10324
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010325/* externally visible for str.strip(unicode) */
10326PyObject *
10327_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 void *data;
10330 int kind;
10331 Py_ssize_t i, j, len;
10332 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10335 return NULL;
10336
10337 kind = PyUnicode_KIND(self);
10338 data = PyUnicode_DATA(self);
10339 len = PyUnicode_GET_LENGTH(self);
10340 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10341 PyUnicode_DATA(sepobj),
10342 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343
Benjamin Peterson14339b62009-01-31 16:36:08 +000010344 i = 0;
10345 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 while (i < len &&
10347 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010348 i++;
10349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010350 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010351
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 j = len;
10353 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 do {
10355 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 } while (j >= i &&
10357 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010359 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010360
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 Py_INCREF(self);
10363 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010364 }
10365 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010367}
10368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369/* Assumes an already ready self string. */
10370
10371static PyObject *
10372substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10373{
10374 const int kind = PyUnicode_KIND(self);
10375 void *data = PyUnicode_DATA(self);
10376 Py_UCS4 maxchar = 0;
10377 Py_ssize_t i;
10378 PyObject *unicode;
10379
10380 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10381 PyErr_BadInternalCall();
10382 return NULL;
10383 }
10384
10385 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10386 Py_INCREF(self);
10387 return (PyObject*)self;
10388 }
10389
10390 for (i = 0; i < len; ++i) {
10391 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10392 if (ch > maxchar)
10393 maxchar = ch;
10394 }
10395
10396 unicode = PyUnicode_New(len, maxchar);
10397 if (unicode == NULL)
10398 return NULL;
10399 PyUnicode_CopyCharacters(unicode, 0,
10400 (PyObject*)self, start, len);
10401 return unicode;
10402}
10403
10404PyObject*
10405PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10406{
10407 unsigned char *data;
10408 int kind;
10409
10410 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10411 && PyUnicode_CheckExact(self))
10412 {
10413 Py_INCREF(self);
10414 return (PyObject *)self;
10415 }
10416
10417 if ((end - start) == 1)
10418 return unicode_getitem((PyUnicodeObject*)self, start);
10419
10420 if (PyUnicode_READY(self) == -1)
10421 return NULL;
10422 kind = PyUnicode_KIND(self);
10423 data = PyUnicode_1BYTE_DATA(self);
10424 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10425 end-start);
10426}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
10428static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010429do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 int kind;
10432 void *data;
10433 Py_ssize_t len, i, j;
10434
10435 if (PyUnicode_READY(self) == -1)
10436 return NULL;
10437
10438 kind = PyUnicode_KIND(self);
10439 data = PyUnicode_DATA(self);
10440 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010441
Benjamin Peterson14339b62009-01-31 16:36:08 +000010442 i = 0;
10443 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010445 i++;
10446 }
10447 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010448
Benjamin Peterson14339b62009-01-31 16:36:08 +000010449 j = len;
10450 if (striptype != LEFTSTRIP) {
10451 do {
10452 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 j++;
10455 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010456
Benjamin Peterson14339b62009-01-31 16:36:08 +000010457 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10458 Py_INCREF(self);
10459 return (PyObject*)self;
10460 }
10461 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463}
10464
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010465
10466static PyObject *
10467do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10468{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010470
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10472 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010473
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474 if (sep != NULL && sep != Py_None) {
10475 if (PyUnicode_Check(sep))
10476 return _PyUnicode_XStrip(self, striptype, sep);
10477 else {
10478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 "%s arg must be None or str",
10480 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010481 return NULL;
10482 }
10483 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010484
Benjamin Peterson14339b62009-01-31 16:36:08 +000010485 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010486}
10487
10488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010489PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010490 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010491\n\
10492Return a copy of the string S with leading and trailing\n\
10493whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010494If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010495
10496static PyObject *
10497unicode_strip(PyUnicodeObject *self, PyObject *args)
10498{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499 if (PyTuple_GET_SIZE(args) == 0)
10500 return do_strip(self, BOTHSTRIP); /* Common case */
10501 else
10502 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010503}
10504
10505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010508\n\
10509Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010510If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010511
10512static PyObject *
10513unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10514{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010515 if (PyTuple_GET_SIZE(args) == 0)
10516 return do_strip(self, LEFTSTRIP); /* Common case */
10517 else
10518 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010519}
10520
10521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010522PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010523 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524\n\
10525Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010526If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010527
10528static PyObject *
10529unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10530{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010531 if (PyTuple_GET_SIZE(args) == 0)
10532 return do_strip(self, RIGHTSTRIP); /* Common case */
10533 else
10534 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010535}
10536
10537
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010539unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540{
10541 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 Py_ssize_t nchars, n;
10543 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544
Georg Brandl222de0f2009-04-12 12:01:50 +000010545 if (len < 1) {
10546 Py_INCREF(unicode_empty);
10547 return (PyObject *)unicode_empty;
10548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
Tim Peters7a29bd52001-09-12 03:03:31 +000010550 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551 /* no repeat, return original string */
10552 Py_INCREF(str);
10553 return (PyObject*) str;
10554 }
Tim Peters8f422462000-09-09 06:13:41 +000010555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (PyUnicode_READY(str) == -1)
10557 return NULL;
10558
Tim Peters8f422462000-09-09 06:13:41 +000010559 /* ensure # of chars needed doesn't overflow int and # of bytes
10560 * needed doesn't overflow size_t
10561 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 nchars = len * PyUnicode_GET_LENGTH(str);
10563 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010564 PyErr_SetString(PyExc_OverflowError,
10565 "repeated string is too long");
10566 return NULL;
10567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 char_size = PyUnicode_CHARACTER_SIZE(str);
10569 nbytes = (nchars + 1) * char_size;
10570 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010571 PyErr_SetString(PyExc_OverflowError,
10572 "repeated string is too long");
10573 return NULL;
10574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 if (!u)
10577 return NULL;
10578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (PyUnicode_GET_LENGTH(str) == 1) {
10580 const int kind = PyUnicode_KIND(str);
10581 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10582 void *to = PyUnicode_DATA(u);
10583 for (n = 0; n < len; ++n)
10584 PyUnicode_WRITE(kind, to, n, fill_char);
10585 }
10586 else {
10587 /* number of characters copied this far */
10588 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10589 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10590 char *to = (char *) PyUnicode_DATA(u);
10591 Py_MEMCPY(to, PyUnicode_DATA(str),
10592 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 n = (done <= nchars-done) ? done : nchars-done;
10595 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010596 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 }
10599
10600 return (PyObject*) u;
10601}
10602
Alexander Belopolsky40018472011-02-26 01:02:56 +000010603PyObject *
10604PyUnicode_Replace(PyObject *obj,
10605 PyObject *subobj,
10606 PyObject *replobj,
10607 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608{
10609 PyObject *self;
10610 PyObject *str1;
10611 PyObject *str2;
10612 PyObject *result;
10613
10614 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 Py_DECREF(self);
10620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 }
10622 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 Py_DECREF(self);
10625 Py_DECREF(str1);
10626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 Py_DECREF(self);
10630 Py_DECREF(str1);
10631 Py_DECREF(str2);
10632 return result;
10633}
10634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010635PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010636 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637\n\
10638Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010639old replaced by new. If the optional argument count is\n\
10640given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
10642static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 PyObject *str1;
10646 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010647 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 PyObject *result;
10649
Martin v. Löwis18e16552006-02-15 17:27:45 +000010650 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 str1 = PyUnicode_FromObject(str1);
10655 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10656 return NULL;
10657 str2 = PyUnicode_FromObject(str2);
10658 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 Py_DECREF(str1);
10660 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010661 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662
10663 result = replace(self, str1, str2, maxcount);
10664
10665 Py_DECREF(str1);
10666 Py_DECREF(str2);
10667 return result;
10668}
10669
Alexander Belopolsky40018472011-02-26 01:02:56 +000010670static PyObject *
10671unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010673 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 Py_ssize_t isize;
10675 Py_ssize_t osize, squote, dquote, i, o;
10676 Py_UCS4 max, quote;
10677 int ikind, okind;
10678 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010681 return NULL;
10682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 isize = PyUnicode_GET_LENGTH(unicode);
10684 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 /* Compute length of output, quote characters, and
10687 maximum character */
10688 osize = 2; /* quotes */
10689 max = 127;
10690 squote = dquote = 0;
10691 ikind = PyUnicode_KIND(unicode);
10692 for (i = 0; i < isize; i++) {
10693 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10694 switch (ch) {
10695 case '\'': squote++; osize++; break;
10696 case '"': dquote++; osize++; break;
10697 case '\\': case '\t': case '\r': case '\n':
10698 osize += 2; break;
10699 default:
10700 /* Fast-path ASCII */
10701 if (ch < ' ' || ch == 0x7f)
10702 osize += 4; /* \xHH */
10703 else if (ch < 0x7f)
10704 osize++;
10705 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10706 osize++;
10707 max = ch > max ? ch : max;
10708 }
10709 else if (ch < 0x100)
10710 osize += 4; /* \xHH */
10711 else if (ch < 0x10000)
10712 osize += 6; /* \uHHHH */
10713 else
10714 osize += 10; /* \uHHHHHHHH */
10715 }
10716 }
10717
10718 quote = '\'';
10719 if (squote) {
10720 if (dquote)
10721 /* Both squote and dquote present. Use squote,
10722 and escape them */
10723 osize += squote;
10724 else
10725 quote = '"';
10726 }
10727
10728 repr = PyUnicode_New(osize, max);
10729 if (repr == NULL)
10730 return NULL;
10731 okind = PyUnicode_KIND(repr);
10732 odata = PyUnicode_DATA(repr);
10733
10734 PyUnicode_WRITE(okind, odata, 0, quote);
10735 PyUnicode_WRITE(okind, odata, osize-1, quote);
10736
10737 for (i = 0, o = 1; i < isize; i++) {
10738 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010739
10740 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if ((ch == quote) || (ch == '\\')) {
10742 PyUnicode_WRITE(okind, odata, o++, '\\');
10743 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010744 continue;
10745 }
10746
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010748 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 PyUnicode_WRITE(okind, odata, o++, '\\');
10750 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010751 }
10752 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 PyUnicode_WRITE(okind, odata, o++, '\\');
10754 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010755 }
10756 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 PyUnicode_WRITE(okind, odata, o++, '\\');
10758 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010759 }
10760
10761 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010762 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 PyUnicode_WRITE(okind, odata, o++, '\\');
10764 PyUnicode_WRITE(okind, odata, o++, 'x');
10765 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10766 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010767 }
10768
Georg Brandl559e5d72008-06-11 18:37:52 +000010769 /* Copy ASCII characters as-is */
10770 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010772 }
10773
Benjamin Peterson29060642009-01-31 22:14:21 +000010774 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010775 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010776 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010777 (categories Z* and C* except ASCII space)
10778 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010780 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if (ch <= 0xff) {
10782 PyUnicode_WRITE(okind, odata, o++, '\\');
10783 PyUnicode_WRITE(okind, odata, o++, 'x');
10784 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10785 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010786 }
10787 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 else if (ch >= 0x10000) {
10789 PyUnicode_WRITE(okind, odata, o++, '\\');
10790 PyUnicode_WRITE(okind, odata, o++, 'U');
10791 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10792 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10793 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10794 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10795 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10796 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10797 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10798 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010799 }
10800 /* Map 16-bit characters to '\uxxxx' */
10801 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 PyUnicode_WRITE(okind, odata, o++, '\\');
10803 PyUnicode_WRITE(okind, odata, o++, 'u');
10804 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10805 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10806 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10807 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010808 }
10809 }
10810 /* Copy characters as-is */
10811 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010813 }
10814 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010817 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818}
10819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010820PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010821 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822\n\
10823Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010824such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825arguments start and end are interpreted as in slice notation.\n\
10826\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010827Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
10829static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Jesus Ceaac451502011-04-20 17:09:23 +020010832 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010833 Py_ssize_t start;
10834 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010835 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
Jesus Ceaac451502011-04-20 17:09:23 +020010837 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10838 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (PyUnicode_READY(self) == -1)
10842 return NULL;
10843 if (PyUnicode_READY(substring) == -1)
10844 return NULL;
10845
10846 result = any_find_slice(
10847 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10848 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
10851 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 if (result == -2)
10854 return NULL;
10855
Christian Heimes217cfd12007-12-02 14:31:20 +000010856 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857}
10858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010859PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010862Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863
10864static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866{
Jesus Ceaac451502011-04-20 17:09:23 +020010867 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010868 Py_ssize_t start;
10869 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
Jesus Ceaac451502011-04-20 17:09:23 +020010872 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10873 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (PyUnicode_READY(self) == -1)
10877 return NULL;
10878 if (PyUnicode_READY(substring) == -1)
10879 return NULL;
10880
10881 result = any_find_slice(
10882 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10883 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010884 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 if (result == -2)
10889 return NULL;
10890
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 if (result < 0) {
10892 PyErr_SetString(PyExc_ValueError, "substring not found");
10893 return NULL;
10894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895
Christian Heimes217cfd12007-12-02 14:31:20 +000010896 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897}
10898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010899PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010902Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010903done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905static PyObject *
10906unicode_rjust(PyUnicodeObject *self, PyObject *args)
10907{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010908 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 Py_UCS4 fillchar = ' ';
10910
10911 if (PyUnicode_READY(self) == -1)
10912 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010913
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010914 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 return NULL;
10916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 Py_INCREF(self);
10919 return (PyObject*) self;
10920 }
10921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923}
10924
Alexander Belopolsky40018472011-02-26 01:02:56 +000010925PyObject *
10926PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927{
10928 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010929
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930 s = PyUnicode_FromObject(s);
10931 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010932 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010933 if (sep != NULL) {
10934 sep = PyUnicode_FromObject(sep);
10935 if (sep == NULL) {
10936 Py_DECREF(s);
10937 return NULL;
10938 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939 }
10940
10941 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10942
10943 Py_DECREF(s);
10944 Py_XDECREF(sep);
10945 return result;
10946}
10947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950\n\
10951Return a list of the words in S, using sep as the\n\
10952delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010953splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010954whitespace string is a separator and empty strings are\n\
10955removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
10957static PyObject*
10958unicode_split(PyUnicodeObject *self, PyObject *args)
10959{
10960 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010961 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Martin v. Löwis18e16552006-02-15 17:27:45 +000010963 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964 return NULL;
10965
10966 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000010967 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 else
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972}
10973
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974PyObject *
10975PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
10976{
10977 PyObject* str_obj;
10978 PyObject* sep_obj;
10979 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 int kind1, kind2, kind;
10981 void *buf1 = NULL, *buf2 = NULL;
10982 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010983
10984 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010987 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010989 Py_DECREF(str_obj);
10990 return NULL;
10991 }
10992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 kind1 = PyUnicode_KIND(str_in);
10994 kind2 = PyUnicode_KIND(sep_obj);
10995 kind = kind1 > kind2 ? kind1 : kind2;
10996 buf1 = PyUnicode_DATA(str_in);
10997 if (kind1 != kind)
10998 buf1 = _PyUnicode_AsKind(str_in, kind);
10999 if (!buf1)
11000 goto onError;
11001 buf2 = PyUnicode_DATA(sep_obj);
11002 if (kind2 != kind)
11003 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11004 if (!buf2)
11005 goto onError;
11006 len1 = PyUnicode_GET_LENGTH(str_obj);
11007 len2 = PyUnicode_GET_LENGTH(sep_obj);
11008
11009 switch(PyUnicode_KIND(str_in)) {
11010 case PyUnicode_1BYTE_KIND:
11011 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11012 break;
11013 case PyUnicode_2BYTE_KIND:
11014 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11015 break;
11016 case PyUnicode_4BYTE_KIND:
11017 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11018 break;
11019 default:
11020 assert(0);
11021 out = 0;
11022 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
11024 Py_DECREF(sep_obj);
11025 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (kind1 != kind)
11027 PyMem_Free(buf1);
11028 if (kind2 != kind)
11029 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011030
11031 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 onError:
11033 Py_DECREF(sep_obj);
11034 Py_DECREF(str_obj);
11035 if (kind1 != kind && buf1)
11036 PyMem_Free(buf1);
11037 if (kind2 != kind && buf2)
11038 PyMem_Free(buf2);
11039 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040}
11041
11042
11043PyObject *
11044PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11045{
11046 PyObject* str_obj;
11047 PyObject* sep_obj;
11048 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 int kind1, kind2, kind;
11050 void *buf1 = NULL, *buf2 = NULL;
11051 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011052
11053 str_obj = PyUnicode_FromObject(str_in);
11054 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 sep_obj = PyUnicode_FromObject(sep_in);
11057 if (!sep_obj) {
11058 Py_DECREF(str_obj);
11059 return NULL;
11060 }
11061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 kind1 = PyUnicode_KIND(str_in);
11063 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011064 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 buf1 = PyUnicode_DATA(str_in);
11066 if (kind1 != kind)
11067 buf1 = _PyUnicode_AsKind(str_in, kind);
11068 if (!buf1)
11069 goto onError;
11070 buf2 = PyUnicode_DATA(sep_obj);
11071 if (kind2 != kind)
11072 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11073 if (!buf2)
11074 goto onError;
11075 len1 = PyUnicode_GET_LENGTH(str_obj);
11076 len2 = PyUnicode_GET_LENGTH(sep_obj);
11077
11078 switch(PyUnicode_KIND(str_in)) {
11079 case PyUnicode_1BYTE_KIND:
11080 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11081 break;
11082 case PyUnicode_2BYTE_KIND:
11083 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11084 break;
11085 case PyUnicode_4BYTE_KIND:
11086 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11087 break;
11088 default:
11089 assert(0);
11090 out = 0;
11091 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
11093 Py_DECREF(sep_obj);
11094 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (kind1 != kind)
11096 PyMem_Free(buf1);
11097 if (kind2 != kind)
11098 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099
11100 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 onError:
11102 Py_DECREF(sep_obj);
11103 Py_DECREF(str_obj);
11104 if (kind1 != kind && buf1)
11105 PyMem_Free(buf1);
11106 if (kind2 != kind && buf2)
11107 PyMem_Free(buf2);
11108 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011109}
11110
11111PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011113\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011114Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011115the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011116found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011117
11118static PyObject*
11119unicode_partition(PyUnicodeObject *self, PyObject *separator)
11120{
11121 return PyUnicode_Partition((PyObject *)self, separator);
11122}
11123
11124PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011125 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011126\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011127Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011128the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011129separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011130
11131static PyObject*
11132unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11133{
11134 return PyUnicode_RPartition((PyObject *)self, separator);
11135}
11136
Alexander Belopolsky40018472011-02-26 01:02:56 +000011137PyObject *
11138PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011139{
11140 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011141
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011142 s = PyUnicode_FromObject(s);
11143 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011144 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 if (sep != NULL) {
11146 sep = PyUnicode_FromObject(sep);
11147 if (sep == NULL) {
11148 Py_DECREF(s);
11149 return NULL;
11150 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011151 }
11152
11153 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11154
11155 Py_DECREF(s);
11156 Py_XDECREF(sep);
11157 return result;
11158}
11159
11160PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011162\n\
11163Return a list of the words in S, using sep as the\n\
11164delimiter string, starting at the end of the string and\n\
11165working to the front. If maxsplit is given, at most maxsplit\n\
11166splits are done. If sep is not specified, any whitespace string\n\
11167is a separator.");
11168
11169static PyObject*
11170unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11171{
11172 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011173 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011174
Martin v. Löwis18e16552006-02-15 17:27:45 +000011175 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011176 return NULL;
11177
11178 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011180 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011181 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011182 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011184}
11185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188\n\
11189Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011190Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
11193static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011194unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011196 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011197 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011199 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11200 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 return NULL;
11202
Guido van Rossum86662912000-04-11 15:38:46 +000011203 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204}
11205
11206static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011207PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208{
Walter Dörwald346737f2007-05-31 10:44:43 +000011209 if (PyUnicode_CheckExact(self)) {
11210 Py_INCREF(self);
11211 return self;
11212 } else
11213 /* Subtype -- return genuine unicode string with the same value. */
11214 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11215 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216}
11217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011218PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220\n\
11221Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
11224static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011225unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 return fixup(self, fixswapcase);
11228}
11229
Georg Brandlceee0772007-11-27 23:48:05 +000011230PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011232\n\
11233Return a translation table usable for str.translate().\n\
11234If there is only one argument, it must be a dictionary mapping Unicode\n\
11235ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011236Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011237If there are two arguments, they must be strings of equal length, and\n\
11238in the resulting dictionary, each character in x will be mapped to the\n\
11239character at the same position in y. If there is a third argument, it\n\
11240must be a string, whose characters will be mapped to None in the result.");
11241
11242static PyObject*
11243unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11244{
11245 PyObject *x, *y = NULL, *z = NULL;
11246 PyObject *new = NULL, *key, *value;
11247 Py_ssize_t i = 0;
11248 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249
Georg Brandlceee0772007-11-27 23:48:05 +000011250 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11251 return NULL;
11252 new = PyDict_New();
11253 if (!new)
11254 return NULL;
11255 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 int x_kind, y_kind, z_kind;
11257 void *x_data, *y_data, *z_data;
11258
Georg Brandlceee0772007-11-27 23:48:05 +000011259 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011260 if (!PyUnicode_Check(x)) {
11261 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11262 "be a string if there is a second argument");
11263 goto err;
11264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011266 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11267 "arguments must have equal length");
11268 goto err;
11269 }
11270 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 x_kind = PyUnicode_KIND(x);
11272 y_kind = PyUnicode_KIND(y);
11273 x_data = PyUnicode_DATA(x);
11274 y_data = PyUnicode_DATA(y);
11275 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11276 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11277 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011278 if (!key || !value)
11279 goto err;
11280 res = PyDict_SetItem(new, key, value);
11281 Py_DECREF(key);
11282 Py_DECREF(value);
11283 if (res < 0)
11284 goto err;
11285 }
11286 /* create entries for deleting chars in z */
11287 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 z_kind = PyUnicode_KIND(z);
11289 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011290 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011292 if (!key)
11293 goto err;
11294 res = PyDict_SetItem(new, key, Py_None);
11295 Py_DECREF(key);
11296 if (res < 0)
11297 goto err;
11298 }
11299 }
11300 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 int kind;
11302 void *data;
11303
Georg Brandlceee0772007-11-27 23:48:05 +000011304 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011305 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011306 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11307 "to maketrans it must be a dict");
11308 goto err;
11309 }
11310 /* copy entries into the new dict, converting string keys to int keys */
11311 while (PyDict_Next(x, &i, &key, &value)) {
11312 if (PyUnicode_Check(key)) {
11313 /* convert string keys to integer keys */
11314 PyObject *newkey;
11315 if (PyUnicode_GET_SIZE(key) != 1) {
11316 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11317 "table must be of length 1");
11318 goto err;
11319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 kind = PyUnicode_KIND(key);
11321 data = PyUnicode_DATA(key);
11322 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011323 if (!newkey)
11324 goto err;
11325 res = PyDict_SetItem(new, newkey, value);
11326 Py_DECREF(newkey);
11327 if (res < 0)
11328 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011329 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011330 /* just keep integer keys */
11331 if (PyDict_SetItem(new, key, value) < 0)
11332 goto err;
11333 } else {
11334 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11335 "be strings or integers");
11336 goto err;
11337 }
11338 }
11339 }
11340 return new;
11341 err:
11342 Py_DECREF(new);
11343 return NULL;
11344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
11349Return a copy of the string S, where all characters have been mapped\n\
11350through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011351Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011352Unmapped characters are left untouched. Characters mapped to None\n\
11353are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359}
11360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011361PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
11366static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011367unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 return fixup(self, fixupper);
11370}
11371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011375Pad a numeric string S with zeros on the left, to fill a field\n\
11376of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
11378static PyObject *
11379unicode_zfill(PyUnicodeObject *self, PyObject *args)
11380{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011381 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011383 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 int kind;
11385 void *data;
11386 Py_UCS4 chr;
11387
11388 if (PyUnicode_READY(self) == -1)
11389 return NULL;
11390
Martin v. Löwis18e16552006-02-15 17:27:45 +000011391 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392 return NULL;
11393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011395 if (PyUnicode_CheckExact(self)) {
11396 Py_INCREF(self);
11397 return (PyObject*) self;
11398 }
11399 else
11400 return PyUnicode_FromUnicode(
11401 PyUnicode_AS_UNICODE(self),
11402 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 }
11405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408 u = pad(self, fill, 0, '0');
11409
Walter Dörwald068325e2002-04-15 13:36:47 +000011410 if (u == NULL)
11411 return NULL;
11412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 kind = PyUnicode_KIND(u);
11414 data = PyUnicode_DATA(u);
11415 chr = PyUnicode_READ(kind, data, fill);
11416
11417 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 PyUnicode_WRITE(kind, data, 0, chr);
11420 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 }
11422
11423 return (PyObject*) u;
11424}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011427static PyObject *
11428unicode__decimal2ascii(PyObject *self)
11429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011431}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432#endif
11433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011437Return True if S starts with the specified prefix, False otherwise.\n\
11438With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011439With optional end, stop comparing S at that position.\n\
11440prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject *
11443unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011446 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011448 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011449 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011450 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Jesus Ceaac451502011-04-20 17:09:23 +020011452 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011454 if (PyTuple_Check(subobj)) {
11455 Py_ssize_t i;
11456 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11457 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011459 if (substring == NULL)
11460 return NULL;
11461 result = tailmatch(self, substring, start, end, -1);
11462 Py_DECREF(substring);
11463 if (result) {
11464 Py_RETURN_TRUE;
11465 }
11466 }
11467 /* nothing matched */
11468 Py_RETURN_FALSE;
11469 }
11470 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011471 if (substring == NULL) {
11472 if (PyErr_ExceptionMatches(PyExc_TypeError))
11473 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11474 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011476 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011477 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011479 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480}
11481
11482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011486Return True if S ends with the specified suffix, False otherwise.\n\
11487With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011488With optional end, stop comparing S at that position.\n\
11489suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
11491static PyObject *
11492unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011495 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011497 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011498 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011499 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Jesus Ceaac451502011-04-20 17:09:23 +020011501 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011503 if (PyTuple_Check(subobj)) {
11504 Py_ssize_t i;
11505 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11506 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011508 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011510 result = tailmatch(self, substring, start, end, +1);
11511 Py_DECREF(substring);
11512 if (result) {
11513 Py_RETURN_TRUE;
11514 }
11515 }
11516 Py_RETURN_FALSE;
11517 }
11518 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011519 if (substring == NULL) {
11520 if (PyErr_ExceptionMatches(PyExc_TypeError))
11521 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11522 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011524 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011525 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011527 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011531
11532PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011534\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011535Return a formatted version of S, using substitutions from args and kwargs.\n\
11536The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011537
Eric Smith27bbca62010-11-04 17:06:58 +000011538PyDoc_STRVAR(format_map__doc__,
11539 "S.format_map(mapping) -> str\n\
11540\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011541Return a formatted version of S, using substitutions from mapping.\n\
11542The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011543
Eric Smith4a7d76d2008-05-30 18:10:19 +000011544static PyObject *
11545unicode__format__(PyObject* self, PyObject* args)
11546{
11547 PyObject *format_spec;
11548
11549 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11550 return NULL;
11551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11553 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011554}
11555
Eric Smith8c663262007-08-25 02:26:07 +000011556PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011558\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011559Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011560
11561static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011562unicode__sizeof__(PyUnicodeObject *v)
11563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 Py_ssize_t size;
11565
11566 /* If it's a compact object, account for base structure +
11567 character data. */
11568 if (PyUnicode_IS_COMPACT_ASCII(v))
11569 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11570 else if (PyUnicode_IS_COMPACT(v))
11571 size = sizeof(PyCompactUnicodeObject) +
11572 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11573 else {
11574 /* If it is a two-block object, account for base object, and
11575 for character block if present. */
11576 size = sizeof(PyUnicodeObject);
11577 if (v->data.any)
11578 size += (PyUnicode_GET_LENGTH(v) + 1) *
11579 PyUnicode_CHARACTER_SIZE(v);
11580 }
11581 /* If the wstr pointer is present, account for it unless it is shared
11582 with the data pointer. Since PyUnicode_DATA will crash if the object
11583 is not ready, check whether it's either not ready (in which case the
11584 data is entirely in wstr) or if the data is not shared. */
11585 if (_PyUnicode_WSTR(v) &&
11586 (!PyUnicode_IS_READY(v) ||
11587 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11588 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11589 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11590 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11591
11592 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011593}
11594
11595PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011597
11598static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011599unicode_getnewargs(PyUnicodeObject *v)
11600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 PyObject *copy;
11602 unsigned char *data;
11603 int kind;
11604 if (PyUnicode_READY(v) == -1)
11605 return NULL;
11606 kind = PyUnicode_KIND(v);
11607 data = PyUnicode_1BYTE_DATA(v);
11608 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11609 if (!copy)
11610 return NULL;
11611 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011612}
11613
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614static PyMethodDef unicode_methods[] = {
11615
11616 /* Order is according to common usage: often used methods should
11617 appear first, since lookup is done sequentially. */
11618
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011619 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011620 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11621 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011622 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011623 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11624 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11625 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11626 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11627 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11628 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11629 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011631 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11632 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11633 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011635 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11636 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11637 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011639 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011640 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011642 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11643 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11644 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11645 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11646 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11647 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11648 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11649 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11650 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11651 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11652 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11653 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11654 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11655 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011656 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011657 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011658 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011659 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011660 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011661 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011662 {"maketrans", (PyCFunction) unicode_maketrans,
11663 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011664 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011665#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011666 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667#endif
11668
11669#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011670 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011671 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672#endif
11673
Benjamin Peterson14339b62009-01-31 16:36:08 +000011674 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675 {NULL, NULL}
11676};
11677
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011678static PyObject *
11679unicode_mod(PyObject *v, PyObject *w)
11680{
Brian Curtindfc80e32011-08-10 20:28:54 -050011681 if (!PyUnicode_Check(v))
11682 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011684}
11685
/* Number protocol table for str.  Only nb_remainder is filled in (with
   unicode_mod); the slots listed before it are explicitly empty and the
   remaining slots are left zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
11692
/* Sequence protocol table for str: length, concatenation, repetition,
   integer indexing and containment are provided; the slice and
   item-assignment slots are empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
11703
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011704static PyObject*
11705unicode_subscript(PyUnicodeObject* self, PyObject* item)
11706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (PyUnicode_READY(self) == -1)
11708 return NULL;
11709
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011710 if (PyIndex_Check(item)) {
11711 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011712 if (i == -1 && PyErr_Occurred())
11713 return NULL;
11714 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011716 return unicode_getitem(self, i);
11717 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011718 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011720 Py_UNICODE* result_buf;
11721 PyObject* result;
11722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011725 return NULL;
11726 }
11727
11728 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 return PyUnicode_New(0, 0);
11730 } else if (start == 0 && step == 1 &&
11731 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011732 PyUnicode_CheckExact(self)) {
11733 Py_INCREF(self);
11734 return (PyObject *)self;
11735 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011737 } else {
11738 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011739 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11740 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011741
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 if (result_buf == NULL)
11743 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011744
11745 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11746 result_buf[i] = source_buf[cur];
11747 }
Tim Petersced69f82003-09-16 20:30:58 +000011748
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011749 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011750 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011751 return result;
11752 }
11753 } else {
11754 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11755 return NULL;
11756 }
11757}
11758
/* Mapping protocol table for str: subscripting goes through
   unicode_subscript (which accepts both indices and slices); there is
   no item assignment. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
11764
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766/* Helpers for PyUnicode_Format() */
11767
11768static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011769getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011771 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 (*p_argidx)++;
11774 if (arglen < 0)
11775 return args;
11776 else
11777 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 }
11779 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 return NULL;
11782}
11783
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011784/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011786static PyObject *
11787formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011789 char *p;
11790 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011792
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 x = PyFloat_AsDouble(v);
11794 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011795 return NULL;
11796
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011799
Eric Smith0923d1d2009-04-16 20:16:10 +000011800 p = PyOS_double_to_string(x, type, prec,
11801 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011802 if (p == NULL)
11803 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011805 PyMem_Free(p);
11806 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807}
11808
Tim Peters38fd5b62000-09-21 05:43:11 +000011809static PyObject*
11810formatlong(PyObject *val, int flags, int prec, int type)
11811{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011812 char *buf;
11813 int len;
11814 PyObject *str; /* temporary string object. */
11815 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011816
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11818 if (!str)
11819 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011821 Py_DECREF(str);
11822 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011823}
11824
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011827 size_t buflen,
11828 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011830 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011831 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (PyUnicode_GET_LENGTH(v) == 1) {
11833 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 buf[1] = '\0';
11835 return 1;
11836 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 goto onError;
11838 }
11839 else {
11840 /* Integer input truncated to a character */
11841 long x;
11842 x = PyLong_AsLong(v);
11843 if (x == -1 && PyErr_Occurred())
11844 goto onError;
11845
11846 if (x < 0 || x > 0x10ffff) {
11847 PyErr_SetString(PyExc_OverflowError,
11848 "%c arg not in range(0x110000)");
11849 return -1;
11850 }
11851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011853 buf[1] = '\0';
11854 return 1;
11855 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011856
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011858 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011860 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861}
11862
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length, in Py_UCS4 elements, of the small scratch
   buffer that PyUnicode_Format() uses for the '%%' and '%c' conversions
   (formatchar() stores one character plus a terminating 0 in it).
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011867
Alexander Belopolsky40018472011-02-26 01:02:56 +000011868PyObject *
11869PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 void *fmt;
11872 int fmtkind;
11873 PyObject *result;
11874 Py_UCS4 *res, *res0;
11875 Py_UCS4 max;
11876 int kind;
11877 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011881
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 PyErr_BadInternalCall();
11884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11887 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 fmt = PyUnicode_DATA(uformat);
11890 fmtkind = PyUnicode_KIND(uformat);
11891 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11892 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11896 if (res0 == NULL) {
11897 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
11901 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 arglen = PyTuple_Size(args);
11903 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
11905 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 arglen = -1;
11907 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011909 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011910 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
11913 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 if (--rescnt < 0) {
11916 rescnt = fmtcnt + 100;
11917 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11919 if (res0 == NULL){
11920 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 }
11923 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011927 }
11928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 /* Got a format specifier */
11930 int flags = 0;
11931 Py_ssize_t width = -1;
11932 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 Py_UCS4 c = '\0';
11934 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 int isnumok;
11936 PyObject *v = NULL;
11937 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 void *pbuf;
11939 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 Py_ssize_t len, len1;
11942 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 fmtpos++;
11945 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11946 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 Py_ssize_t keylen;
11948 PyObject *key;
11949 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011950
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 if (dict == NULL) {
11952 PyErr_SetString(PyExc_TypeError,
11953 "format requires a mapping");
11954 goto onError;
11955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 /* Skip over balanced parentheses */
11960 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 if (fmtcnt < 0 || pcount > 0) {
11969 PyErr_SetString(PyExc_ValueError,
11970 "incomplete format key");
11971 goto onError;
11972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 if (key == NULL)
11975 goto onError;
11976 if (args_owned) {
11977 Py_DECREF(args);
11978 args_owned = 0;
11979 }
11980 args = PyObject_GetItem(dict, key);
11981 Py_DECREF(key);
11982 if (args == NULL) {
11983 goto onError;
11984 }
11985 args_owned = 1;
11986 arglen = -1;
11987 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011988 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 case '-': flags |= F_LJUST; continue;
11992 case '+': flags |= F_SIGN; continue;
11993 case ' ': flags |= F_BLANK; continue;
11994 case '#': flags |= F_ALT; continue;
11995 case '0': flags |= F_ZERO; continue;
11996 }
11997 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 if (c == '*') {
12000 v = getnextarg(args, arglen, &argidx);
12001 if (v == NULL)
12002 goto onError;
12003 if (!PyLong_Check(v)) {
12004 PyErr_SetString(PyExc_TypeError,
12005 "* wants int");
12006 goto onError;
12007 }
12008 width = PyLong_AsLong(v);
12009 if (width == -1 && PyErr_Occurred())
12010 goto onError;
12011 if (width < 0) {
12012 flags |= F_LJUST;
12013 width = -width;
12014 }
12015 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 }
12018 else if (c >= '0' && c <= '9') {
12019 width = c - '0';
12020 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 if (c < '0' || c > '9')
12023 break;
12024 if ((width*10) / 10 != width) {
12025 PyErr_SetString(PyExc_ValueError,
12026 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012027 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 }
12029 width = width*10 + (c - '0');
12030 }
12031 }
12032 if (c == '.') {
12033 prec = 0;
12034 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 if (c == '*') {
12037 v = getnextarg(args, arglen, &argidx);
12038 if (v == NULL)
12039 goto onError;
12040 if (!PyLong_Check(v)) {
12041 PyErr_SetString(PyExc_TypeError,
12042 "* wants int");
12043 goto onError;
12044 }
12045 prec = PyLong_AsLong(v);
12046 if (prec == -1 && PyErr_Occurred())
12047 goto onError;
12048 if (prec < 0)
12049 prec = 0;
12050 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 }
12053 else if (c >= '0' && c <= '9') {
12054 prec = c - '0';
12055 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 if (c < '0' || c > '9')
12058 break;
12059 if ((prec*10) / 10 != prec) {
12060 PyErr_SetString(PyExc_ValueError,
12061 "prec too big");
12062 goto onError;
12063 }
12064 prec = prec*10 + (c - '0');
12065 }
12066 }
12067 } /* prec */
12068 if (fmtcnt >= 0) {
12069 if (c == 'h' || c == 'l' || c == 'L') {
12070 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 }
12073 }
12074 if (fmtcnt < 0) {
12075 PyErr_SetString(PyExc_ValueError,
12076 "incomplete format");
12077 goto onError;
12078 }
12079 if (c != '%') {
12080 v = getnextarg(args, arglen, &argidx);
12081 if (v == NULL)
12082 goto onError;
12083 }
12084 sign = 0;
12085 fill = ' ';
12086 switch (c) {
12087
12088 case '%':
12089 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 len = 1;
12094 break;
12095
12096 case 's':
12097 case 'r':
12098 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012099 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 temp = v;
12101 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 }
12103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 if (c == 's')
12105 temp = PyObject_Str(v);
12106 else if (c == 'r')
12107 temp = PyObject_Repr(v);
12108 else
12109 temp = PyObject_ASCII(v);
12110 if (temp == NULL)
12111 goto onError;
12112 if (PyUnicode_Check(temp))
12113 /* nothing to do */;
12114 else {
12115 Py_DECREF(temp);
12116 PyErr_SetString(PyExc_TypeError,
12117 "%s argument has non-string str()");
12118 goto onError;
12119 }
12120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (PyUnicode_READY(temp) == -1) {
12122 Py_CLEAR(temp);
12123 goto onError;
12124 }
12125 pbuf = PyUnicode_DATA(temp);
12126 kind = PyUnicode_KIND(temp);
12127 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 if (prec >= 0 && len > prec)
12129 len = prec;
12130 break;
12131
12132 case 'i':
12133 case 'd':
12134 case 'u':
12135 case 'o':
12136 case 'x':
12137 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 isnumok = 0;
12139 if (PyNumber_Check(v)) {
12140 PyObject *iobj=NULL;
12141
12142 if (PyLong_Check(v)) {
12143 iobj = v;
12144 Py_INCREF(iobj);
12145 }
12146 else {
12147 iobj = PyNumber_Long(v);
12148 }
12149 if (iobj!=NULL) {
12150 if (PyLong_Check(iobj)) {
12151 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012152 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 Py_DECREF(iobj);
12154 if (!temp)
12155 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (PyUnicode_READY(temp) == -1) {
12157 Py_CLEAR(temp);
12158 goto onError;
12159 }
12160 pbuf = PyUnicode_DATA(temp);
12161 kind = PyUnicode_KIND(temp);
12162 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 sign = 1;
12164 }
12165 else {
12166 Py_DECREF(iobj);
12167 }
12168 }
12169 }
12170 if (!isnumok) {
12171 PyErr_Format(PyExc_TypeError,
12172 "%%%c format: a number is required, "
12173 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12174 goto onError;
12175 }
12176 if (flags & F_ZERO)
12177 fill = '0';
12178 break;
12179
12180 case 'e':
12181 case 'E':
12182 case 'f':
12183 case 'F':
12184 case 'g':
12185 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012186 temp = formatfloat(v, flags, prec, c);
12187 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (PyUnicode_READY(temp) == -1) {
12190 Py_CLEAR(temp);
12191 goto onError;
12192 }
12193 pbuf = PyUnicode_DATA(temp);
12194 kind = PyUnicode_KIND(temp);
12195 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 sign = 1;
12197 if (flags & F_ZERO)
12198 fill = '0';
12199 break;
12200
12201 case 'c':
12202 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
12205 if (len < 0)
12206 goto onError;
12207 break;
12208
12209 default:
12210 PyErr_Format(PyExc_ValueError,
12211 "unsupported format character '%c' (0x%x) "
12212 "at index %zd",
12213 (31<=c && c<=126) ? (char)c : '?',
12214 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 goto onError;
12217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 /* pbuf is initialized here. */
12219 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12222 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12223 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 len--;
12225 }
12226 else if (flags & F_SIGN)
12227 sign = '+';
12228 else if (flags & F_BLANK)
12229 sign = ' ';
12230 else
12231 sign = 0;
12232 }
12233 if (width < len)
12234 width = len;
12235 if (rescnt - (sign != 0) < width) {
12236 reslen -= rescnt;
12237 rescnt = width + fmtcnt + 100;
12238 reslen += rescnt;
12239 if (reslen < 0) {
12240 Py_XDECREF(temp);
12241 PyErr_NoMemory();
12242 goto onError;
12243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12245 if (res0 == 0) {
12246 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 Py_XDECREF(temp);
12248 goto onError;
12249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 }
12252 if (sign) {
12253 if (fill != ' ')
12254 *res++ = sign;
12255 rescnt--;
12256 if (width > len)
12257 width--;
12258 }
12259 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12261 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12264 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 }
12266 rescnt -= 2;
12267 width -= 2;
12268 if (width < 0)
12269 width = 0;
12270 len -= 2;
12271 }
12272 if (width > len && !(flags & F_LJUST)) {
12273 do {
12274 --rescnt;
12275 *res++ = fill;
12276 } while (--width > len);
12277 }
12278 if (fill == ' ') {
12279 if (sign)
12280 *res++ = sign;
12281 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12283 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12284 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12285 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012286 }
12287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 /* Copy all characters, preserving len */
12289 len1 = len;
12290 while (len1--) {
12291 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12292 rescnt--;
12293 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 while (--width >= len) {
12295 --rescnt;
12296 *res++ = ' ';
12297 }
12298 if (dict && (argidx < arglen) && c != '%') {
12299 PyErr_SetString(PyExc_TypeError,
12300 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012301 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012302 goto onError;
12303 }
12304 Py_XDECREF(temp);
12305 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306 } /* until end */
12307 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 PyErr_SetString(PyExc_TypeError,
12309 "not all arguments converted during string formatting");
12310 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 }
12312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313
12314 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12315 if (*res > max)
12316 max = *res;
12317 result = PyUnicode_New(reslen - rescnt, max);
12318 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 kind = PyUnicode_KIND(result);
12321 for (res = res0; res < res0+reslen-rescnt; res++)
12322 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12323 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326 }
12327 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328 return (PyObject *)result;
12329
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 Py_DECREF(uformat);
12333 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 }
12336 return NULL;
12337}
12338
Jeremy Hylton938ace62002-07-17 16:30:39 +000012339static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012340unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12341
Tim Peters6d6c1a32001-08-02 04:15:00 +000012342static PyObject *
12343unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12344{
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012346 static char *kwlist[] = {"object", "encoding", "errors", 0};
12347 char *encoding = NULL;
12348 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012349
Benjamin Peterson14339b62009-01-31 16:36:08 +000012350 if (type != &PyUnicode_Type)
12351 return unicode_subtype_new(type, args, kwds);
12352 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012354 return NULL;
12355 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012357 if (encoding == NULL && errors == NULL)
12358 return PyObject_Str(x);
12359 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012361}
12362
Guido van Rossume023fe02001-08-30 03:12:59 +000012363static PyObject *
12364unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12365{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012366 PyUnicodeObject *tmp, *pnew;
12367 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012369
Benjamin Peterson14339b62009-01-31 16:36:08 +000012370 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12371 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12372 if (tmp == NULL)
12373 return NULL;
12374 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12376 // it seems kind of strange that tp_alloc gets passed the size
12377 // of the unicode string because there will follow another
12378 // malloc.
12379 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12380 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381 if (pnew == NULL) {
12382 Py_DECREF(tmp);
12383 return NULL;
12384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12386 if (_PyUnicode_WSTR(pnew) == NULL) {
12387 err = PyErr_NoMemory();
12388 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12391 _PyUnicode_WSTR_LENGTH(pnew) = n;
12392 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12393 _PyUnicode_STATE(pnew).interned = 0;
12394 _PyUnicode_STATE(pnew).kind = 0;
12395 _PyUnicode_STATE(pnew).compact = 0;
12396 _PyUnicode_STATE(pnew).ready = 0;
12397 _PyUnicode_STATE(pnew).ascii = 0;
12398 pnew->data.any = NULL;
12399 _PyUnicode_LENGTH(pnew) = 0;
12400 pnew->_base.utf8 = NULL;
12401 pnew->_base.utf8_length = 0;
12402
12403 if (PyUnicode_READY(pnew) == -1) {
12404 PyObject_FREE(_PyUnicode_WSTR(pnew));
12405 goto onError;
12406 }
12407
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 Py_DECREF(tmp);
12409 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410
12411 onError:
12412 _Py_ForgetReference((PyObject *)pnew);
12413 PyObject_Del(pnew);
12414 Py_DECREF(tmp);
12415 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012416}
12417
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012424
/* Forward declaration: unicode_iter is defined further down but is needed
   for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in "str" type.  The initializer is positional,
   so every entry must stay in slot order; the trailing comment on each
   line names the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12470
12471/* Initialize the Unicode implementation */
12472
Thomas Wouters78890102000-07-22 19:25:51 +000012473void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012475 int i;
12476
Thomas Wouters477c8d52006-05-27 19:21:47 +000012477 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479 0x000A, /* LINE FEED */
12480 0x000D, /* CARRIAGE RETURN */
12481 0x001C, /* FILE SEPARATOR */
12482 0x001D, /* GROUP SEPARATOR */
12483 0x001E, /* RECORD SEPARATOR */
12484 0x0085, /* NEXT LINE */
12485 0x2028, /* LINE SEPARATOR */
12486 0x2029, /* PARAGRAPH SEPARATOR */
12487 };
12488
Fred Drakee4315f52000-05-09 19:53:39 +000012489 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012491 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012493
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012494 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012496 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498
12499 /* initialize the linebreak bloom filter */
12500 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 PyUnicode_2BYTE_KIND, linebreak,
12502 sizeof(linebreak) / sizeof(linebreak[0]));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012503
12504 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505}
12506
12507/* Finalize the Unicode implementation */
12508
/* Report how many cached objects were released.  This implementation
   releases nothing and therefore always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12514
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515void
Thomas Wouters78890102000-07-22 19:25:51 +000012516_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012518 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012520 Py_XDECREF(unicode_empty);
12521 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012522
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012523 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 if (unicode_latin1[i]) {
12525 Py_DECREF(unicode_latin1[i]);
12526 unicode_latin1[i] = NULL;
12527 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012528 }
Christian Heimesa156e092008-02-16 07:38:31 +000012529 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012531
Walter Dörwald16807132007-05-25 13:52:07 +000012532void
12533PyUnicode_InternInPlace(PyObject **p)
12534{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12536 PyObject *t;
12537 if (s == NULL || !PyUnicode_Check(s))
12538 Py_FatalError(
12539 "PyUnicode_InternInPlace: unicode strings only please!");
12540 /* If it's a subclass, we don't really know what putting
12541 it in the interned dict might do. */
12542 if (!PyUnicode_CheckExact(s))
12543 return;
12544 if (PyUnicode_CHECK_INTERNED(s))
12545 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 if (PyUnicode_READY(s) == -1) {
12547 assert(0 && "ready fail in intern...");
12548 return;
12549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 if (interned == NULL) {
12551 interned = PyDict_New();
12552 if (interned == NULL) {
12553 PyErr_Clear(); /* Don't leave an exception */
12554 return;
12555 }
12556 }
12557 /* It might be that the GetItem call fails even
12558 though the key is present in the dictionary,
12559 namely when this happens during a stack overflow. */
12560 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012563
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 if (t) {
12565 Py_INCREF(t);
12566 Py_DECREF(*p);
12567 *p = t;
12568 return;
12569 }
Walter Dörwald16807132007-05-25 13:52:07 +000012570
Benjamin Peterson14339b62009-01-31 16:36:08 +000012571 PyThreadState_GET()->recursion_critical = 1;
12572 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12573 PyErr_Clear();
12574 PyThreadState_GET()->recursion_critical = 0;
12575 return;
12576 }
12577 PyThreadState_GET()->recursion_critical = 0;
12578 /* The two references in interned are not counted by refcnt.
12579 The deallocator will take care of this */
12580 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012582}
12583
12584void
12585PyUnicode_InternImmortal(PyObject **p)
12586{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12588
Benjamin Peterson14339b62009-01-31 16:36:08 +000012589 PyUnicode_InternInPlace(p);
12590 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 Py_INCREF(*p);
12593 }
Walter Dörwald16807132007-05-25 13:52:07 +000012594}
12595
12596PyObject *
12597PyUnicode_InternFromString(const char *cp)
12598{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 PyObject *s = PyUnicode_FromString(cp);
12600 if (s == NULL)
12601 return NULL;
12602 PyUnicode_InternInPlace(&s);
12603 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012604}
12605
Alexander Belopolsky40018472011-02-26 01:02:56 +000012606void
12607_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012608{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012609 PyObject *keys;
12610 PyUnicodeObject *s;
12611 Py_ssize_t i, n;
12612 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012613
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 if (interned == NULL || !PyDict_Check(interned))
12615 return;
12616 keys = PyDict_Keys(interned);
12617 if (keys == NULL || !PyList_Check(keys)) {
12618 PyErr_Clear();
12619 return;
12620 }
Walter Dörwald16807132007-05-25 13:52:07 +000012621
Benjamin Peterson14339b62009-01-31 16:36:08 +000012622 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12623 detector, interned unicode strings are not forcibly deallocated;
12624 rather, we give them their stolen references back, and then clear
12625 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012626
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 n = PyList_GET_SIZE(keys);
12628 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 for (i = 0; i < n; i++) {
12631 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 if (PyUnicode_READY(s) == -1)
12633 fprintf(stderr, "could not ready string\n");
12634 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012635 case SSTATE_NOT_INTERNED:
12636 /* XXX Shouldn't happen */
12637 break;
12638 case SSTATE_INTERNED_IMMORTAL:
12639 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012641 break;
12642 case SSTATE_INTERNED_MORTAL:
12643 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 break;
12646 default:
12647 Py_FatalError("Inconsistent interned string state.");
12648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 }
12651 fprintf(stderr, "total size of all interned strings: "
12652 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12653 "mortal/immortal\n", mortal_size, immortal_size);
12654 Py_DECREF(keys);
12655 PyDict_Clear(interned);
12656 Py_DECREF(interned);
12657 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012658}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012659
12660
12661/********************* Unicode Iterator **************************/
12662
/* Instance layout of the str iterator (see PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
12668
12669static void
12670unicodeiter_dealloc(unicodeiterobject *it)
12671{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012672 _PyObject_GC_UNTRACK(it);
12673 Py_XDECREF(it->it_seq);
12674 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012675}
12676
12677static int
12678unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 Py_VISIT(it->it_seq);
12681 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012682}
12683
12684static PyObject *
12685unicodeiter_next(unicodeiterobject *it)
12686{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012687 PyUnicodeObject *seq;
12688 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012689
Benjamin Peterson14339b62009-01-31 16:36:08 +000012690 assert(it != NULL);
12691 seq = it->it_seq;
12692 if (seq == NULL)
12693 return NULL;
12694 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12697 int kind = PyUnicode_KIND(seq);
12698 void *data = PyUnicode_DATA(seq);
12699 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12700 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 if (item != NULL)
12702 ++it->it_index;
12703 return item;
12704 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012705
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 Py_DECREF(seq);
12707 it->it_seq = NULL;
12708 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012709}
12710
12711static PyObject *
12712unicodeiter_len(unicodeiterobject *it)
12713{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 Py_ssize_t len = 0;
12715 if (it->it_seq)
12716 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12717 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012718}
12719
/* Docstring shared by the __length_hint__ entry below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator (tp_methods of PyUnicodeIter_Type). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
12727
/* Type object of the iterator returned by unicode_iter().  The initializer
   is positional; the trailing comment on each line names the slot it
   fills.  Slots after the last listed one are left zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
12760
12761static PyObject *
12762unicode_iter(PyObject *seq)
12763{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012765
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 if (!PyUnicode_Check(seq)) {
12767 PyErr_BadInternalCall();
12768 return NULL;
12769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 if (PyUnicode_READY(seq) == -1)
12771 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12773 if (it == NULL)
12774 return NULL;
12775 it->it_index = 0;
12776 Py_INCREF(seq);
12777 it->it_seq = (PyUnicodeObject *)seq;
12778 _PyObject_GC_TRACK(it);
12779 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012780}
12781
/* Include uniops.h twice: first with UNIOP()/UNIOP_t bound to the
   Py_UNICODE_ name prefix and the Py_UNICODE type, then with the Py_UCS4_
   prefix and the Py_UCS4 type.  The macros are undefined after each
   inclusion so the bindings do not leak.
   NOTE(review): presumably uniops.h is a template that emits one set of
   helpers per binding -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012792
Victor Stinner71133ff2010-09-01 23:43:53 +000012793Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012794PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012795{
12796 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12797 Py_UNICODE *copy;
12798 Py_ssize_t size;
12799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 if (!PyUnicode_Check(unicode)) {
12801 PyErr_BadArgument();
12802 return NULL;
12803 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012804 /* Ensure we won't overflow the size. */
12805 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12806 PyErr_NoMemory();
12807 return NULL;
12808 }
12809 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12810 size *= sizeof(Py_UNICODE);
12811 copy = PyMem_Malloc(size);
12812 if (copy == NULL) {
12813 PyErr_NoMemory();
12814 return NULL;
12815 }
12816 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12817 return copy;
12818}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012819
Georg Brandl66c221e2010-10-14 07:04:07 +000012820/* A _string module, to export formatter_parser and formatter_field_name_split
12821 to the string.Formatter class implemented in Python. */
12822
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
12830
/* Module definition passed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* NOTE(review): presumably m_size (per-module state size) -- confirm */
    _string_methods,                    /* method table */
    NULL,                               /* remaining optional slots are unused */
    NULL,
    NULL,
    NULL
};
12842
12843PyMODINIT_FUNC
12844PyInit__string(void)
12845{
12846 return PyModule_Create(&_string_module);
12847}
12848
12849
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012850#ifdef __cplusplus
12851}
12852#endif