blob: 06e7a916be6a17cb8dfb3057494ce7b0179d3032 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Copy-convert a run of characters from one fixed-width type to another.

   from_type, to_type: element type names of the source and destination.
   begin, end:         pointers of type "from_type *" delimiting the source
                       range [begin, end).
   to:                 destination buffer; it is cast to "to_type *" and
                       receives one element per source element.

   Each element is converted with a plain C cast; no range checking is
   performed.  The `end` expression is evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        const from_type *src_ = (begin);                                \
        to_type *dst_ = (to_type *)(to);                                \
        while (src_ < (end)) {                                          \
            *dst_++ = (to_type)*src_++;                                 \
        }                                                               \
    } while (0)
106
/* Pointer to the UTF-8 data of `op`.  For compact ASCII objects this is the
   memory immediately following the PyASCIIObject header; for every other
   object it is the `utf8` field of the PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)
/* Length that goes with _PyUnicode_UTF8(): the `length` field for compact
   ASCII objects, the `utf8_length` field otherwise. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)
/* Raw field accessors.  They expand to lvalues (this file assigns through
   them) and perform no type checking of `op`. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Checked accessors: same fields as above, but assert that `op` is a
   Unicode object first (the assert is only active when assertions are
   enabled). */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130
Walter Dörwald16807132007-05-25 13:52:07 +0000131/* This dictionary holds all interned unicode strings. Note that references
132 to strings in this dictionary are *not* counted in the string's ob_refcnt.
133 When the interned string reaches a refcnt of 0 the string deallocation
134 function will delete the reference from this dictionary.
135
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000138*/
/* Dictionary of interned strings; its references are not counted in the
   strings' ob_refcnt (see the comment preceding this declaration). */
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character's code point
   (0..255). */
static PyUnicodeObject *unicode_latin1[256];
147
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit code point (0x00..0x7F); an entry is 1
   when the character is whitespace and 0 otherwise.  The entries set are:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR,
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
178
/* Forward declarations of the encoder error-handling helpers.  Their
   definitions are not part of this section of the file; see them for the
   full contract. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000191
/* Same for linebreaks.

   Read-only lookup table indexed by a 7-bit code point (0x00..0x7F); an
   entry is 1 when the character is a line break and 0 otherwise.  The
   entries set are:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR

   The table is never written to, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300220/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
221 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000222Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000223PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000224{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000225#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000226 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000227#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000228 /* This is actually an illegal character, so it should
229 not be passed to unichr. */
230 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000231#endif
232}
233
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234/* --- Bloom Filters ----------------------------------------------------- */
235
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
239
240/* the linebreak mask is set up by Unicode_Init below */
241
Antoine Pitrouf068f942010-01-13 14:19:12 +0000242#if LONG_BIT >= 128
243#define BLOOM_WIDTH 128
244#elif LONG_BIT >= 64
245#define BLOOM_WIDTH 64
246#elif LONG_BIT >= 32
247#define BLOOM_WIDTH 32
248#else
249#error "LONG_BIT is smaller than 32"
250#endif
251
Thomas Wouters477c8d52006-05-27 19:21:47 +0000252#define BLOOM_MASK unsigned long
253
254static BLOOM_MASK bloom_linebreak;
255
Antoine Pitrouf068f942010-01-13 14:19:12 +0000256#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
257#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000258
Benjamin Peterson29060642009-01-31 22:14:21 +0000259#define BLOOM_LINEBREAK(ch) \
260 ((ch) < 128U ? ascii_linebreak[(ch)] : \
261 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000262
Alexander Belopolsky40018472011-02-26 01:02:56 +0000263Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200264make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000265{
266 /* calculate simple bloom-style bitmask for a given unicode string */
267
Antoine Pitrouf068f942010-01-13 14:19:12 +0000268 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269 Py_ssize_t i;
270
271 mask = 0;
272 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200273 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000274
275 return mask;
276}
277
/* True if character `chr` occurs in the Unicode object `str`.  The bloom
   `mask` is tested first, so PyUnicode_FindChar() is only called when the
   bit for `chr` is set.  `chr` and `str` are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282/* --- Unicode Object ----------------------------------------------------- */
283
/* Forward declaration; the definition is not part of this section of the
   file.  `fixfct` is a callback that receives a Unicode object and returns
   a Py_UCS4. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
286
287Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
288 Py_ssize_t size, Py_UCS4 ch,
289 int direction)
290{
291 /* like wcschr, but doesn't stop at NULL characters */
292 Py_ssize_t i;
293 if (direction == 1) {
294 for(i = 0; i < size; i++)
295 if (PyUnicode_READ(kind, s, i) == ch)
296 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
297 }
298 else {
299 for(i = size-1; i >= 0; i--)
300 if (PyUnicode_READ(kind, s, i) == ch)
301 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
302 }
303 return NULL;
304}
305
Alexander Belopolsky40018472011-02-26 01:02:56 +0000306static int
307unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200308 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309{
310 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 /* Resizing is only supported for old unicode objects. */
313 assert(!PyUnicode_IS_COMPACT(unicode));
314 assert(_PyUnicode_WSTR(unicode) != NULL);
315
316 /* ... and only if they have not been readied yet, because
317 callees usually rely on the wstr representation when resizing. */
318 assert(unicode->data.any == NULL);
319
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000320 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200321 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000322 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000324 /* Resizing shared object (unicode_empty or single character
325 objects) in-place is not allowed. Use PyUnicode_Resize()
326 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000327
Benjamin Peterson14339b62009-01-31 16:36:08 +0000328 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200329 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
330 _PyUnicode_WSTR(unicode)[0] < 256U &&
331 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000333 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 return -1;
335 }
336
Thomas Wouters477c8d52006-05-27 19:21:47 +0000337 /* We allocate one more byte to make sure the string is Ux0000 terminated.
338 The overallocation is also used by fastsearch, which assumes that it's
339 safe to look at str[length] (without making any assumptions about what
340 it contains). */
341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200342 oldstr = _PyUnicode_WSTR(unicode);
343 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
344 sizeof(Py_UNICODE) * (length + 1));
345 if (!_PyUnicode_WSTR(unicode)) {
346 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347 PyErr_NoMemory();
348 return -1;
349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200350 _PyUnicode_WSTR(unicode)[length] = 0;
351 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 if (unicode->data.any != NULL) {
355 PyObject_FREE(unicode->data.any);
356 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
357 PyObject_FREE(unicode->_base.utf8);
358 }
359 unicode->_base.utf8 = NULL;
360 unicode->_base.utf8_length = 0;
361 unicode->data.any = NULL;
362 _PyUnicode_LENGTH(unicode) = 0;
363 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
364 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200366 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000367
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 return 0;
369}
370
/* We allocate one more character (not just one more byte) than requested
   to make sure the string is Ux0000 terminated; some code (e.g.
   new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
379
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New(), the legacy
   (wstr-based) allocator. */
int unicode_old_new_calls = 0;
#endif
383
Alexander Belopolsky40018472011-02-26 01:02:56 +0000384static PyUnicodeObject *
385_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386{
387 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200388 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389
Thomas Wouters477c8d52006-05-27 19:21:47 +0000390 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391 if (length == 0 && unicode_empty != NULL) {
392 Py_INCREF(unicode_empty);
393 return unicode_empty;
394 }
395
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000396 /* Ensure we won't overflow the size. */
397 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
398 return (PyUnicodeObject *)PyErr_NoMemory();
399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200400 if (length < 0) {
401 PyErr_SetString(PyExc_SystemError,
402 "Negative size passed to _PyUnicode_New");
403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404 }
405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200406#ifdef Py_DEBUG
407 ++unicode_old_new_calls;
408#endif
409
410 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
411 if (unicode == NULL)
412 return NULL;
413 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
414 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
415 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 PyErr_NoMemory();
417 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419
Jeremy Hyltond8082792003-09-16 19:41:39 +0000420 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000421 * the caller fails before initializing str -- unicode_resize()
422 * reads str[0], and the Keep-Alive optimization can keep memory
423 * allocated for str alive across a call to unicode_dealloc(unicode).
424 * We don't want unicode_resize to read uninitialized memory in
425 * that case.
426 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200427 _PyUnicode_WSTR(unicode)[0] = 0;
428 _PyUnicode_WSTR(unicode)[length] = 0;
429 _PyUnicode_WSTR_LENGTH(unicode) = length;
430 _PyUnicode_HASH(unicode) = -1;
431 _PyUnicode_STATE(unicode).interned = 0;
432 _PyUnicode_STATE(unicode).kind = 0;
433 _PyUnicode_STATE(unicode).compact = 0;
434 _PyUnicode_STATE(unicode).ready = 0;
435 _PyUnicode_STATE(unicode).ascii = 0;
436 unicode->data.any = NULL;
437 _PyUnicode_LENGTH(unicode) = 0;
438 unicode->_base.utf8 = NULL;
439 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000441
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000443 /* XXX UNREF/NEWREF interface should be more symmetrical */
444 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000446 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000448}
449
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New(), the compact
   allocator. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print the object's address, its compact / compact-ASCII flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
471
472PyObject *
473PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
474{
475 PyObject *obj;
476 PyCompactUnicodeObject *unicode;
477 void *data;
478 int kind_state;
479 int is_sharing = 0, is_ascii = 0;
480 Py_ssize_t char_size;
481 Py_ssize_t struct_size;
482
483 /* Optimization for empty strings */
484 if (size == 0 && unicode_empty != NULL) {
485 Py_INCREF(unicode_empty);
486 return (PyObject *)unicode_empty;
487 }
488
489#ifdef Py_DEBUG
490 ++unicode_new_new_calls;
491#endif
492
493 struct_size = sizeof(PyCompactUnicodeObject);
494 if (maxchar < 128) {
495 kind_state = PyUnicode_1BYTE_KIND;
496 char_size = 1;
497 is_ascii = 1;
498 struct_size = sizeof(PyASCIIObject);
499 }
500 else if (maxchar < 256) {
501 kind_state = PyUnicode_1BYTE_KIND;
502 char_size = 1;
503 }
504 else if (maxchar < 65536) {
505 kind_state = PyUnicode_2BYTE_KIND;
506 char_size = 2;
507 if (sizeof(wchar_t) == 2)
508 is_sharing = 1;
509 }
510 else {
511 kind_state = PyUnicode_4BYTE_KIND;
512 char_size = 4;
513 if (sizeof(wchar_t) == 4)
514 is_sharing = 1;
515 }
516
517 /* Ensure we won't overflow the size. */
518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
520 "Negative size passed to PyUnicode_New");
521 return NULL;
522 }
523 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
524 return PyErr_NoMemory();
525
526 /* Duplicated allocation code from _PyObject_New() instead of a call to
527 * PyObject_New() so we are able to allocate space for the object and
528 * it's data buffer.
529 */
530 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
531 if (obj == NULL)
532 return PyErr_NoMemory();
533 obj = PyObject_INIT(obj, &PyUnicode_Type);
534 if (obj == NULL)
535 return NULL;
536
537 unicode = (PyCompactUnicodeObject *)obj;
538 if (is_ascii)
539 data = ((PyASCIIObject*)obj) + 1;
540 else
541 data = unicode + 1;
542 _PyUnicode_LENGTH(unicode) = size;
543 _PyUnicode_HASH(unicode) = -1;
544 _PyUnicode_STATE(unicode).interned = 0;
545 _PyUnicode_STATE(unicode).kind = kind_state;
546 _PyUnicode_STATE(unicode).compact = 1;
547 _PyUnicode_STATE(unicode).ready = 1;
548 _PyUnicode_STATE(unicode).ascii = is_ascii;
549 if (is_ascii) {
550 ((char*)data)[size] = 0;
551 _PyUnicode_WSTR(unicode) = NULL;
552 }
553 else if (kind_state == PyUnicode_1BYTE_KIND) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 _PyUnicode_WSTR_LENGTH(unicode) = 0;
557 unicode->utf8_length = 0;
558 unicode->utf8 = NULL;
559 }
560 else {
561 unicode->utf8 = NULL;
562 if (kind_state == PyUnicode_2BYTE_KIND)
563 ((Py_UCS2*)data)[size] = 0;
564 else /* kind_state == PyUnicode_4BYTE_KIND */
565 ((Py_UCS4*)data)[size] = 0;
566 if (is_sharing) {
567 _PyUnicode_WSTR_LENGTH(unicode) = size;
568 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
569 }
570 else {
571 _PyUnicode_WSTR_LENGTH(unicode) = 0;
572 _PyUnicode_WSTR(unicode) = NULL;
573 }
574 }
575 return obj;
576}
577
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; every other
   unit, including a lone surrogate, is copied unchanged.  (The other
   conversions are implemented as macros for efficiency.)

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character.  It always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = src[0];
            src += 1;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
616
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200617Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200618PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
619 PyObject *from, Py_ssize_t from_start,
620 Py_ssize_t how_many)
621{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200622 unsigned int from_kind, to_kind;
623 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624
Victor Stinnerb1536152011-09-30 02:26:10 +0200625 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
626 PyErr_BadInternalCall();
627 return -1;
628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629
630 if (PyUnicode_READY(from))
631 return -1;
632 if (PyUnicode_READY(to))
633 return -1;
634
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200635 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200636 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
637 PyErr_Format(PyExc_ValueError,
638 "Cannot write %zi characters at %zi "
639 "in a string of %zi characters",
640 how_many, to_start, PyUnicode_GET_LENGTH(to));
641 return -1;
642 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200643 if (how_many == 0)
644 return 0;
645
646 if (Py_REFCNT(to) != 1) {
647 PyErr_SetString(PyExc_ValueError,
648 "Cannot modify a string having more than 1 reference");
649 return -1;
650 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200651 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200653 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200654 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200656 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657
658 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200659 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200660 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200661 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200662 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 + PyUnicode_KIND_SIZE(from_kind, from_start),
664 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200666 else if (from_kind == PyUnicode_1BYTE_KIND
667 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200668 {
669 _PyUnicode_CONVERT_BYTES(
670 Py_UCS1, Py_UCS2,
671 PyUnicode_1BYTE_DATA(from) + from_start,
672 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
673 PyUnicode_2BYTE_DATA(to) + to_start
674 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200675 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200676 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200677 && to_kind == PyUnicode_4BYTE_KIND)
678 {
679 _PyUnicode_CONVERT_BYTES(
680 Py_UCS1, Py_UCS4,
681 PyUnicode_1BYTE_DATA(from) + from_start,
682 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
683 PyUnicode_4BYTE_DATA(to) + to_start
684 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200685 }
686 else if (from_kind == PyUnicode_2BYTE_KIND
687 && to_kind == PyUnicode_4BYTE_KIND)
688 {
689 _PyUnicode_CONVERT_BYTES(
690 Py_UCS2, Py_UCS4,
691 PyUnicode_2BYTE_DATA(from) + from_start,
692 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
693 PyUnicode_4BYTE_DATA(to) + to_start
694 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200695 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200696 else {
697 int invalid_kinds;
698 if (from_kind > to_kind) {
699 /* slow path to check for character overflow */
700 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
701 Py_UCS4 ch, maxchar;
702 Py_ssize_t i;
703
704 maxchar = 0;
705 invalid_kinds = 0;
706 for (i=0; i < how_many; i++) {
707 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
708 if (ch > maxchar) {
709 maxchar = ch;
710 if (maxchar > to_maxchar) {
711 invalid_kinds = 1;
712 break;
713 }
714 }
715 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
716 }
717 }
718 else
719 invalid_kinds = 1;
720 if (invalid_kinds) {
721 PyErr_Format(PyExc_ValueError,
722 "Cannot copy UCS%u characters "
723 "into a string of UCS%u characters",
724 1 << (from_kind - 1),
725 1 << (to_kind -1));
726 return -1;
727 }
728 }
729 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730}
731
Victor Stinner17222162011-09-28 22:15:37 +0200732/* Find the maximum code point and count the number of surrogate pairs so a
733 correct string length can be computed before converting a string to UCS4.
734 This function counts single surrogates as a character and not as a pair.
735
736 Return 0 on success, or -1 on error. */
737static int
738find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
739 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740{
741 const wchar_t *iter;
742
743 if (num_surrogates == NULL || maxchar == NULL) {
744 PyErr_SetString(PyExc_SystemError,
745 "unexpected NULL arguments to "
746 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
747 return -1;
748 }
749
750 *num_surrogates = 0;
751 *maxchar = 0;
752
753 for (iter = begin; iter < end; ) {
754 if (*iter > *maxchar)
755 *maxchar = *iter;
756#if SIZEOF_WCHAR_T == 2
757 if (*iter >= 0xD800 && *iter <= 0xDBFF
758 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
759 {
760 Py_UCS4 surrogate_val;
761 surrogate_val = (((iter[0] & 0x3FF)<<10)
762 | (iter[1] & 0x3FF)) + 0x10000;
763 ++(*num_surrogates);
764 if (surrogate_val > *maxchar)
765 *maxchar = surrogate_val;
766 iter += 2;
767 }
768 else
769 iter++;
770#else
771 iter++;
772#endif
773 }
774 return 0;
775}
776
777#ifdef Py_DEBUG
778int unicode_ready_calls = 0;
779#endif
780
781int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200782_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200783{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200784 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200785 wchar_t *end;
786 Py_UCS4 maxchar = 0;
787 Py_ssize_t num_surrogates;
788#if SIZEOF_WCHAR_T == 2
789 Py_ssize_t length_wo_surrogates;
790#endif
791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200793 strings were created using _PyObject_New() and where no canonical
794 representation (the str field) has been set yet aka strings
795 which are not yet ready. */
796 assert(PyUnicode_Check(obj));
797 assert(!PyUnicode_IS_READY(obj));
798 assert(!PyUnicode_IS_COMPACT(obj));
799 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200801 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200803 /* Actually, it should neither be interned nor be anything else: */
804 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200805
806#ifdef Py_DEBUG
807 ++unicode_ready_calls;
808#endif
809
810 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200811 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200812 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814
815 if (maxchar < 256) {
816 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
817 if (!unicode->data.any) {
818 PyErr_NoMemory();
819 return -1;
820 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200821 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822 _PyUnicode_WSTR(unicode), end,
823 PyUnicode_1BYTE_DATA(unicode));
824 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
825 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
826 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
827 if (maxchar < 128) {
828 unicode->_base.utf8 = unicode->data.any;
829 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
830 }
831 else {
832 unicode->_base.utf8 = NULL;
833 unicode->_base.utf8_length = 0;
834 }
835 PyObject_FREE(_PyUnicode_WSTR(unicode));
836 _PyUnicode_WSTR(unicode) = NULL;
837 _PyUnicode_WSTR_LENGTH(unicode) = 0;
838 }
839 /* In this case we might have to convert down from 4-byte native
840 wchar_t to 2-byte unicode. */
841 else if (maxchar < 65536) {
842 assert(num_surrogates == 0 &&
843 "FindMaxCharAndNumSurrogatePairs() messed up");
844
Victor Stinner506f5922011-09-28 22:34:18 +0200845#if SIZEOF_WCHAR_T == 2
846 /* We can share representations and are done. */
847 unicode->data.any = _PyUnicode_WSTR(unicode);
848 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
849 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
850 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
851 unicode->_base.utf8 = NULL;
852 unicode->_base.utf8_length = 0;
853#else
854 /* sizeof(wchar_t) == 4 */
855 unicode->data.any = PyObject_MALLOC(
856 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
857 if (!unicode->data.any) {
858 PyErr_NoMemory();
859 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 }
Victor Stinner506f5922011-09-28 22:34:18 +0200861 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
862 _PyUnicode_WSTR(unicode), end,
863 PyUnicode_2BYTE_DATA(unicode));
864 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
865 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
866 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
867 unicode->_base.utf8 = NULL;
868 unicode->_base.utf8_length = 0;
869 PyObject_FREE(_PyUnicode_WSTR(unicode));
870 _PyUnicode_WSTR(unicode) = NULL;
871 _PyUnicode_WSTR_LENGTH(unicode) = 0;
872#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873 }
874 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
875 else {
876#if SIZEOF_WCHAR_T == 2
877 /* in case the native representation is 2-bytes, we need to allocate a
878 new normalized 4-byte version. */
879 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
880 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
881 if (!unicode->data.any) {
882 PyErr_NoMemory();
883 return -1;
884 }
885 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
886 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
887 unicode->_base.utf8 = NULL;
888 unicode->_base.utf8_length = 0;
889 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
890 unicode) < 0) {
891 assert(0 && "ConvertWideCharToUCS4 failed");
892 return -1;
893 }
894 PyObject_FREE(_PyUnicode_WSTR(unicode));
895 _PyUnicode_WSTR(unicode) = NULL;
896 _PyUnicode_WSTR_LENGTH(unicode) = 0;
897#else
898 assert(num_surrogates == 0);
899
900 unicode->data.any = _PyUnicode_WSTR(unicode);
901 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
902 unicode->_base.utf8 = NULL;
903 unicode->_base.utf8_length = 0;
904 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
905#endif
906 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
907 }
908 _PyUnicode_STATE(unicode).ready = 1;
909 return 0;
910}
911
Alexander Belopolsky40018472011-02-26 01:02:56 +0000912static void
913unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000914{
Walter Dörwald16807132007-05-25 13:52:07 +0000915 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000916 case SSTATE_NOT_INTERNED:
917 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000918
Benjamin Peterson29060642009-01-31 22:14:21 +0000919 case SSTATE_INTERNED_MORTAL:
920 /* revive dead object temporarily for DelItem */
921 Py_REFCNT(unicode) = 3;
922 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
923 Py_FatalError(
924 "deletion of interned string failed");
925 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000926
Benjamin Peterson29060642009-01-31 22:14:21 +0000927 case SSTATE_INTERNED_IMMORTAL:
928 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000929
Benjamin Peterson29060642009-01-31 22:14:21 +0000930 default:
931 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000932 }
933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934 if (_PyUnicode_WSTR(unicode) &&
935 (!PyUnicode_IS_READY(unicode) ||
936 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
937 PyObject_DEL(_PyUnicode_WSTR(unicode));
938 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
939 PyObject_DEL(unicode->_base.utf8);
940
941 if (PyUnicode_IS_COMPACT(unicode)) {
942 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000943 }
944 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945 if (unicode->data.any)
946 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000947 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948 }
949}
950
Alexander Belopolsky40018472011-02-26 01:02:56 +0000951static int
952_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000953{
954 register PyUnicodeObject *v;
955
956 /* Argument checks */
957 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000958 PyErr_BadInternalCall();
959 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000960 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000961 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200962 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
963 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000964 PyErr_BadInternalCall();
965 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000966 }
967
968 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 possible since these are being shared.
970 The same goes for new-representation unicode objects or objects which
971 have already been readied.
972 For these, we simply return a fresh copy with the same Unicode content.
973 */
974 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
975 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
976 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000977 PyUnicodeObject *w = _PyUnicode_New(length);
978 if (w == NULL)
979 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
981 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000982 Py_DECREF(*unicode);
983 *unicode = w;
984 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000985 }
986
987 /* Note that we don't have to modify *unicode for unshared Unicode
988 objects, since we can modify them in-place. */
989 return unicode_resize(v, length);
990}
991
Alexander Belopolsky40018472011-02-26 01:02:56 +0000992int
993PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000994{
995 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
996}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998static PyObject*
999get_latin1_char(unsigned char ch)
1000{
1001 PyUnicodeObject *unicode = unicode_latin1[ch];
1002 if (!unicode) {
1003 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1004 if (!unicode)
1005 return NULL;
1006 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1007 unicode_latin1[ch] = unicode;
1008 }
1009 Py_INCREF(unicode);
1010 return (PyObject *)unicode;
1011}
1012
Alexander Belopolsky40018472011-02-26 01:02:56 +00001013PyObject *
1014PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015{
1016 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 Py_UCS4 maxchar = 0;
1018 Py_ssize_t num_surrogates;
1019
1020 if (u == NULL)
1021 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001023 /* If the Unicode data is known at construction time, we can apply
1024 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001026 /* Optimization for empty strings */
1027 if (size == 0 && unicode_empty != NULL) {
1028 Py_INCREF(unicode_empty);
1029 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001030 }
Tim Petersced69f82003-09-16 20:30:58 +00001031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032 /* Single character Unicode objects in the Latin-1 range are
1033 shared when using this constructor */
1034 if (size == 1 && *u < 256)
1035 return get_latin1_char((unsigned char)*u);
1036
1037 /* If not empty and not single character, copy the Unicode data
1038 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001039 if (find_maxchar_surrogates(u, u + size,
1040 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 return NULL;
1042
1043 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1044 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 if (!unicode)
1046 return NULL;
1047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 switch (PyUnicode_KIND(unicode)) {
1049 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001050 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1052 break;
1053 case PyUnicode_2BYTE_KIND:
1054#if Py_UNICODE_SIZE == 2
1055 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1056#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001057 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1059#endif
1060 break;
1061 case PyUnicode_4BYTE_KIND:
1062#if SIZEOF_WCHAR_T == 2
1063 /* This is the only case which has to process surrogates, thus
1064 a simple copy loop is not enough and we need a function. */
1065 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1066 Py_DECREF(unicode);
1067 return NULL;
1068 }
1069#else
1070 assert(num_surrogates == 0);
1071 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1072#endif
1073 break;
1074 default:
1075 assert(0 && "Impossible state");
1076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077
1078 return (PyObject *)unicode;
1079}
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081PyObject *
1082PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001083{
1084 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001085
Benjamin Peterson14339b62009-01-31 16:36:08 +00001086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001088 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001089 return NULL;
1090 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001091
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001092 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001093 some optimizations which share commonly used objects.
1094 Also, this means the input must be UTF-8, so fall back to the
1095 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 if (u != NULL) {
1097
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 /* Optimization for empty strings */
1099 if (size == 0 && unicode_empty != NULL) {
1100 Py_INCREF(unicode_empty);
1101 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001102 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001103
1104 /* Single characters are shared when using this constructor.
1105 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 if (size == 1 && Py_CHARMASK(*u) < 128)
1107 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001108
1109 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001110 }
1111
Walter Dörwald55507312007-05-18 13:12:10 +00001112 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001113 if (!unicode)
1114 return NULL;
1115
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001116 return (PyObject *)unicode;
1117}
1118
Alexander Belopolsky40018472011-02-26 01:02:56 +00001119PyObject *
1120PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001121{
1122 size_t size = strlen(u);
1123 if (size > PY_SSIZE_T_MAX) {
1124 PyErr_SetString(PyExc_OverflowError, "input too long");
1125 return NULL;
1126 }
1127
1128 return PyUnicode_FromStringAndSize(u, size);
1129}
1130
Victor Stinnere57b1c02011-09-28 22:20:48 +02001131static PyObject*
1132_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 PyObject *res;
1135 unsigned char max = 127;
1136 Py_ssize_t i;
1137 for (i = 0; i < size; i++) {
1138 if (u[i] & 0x80) {
1139 max = 255;
1140 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001141 }
1142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 res = PyUnicode_New(size, max);
1144 if (!res)
1145 return NULL;
1146 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1147 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001148}
1149
Victor Stinnere57b1c02011-09-28 22:20:48 +02001150static PyObject*
1151_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152{
1153 PyObject *res;
1154 Py_UCS2 max = 0;
1155 Py_ssize_t i;
1156 for (i = 0; i < size; i++)
1157 if (u[i] > max)
1158 max = u[i];
1159 res = PyUnicode_New(size, max);
1160 if (!res)
1161 return NULL;
1162 if (max >= 256)
1163 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1164 else
1165 for (i = 0; i < size; i++)
1166 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1167 return res;
1168}
1169
Victor Stinnere57b1c02011-09-28 22:20:48 +02001170static PyObject*
1171_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001172{
1173 PyObject *res;
1174 Py_UCS4 max = 0;
1175 Py_ssize_t i;
1176 for (i = 0; i < size; i++)
1177 if (u[i] > max)
1178 max = u[i];
1179 res = PyUnicode_New(size, max);
1180 if (!res)
1181 return NULL;
1182 if (max >= 0x10000)
1183 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1184 else {
1185 int kind = PyUnicode_KIND(res);
1186 void *data = PyUnicode_DATA(res);
1187 for (i = 0; i < size; i++)
1188 PyUnicode_WRITE(kind, data, i, u[i]);
1189 }
1190 return res;
1191}
1192
1193PyObject*
1194PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1195{
1196 switch(kind) {
1197 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001198 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001200 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 }
1204 assert(0);
1205 return NULL;
1206}
1207
Victor Stinner034f6cf2011-09-30 02:26:44 +02001208PyObject*
1209PyUnicode_Copy(PyObject *unicode)
1210{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001211 Py_ssize_t size;
1212 PyObject *copy;
1213 void *data;
1214
Victor Stinner034f6cf2011-09-30 02:26:44 +02001215 if (!PyUnicode_Check(unicode)) {
1216 PyErr_BadInternalCall();
1217 return NULL;
1218 }
1219 if (PyUnicode_READY(unicode))
1220 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001221
1222 size = PyUnicode_GET_LENGTH(unicode);
1223 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1224 if (!copy)
1225 return NULL;
1226 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1227
1228 data = PyUnicode_DATA(unicode);
1229 switch (PyUnicode_KIND(unicode))
1230 {
1231 case PyUnicode_1BYTE_KIND:
1232 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1233 break;
1234 case PyUnicode_2BYTE_KIND:
1235 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1236 break;
1237 case PyUnicode_4BYTE_KIND:
1238 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1239 break;
1240 default:
1241 assert(0);
1242 break;
1243 }
1244 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001245}
1246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247
1248/* Widen Unicode objects to larger buffers.
1249 Return NULL if the string is too wide already. */
1250
1251void*
1252_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1253{
1254 Py_ssize_t i;
1255 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1256 void *d = PyUnicode_DATA(s);
1257 unsigned int skind = PyUnicode_KIND(s);
1258 if (PyUnicode_KIND(s) >= kind) {
1259 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1260 return NULL;
1261 }
1262 switch(kind) {
1263 case PyUnicode_2BYTE_KIND: {
1264 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1265 if (!result) {
1266 PyErr_NoMemory();
1267 return 0;
1268 }
1269 for (i = 0; i < len; i++)
1270 result[i] = ((Py_UCS1*)d)[i];
1271 return result;
1272 }
1273 case PyUnicode_4BYTE_KIND: {
1274 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1275 if (!result) {
1276 PyErr_NoMemory();
1277 return 0;
1278 }
1279 for (i = 0; i < len; i++)
1280 result[i] = PyUnicode_READ(skind, d, i);
1281 return result;
1282 }
1283 }
1284 Py_FatalError("invalid kind");
1285 return NULL;
1286}
1287
1288static Py_UCS4*
1289as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1290 int copy_null)
1291{
1292 int kind;
1293 void *data;
1294 Py_ssize_t len, targetlen;
1295 if (PyUnicode_READY(string) == -1)
1296 return NULL;
1297 kind = PyUnicode_KIND(string);
1298 data = PyUnicode_DATA(string);
1299 len = PyUnicode_GET_LENGTH(string);
1300 targetlen = len;
1301 if (copy_null)
1302 targetlen++;
1303 if (!target) {
1304 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1305 PyErr_NoMemory();
1306 return NULL;
1307 }
1308 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1309 if (!target) {
1310 PyErr_NoMemory();
1311 return NULL;
1312 }
1313 }
1314 else {
1315 if (targetsize < targetlen) {
1316 PyErr_Format(PyExc_SystemError,
1317 "string is longer than the buffer");
1318 if (copy_null && 0 < targetsize)
1319 target[0] = 0;
1320 return NULL;
1321 }
1322 }
1323 if (kind != PyUnicode_4BYTE_KIND) {
1324 Py_ssize_t i;
1325 for (i = 0; i < len; i++)
1326 target[i] = PyUnicode_READ(kind, data, i);
1327 }
1328 else
1329 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1330 if (copy_null)
1331 target[len] = 0;
1332 return target;
1333}
1334
1335Py_UCS4*
1336PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1337 int copy_null)
1338{
1339 if (target == NULL || targetsize < 1) {
1340 PyErr_BadInternalCall();
1341 return NULL;
1342 }
1343 return as_ucs4(string, target, targetsize, copy_null);
1344}
1345
1346Py_UCS4*
1347PyUnicode_AsUCS4Copy(PyObject *string)
1348{
1349 return as_ucs4(string, NULL, 0, 1);
1350}
1351
1352#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001353
Alexander Belopolsky40018472011-02-26 01:02:56 +00001354PyObject *
1355PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001358 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001360 PyErr_BadInternalCall();
1361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 }
1363
Martin v. Löwis790465f2008-04-05 20:41:37 +00001364 if (size == -1) {
1365 size = wcslen(w);
1366 }
1367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369}
1370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001372
Walter Dörwald346737f2007-05-31 10:44:43 +00001373static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001374makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1375 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001376{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001377 *fmt++ = '%';
1378 if (width) {
1379 if (zeropad)
1380 *fmt++ = '0';
1381 fmt += sprintf(fmt, "%d", width);
1382 }
1383 if (precision)
1384 fmt += sprintf(fmt, ".%d", precision);
1385 if (longflag)
1386 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001387 else if (longlongflag) {
1388 /* longlongflag should only ever be nonzero on machines with
1389 HAVE_LONG_LONG defined */
1390#ifdef HAVE_LONG_LONG
1391 char *f = PY_FORMAT_LONG_LONG;
1392 while (*f)
1393 *fmt++ = *f++;
1394#else
1395 /* we shouldn't ever get here */
1396 assert(0);
1397 *fmt++ = 'l';
1398#endif
1399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001400 else if (size_tflag) {
1401 char *f = PY_FORMAT_SIZE_T;
1402 while (*f)
1403 *fmt++ = *f++;
1404 }
1405 *fmt++ = c;
1406 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001407}
1408
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' of a format specification.  Parse the optional
   width, ".precision" and length modifier ("l", "ll" or "z", each only when
   followed by 'd', 'u' or 'i') and store the results through the non-NULL
   output pointers.

   Return a pointer to the conversion character, i.e. the first character
   that was not consumed. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Publish the results; every output pointer is optional. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1473
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1481
Walter Dörwaldd2034312007-05-18 16:29:38 +00001482PyObject *
1483PyUnicode_FromFormatV(const char *format, va_list vargs)
1484{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001485 va_list count;
1486 Py_ssize_t callcount = 0;
1487 PyObject **callresults = NULL;
1488 PyObject **callresult = NULL;
1489 Py_ssize_t n = 0;
1490 int width = 0;
1491 int precision = 0;
1492 int zeropad;
1493 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001495 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001496 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1498 Py_UCS4 argmaxchar;
1499 Py_ssize_t numbersize = 0;
1500 char *numberresults = NULL;
1501 char *numberresult = NULL;
1502 Py_ssize_t i;
1503 int kind;
1504 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001505
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001506 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001507 /* step 1: count the number of %S/%R/%A/%s format specifications
1508 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1509 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 * result in an array)
1511 * also esimate a upper bound for all the number formats in the string,
1512 * numbers will be formated in step 3 and be keept in a '\0'-separated
1513 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001514 for (f = format; *f; f++) {
1515 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001516 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1518 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1519 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1520 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001523#ifdef HAVE_LONG_LONG
1524 if (longlongflag) {
1525 if (width < MAX_LONG_LONG_CHARS)
1526 width = MAX_LONG_LONG_CHARS;
1527 }
1528 else
1529#endif
1530 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1531 including sign. Decimal takes the most space. This
1532 isn't enough for octal. If a width is specified we
1533 need more (which we allocate later). */
1534 if (width < MAX_LONG_CHARS)
1535 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536
1537 /* account for the size + '\0' to separate numbers
1538 inside of the numberresults buffer */
1539 numbersize += (width + 1);
1540 }
1541 }
1542 else if ((unsigned char)*f > 127) {
1543 PyErr_Format(PyExc_ValueError,
1544 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1545 "string, got a non-ASCII byte: 0x%02x",
1546 (unsigned char)*f);
1547 return NULL;
1548 }
1549 }
1550 /* step 2: allocate memory for the results of
1551 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1552 if (callcount) {
1553 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1554 if (!callresults) {
1555 PyErr_NoMemory();
1556 return NULL;
1557 }
1558 callresult = callresults;
1559 }
1560 /* step 2.5: allocate memory for the results of formating numbers */
1561 if (numbersize) {
1562 numberresults = PyObject_Malloc(numbersize);
1563 if (!numberresults) {
1564 PyErr_NoMemory();
1565 goto fail;
1566 }
1567 numberresult = numberresults;
1568 }
1569
1570 /* step 3: format numbers and figure out how large a buffer we need */
1571 for (f = format; *f; f++) {
1572 if (*f == '%') {
1573 const char* p;
1574 int longflag;
1575 int longlongflag;
1576 int size_tflag;
1577 int numprinted;
1578
1579 p = f;
1580 zeropad = (f[1] == '0');
1581 f = parse_format_flags(f, &width, &precision,
1582 &longflag, &longlongflag, &size_tflag);
1583 switch (*f) {
1584 case 'c':
1585 {
1586 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001587 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 n++;
1589 break;
1590 }
1591 case '%':
1592 n++;
1593 break;
1594 case 'i':
1595 case 'd':
1596 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1597 width, precision, *f);
1598 if (longflag)
1599 numprinted = sprintf(numberresult, fmt,
1600 va_arg(count, long));
1601#ifdef HAVE_LONG_LONG
1602 else if (longlongflag)
1603 numprinted = sprintf(numberresult, fmt,
1604 va_arg(count, PY_LONG_LONG));
1605#endif
1606 else if (size_tflag)
1607 numprinted = sprintf(numberresult, fmt,
1608 va_arg(count, Py_ssize_t));
1609 else
1610 numprinted = sprintf(numberresult, fmt,
1611 va_arg(count, int));
1612 n += numprinted;
1613 /* advance by +1 to skip over the '\0' */
1614 numberresult += (numprinted + 1);
1615 assert(*(numberresult - 1) == '\0');
1616 assert(*(numberresult - 2) != '\0');
1617 assert(numprinted >= 0);
1618 assert(numberresult <= numberresults + numbersize);
1619 break;
1620 case 'u':
1621 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1622 width, precision, 'u');
1623 if (longflag)
1624 numprinted = sprintf(numberresult, fmt,
1625 va_arg(count, unsigned long));
1626#ifdef HAVE_LONG_LONG
1627 else if (longlongflag)
1628 numprinted = sprintf(numberresult, fmt,
1629 va_arg(count, unsigned PY_LONG_LONG));
1630#endif
1631 else if (size_tflag)
1632 numprinted = sprintf(numberresult, fmt,
1633 va_arg(count, size_t));
1634 else
1635 numprinted = sprintf(numberresult, fmt,
1636 va_arg(count, unsigned int));
1637 n += numprinted;
1638 numberresult += (numprinted + 1);
1639 assert(*(numberresult - 1) == '\0');
1640 assert(*(numberresult - 2) != '\0');
1641 assert(numprinted >= 0);
1642 assert(numberresult <= numberresults + numbersize);
1643 break;
1644 case 'x':
1645 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1646 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1647 n += numprinted;
1648 numberresult += (numprinted + 1);
1649 assert(*(numberresult - 1) == '\0');
1650 assert(*(numberresult - 2) != '\0');
1651 assert(numprinted >= 0);
1652 assert(numberresult <= numberresults + numbersize);
1653 break;
1654 case 'p':
1655 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1656 /* %p is ill-defined: ensure leading 0x. */
1657 if (numberresult[1] == 'X')
1658 numberresult[1] = 'x';
1659 else if (numberresult[1] != 'x') {
1660 memmove(numberresult + 2, numberresult,
1661 strlen(numberresult) + 1);
1662 numberresult[0] = '0';
1663 numberresult[1] = 'x';
1664 numprinted += 2;
1665 }
1666 n += numprinted;
1667 numberresult += (numprinted + 1);
1668 assert(*(numberresult - 1) == '\0');
1669 assert(*(numberresult - 2) != '\0');
1670 assert(numprinted >= 0);
1671 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001672 break;
1673 case 's':
1674 {
1675 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001676 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001677 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1678 if (!str)
1679 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 /* since PyUnicode_DecodeUTF8 returns already flexible
1681 unicode objects, there is no need to call ready on them */
1682 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001683 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001685 /* Remember the str and switch to the next slot */
1686 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001687 break;
1688 }
1689 case 'U':
1690 {
1691 PyObject *obj = va_arg(count, PyObject *);
1692 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 if (PyUnicode_READY(obj) == -1)
1694 goto fail;
1695 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001696 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001698 break;
1699 }
1700 case 'V':
1701 {
1702 PyObject *obj = va_arg(count, PyObject *);
1703 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001704 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001705 assert(obj || str);
1706 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001707 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 if (PyUnicode_READY(obj) == -1)
1709 goto fail;
1710 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001711 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001713 *callresult++ = NULL;
1714 }
1715 else {
1716 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1717 if (!str_obj)
1718 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001720 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001722 *callresult++ = str_obj;
1723 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 break;
1725 }
1726 case 'S':
1727 {
1728 PyObject *obj = va_arg(count, PyObject *);
1729 PyObject *str;
1730 assert(obj);
1731 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001733 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001735 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 /* Remember the str and switch to the next slot */
1738 *callresult++ = str;
1739 break;
1740 }
1741 case 'R':
1742 {
1743 PyObject *obj = va_arg(count, PyObject *);
1744 PyObject *repr;
1745 assert(obj);
1746 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001748 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001750 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 /* Remember the repr and switch to the next slot */
1753 *callresult++ = repr;
1754 break;
1755 }
1756 case 'A':
1757 {
1758 PyObject *obj = va_arg(count, PyObject *);
1759 PyObject *ascii;
1760 assert(obj);
1761 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001763 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001765 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 /* Remember the repr and switch to the next slot */
1768 *callresult++ = ascii;
1769 break;
1770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 default:
1772 /* if we stumble upon an unknown
1773 formatting code, copy the rest of
1774 the format string to the output
1775 string. (we cannot just skip the
1776 code, since there's no way to know
1777 what's in the argument list) */
1778 n += strlen(p);
1779 goto expand;
1780 }
1781 } else
1782 n++;
1783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001784 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001785 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 we don't have to resize the string.
1788 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 if (!string)
1791 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 kind = PyUnicode_KIND(string);
1793 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001794 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001799 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001800
1801 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1803 /* checking for == because the last argument could be a empty
1804 string, which causes i to point to end, the assert at the end of
1805 the loop */
1806 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001807
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 switch (*f) {
1809 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001810 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 const int ordinal = va_arg(vargs, int);
1812 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001814 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001815 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001816 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001818 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 case 'p':
1820 /* unused, since we already have the result */
1821 if (*f == 'p')
1822 (void) va_arg(vargs, void *);
1823 else
1824 (void) va_arg(vargs, int);
1825 /* extract the result from numberresults and append. */
1826 for (; *numberresult; ++i, ++numberresult)
1827 PyUnicode_WRITE(kind, data, i, *numberresult);
1828 /* skip over the separating '\0' */
1829 assert(*numberresult == '\0');
1830 numberresult++;
1831 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 break;
1833 case 's':
1834 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001835 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001837 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 size = PyUnicode_GET_LENGTH(*callresult);
1839 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001840 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1841 *callresult, 0,
1842 size) < 0)
1843 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001845 /* We're done with the unicode()/repr() => forget it */
1846 Py_DECREF(*callresult);
1847 /* switch to next unicode()/repr() result */
1848 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001849 break;
1850 }
1851 case 'U':
1852 {
1853 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 Py_ssize_t size;
1855 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1856 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001857 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1858 obj, 0,
1859 size) < 0)
1860 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001862 break;
1863 }
1864 case 'V':
1865 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001867 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001868 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001869 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 size = PyUnicode_GET_LENGTH(obj);
1871 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001872 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1873 obj, 0,
1874 size) < 0)
1875 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 size = PyUnicode_GET_LENGTH(*callresult);
1879 assert(PyUnicode_KIND(*callresult) <=
1880 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001881 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1882 *callresult,
1883 0, size) < 0)
1884 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001886 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001887 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001888 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001889 break;
1890 }
1891 case 'S':
1892 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001893 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001894 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 /* unused, since we already have the result */
1896 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001898 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1899 *callresult, 0,
1900 PyUnicode_GET_LENGTH(*callresult)) < 0)
1901 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001903 /* We're done with the unicode()/repr() => forget it */
1904 Py_DECREF(*callresult);
1905 /* switch to next unicode()/repr() result */
1906 ++callresult;
1907 break;
1908 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001909 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001911 break;
1912 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 for (; *p; ++p, ++i)
1914 PyUnicode_WRITE(kind, data, i, *p);
1915 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001916 goto end;
1917 }
Victor Stinner1205f272010-09-11 00:54:47 +00001918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 else {
1920 assert(i < PyUnicode_GET_LENGTH(string));
1921 PyUnicode_WRITE(kind, data, i++, *f);
1922 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001927 if (callresults)
1928 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 if (numberresults)
1930 PyObject_Free(numberresults);
1931 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001932 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001933 if (callresults) {
1934 PyObject **callresult2 = callresults;
1935 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001936 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001937 ++callresult2;
1938 }
1939 PyObject_Free(callresults);
1940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 if (numberresults)
1942 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001943 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001944}
1945
Walter Dörwaldd2034312007-05-18 16:29:38 +00001946PyObject *
1947PyUnicode_FromFormat(const char *format, ...)
1948{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001949 PyObject* ret;
1950 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001951
1952#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001954#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001955 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001956#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 ret = PyUnicode_FromFormatV(format, vargs);
1958 va_end(vargs);
1959 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001960}
1961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962#ifdef HAVE_WCHAR_H
1963
Victor Stinner5593d8a2010-10-02 11:11:27 +00001964/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1965 convert a Unicode object to a wide character string.
1966
Victor Stinnerd88d9832011-09-06 02:00:05 +02001967 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001968 character) required to convert the unicode object. Ignore size argument.
1969
Victor Stinnerd88d9832011-09-06 02:00:05 +02001970 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001971 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001972 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001973static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001974unicode_aswidechar(PyUnicodeObject *unicode,
1975 wchar_t *w,
1976 Py_ssize_t size)
1977{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001978 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 const wchar_t *wstr;
1980
1981 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1982 if (wstr == NULL)
1983 return -1;
1984
Victor Stinner5593d8a2010-10-02 11:11:27 +00001985 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001986 if (size > res)
1987 size = res + 1;
1988 else
1989 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001991 return res;
1992 }
1993 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001995}
1996
1997Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001998PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001999 wchar_t *w,
2000 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001{
2002 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 PyErr_BadInternalCall();
2004 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002006 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007}
2008
Victor Stinner137c34c2010-09-29 10:25:54 +00002009wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002010PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002011 Py_ssize_t *size)
2012{
2013 wchar_t* buffer;
2014 Py_ssize_t buflen;
2015
2016 if (unicode == NULL) {
2017 PyErr_BadInternalCall();
2018 return NULL;
2019 }
2020
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002021 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 if (buflen == -1)
2023 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002024 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002025 PyErr_NoMemory();
2026 return NULL;
2027 }
2028
Victor Stinner137c34c2010-09-29 10:25:54 +00002029 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2030 if (buffer == NULL) {
2031 PyErr_NoMemory();
2032 return NULL;
2033 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002034 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 if (buflen == -1)
2036 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002037 if (size != NULL)
2038 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002039 return buffer;
2040}
2041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043
Alexander Belopolsky40018472011-02-26 01:02:56 +00002044PyObject *
2045PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002048 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 PyErr_SetString(PyExc_ValueError,
2050 "chr() arg not in range(0x110000)");
2051 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002052 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 if (ordinal < 256)
2055 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 v = PyUnicode_New(1, ordinal);
2058 if (v == NULL)
2059 return NULL;
2060 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2061 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002062}
2063
Alexander Belopolsky40018472011-02-26 01:02:56 +00002064PyObject *
2065PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002067 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002068 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002069 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002070 if (PyUnicode_READY(obj))
2071 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002072 Py_INCREF(obj);
2073 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002074 }
2075 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 /* For a Unicode subtype that's not a Unicode object,
2077 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002078 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002079 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002080 PyErr_Format(PyExc_TypeError,
2081 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002082 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002083 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002088 const char *encoding,
2089 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002090{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002091 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002092 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002093
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002095 PyErr_BadInternalCall();
2096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002098
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002099 /* Decoding bytes objects is the most common case and should be fast */
2100 if (PyBytes_Check(obj)) {
2101 if (PyBytes_GET_SIZE(obj) == 0) {
2102 Py_INCREF(unicode_empty);
2103 v = (PyObject *) unicode_empty;
2104 }
2105 else {
2106 v = PyUnicode_Decode(
2107 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2108 encoding, errors);
2109 }
2110 return v;
2111 }
2112
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002113 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_SetString(PyExc_TypeError,
2115 "decoding str is not supported");
2116 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002118
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002119 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2120 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2121 PyErr_Format(PyExc_TypeError,
2122 "coercing to str: need bytes, bytearray "
2123 "or buffer-like object, %.80s found",
2124 Py_TYPE(obj)->tp_name);
2125 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002126 }
Tim Petersced69f82003-09-16 20:30:58 +00002127
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002128 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002129 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002130 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 }
Tim Petersced69f82003-09-16 20:30:58 +00002132 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002133 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002134
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002135 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002136 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137}
2138
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and every '_' becomes '-', so that spellings such as
   UTF_8 are recognized. `lower_len` is the capacity of `lower` in bytes.

   Returns 1 on success (`lower` is then null-terminated) and 0 when the
   encoding name needs more than lower_len-1 characters; in that case
   `lower` is left partially written and without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2171
Alexander Belopolsky40018472011-02-26 01:02:56 +00002172PyObject *
2173PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002174 Py_ssize_t size,
2175 const char *encoding,
2176 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002177{
2178 PyObject *buffer = NULL, *unicode;
2179 Py_buffer info;
2180 char lower[11]; /* Enough for any encoding shortcut */
2181
2182 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002183 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002184
2185 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002186 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002187 if ((strcmp(lower, "utf-8") == 0) ||
2188 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002189 return PyUnicode_DecodeUTF8(s, size, errors);
2190 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002191 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002192 (strcmp(lower, "iso-8859-1") == 0))
2193 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002194#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002195 else if (strcmp(lower, "mbcs") == 0)
2196 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002197#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002198 else if (strcmp(lower, "ascii") == 0)
2199 return PyUnicode_DecodeASCII(s, size, errors);
2200 else if (strcmp(lower, "utf-16") == 0)
2201 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2202 else if (strcmp(lower, "utf-32") == 0)
2203 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205
2206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002207 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 if (buffer == NULL)
2212 goto onError;
2213 unicode = PyCodec_Decode(buffer, encoding, errors);
2214 if (unicode == NULL)
2215 goto onError;
2216 if (!PyUnicode_Check(unicode)) {
2217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 Py_DECREF(unicode);
2221 goto onError;
2222 }
2223 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 if (PyUnicode_READY(unicode)) {
2225 Py_DECREF(unicode);
2226 return NULL;
2227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002229
Benjamin Peterson29060642009-01-31 22:14:21 +00002230 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 Py_XDECREF(buffer);
2232 return NULL;
2233}
2234
Alexander Belopolsky40018472011-02-26 01:02:56 +00002235PyObject *
2236PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002237 const char *encoding,
2238 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002239{
2240 PyObject *v;
2241
2242 if (!PyUnicode_Check(unicode)) {
2243 PyErr_BadArgument();
2244 goto onError;
2245 }
2246
2247 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002248 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002249
2250 /* Decode via the codec registry */
2251 v = PyCodec_Decode(unicode, encoding, errors);
2252 if (v == NULL)
2253 goto onError;
2254 return v;
2255
Benjamin Peterson29060642009-01-31 22:14:21 +00002256 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002257 return NULL;
2258}
2259
Alexander Belopolsky40018472011-02-26 01:02:56 +00002260PyObject *
2261PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002262 const char *encoding,
2263 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002264{
2265 PyObject *v;
2266
2267 if (!PyUnicode_Check(unicode)) {
2268 PyErr_BadArgument();
2269 goto onError;
2270 }
2271
2272 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002274
2275 /* Decode via the codec registry */
2276 v = PyCodec_Decode(unicode, encoding, errors);
2277 if (v == NULL)
2278 goto onError;
2279 if (!PyUnicode_Check(v)) {
2280 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002281 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002282 Py_TYPE(v)->tp_name);
2283 Py_DECREF(v);
2284 goto onError;
2285 }
2286 return v;
2287
Benjamin Peterson29060642009-01-31 22:14:21 +00002288 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002289 return NULL;
2290}
2291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
2293PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002294 Py_ssize_t size,
2295 const char *encoding,
2296 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
2298 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002299
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 unicode = PyUnicode_FromUnicode(s, size);
2301 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2304 Py_DECREF(unicode);
2305 return v;
2306}
2307
Alexander Belopolsky40018472011-02-26 01:02:56 +00002308PyObject *
2309PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002310 const char *encoding,
2311 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002312{
2313 PyObject *v;
2314
2315 if (!PyUnicode_Check(unicode)) {
2316 PyErr_BadArgument();
2317 goto onError;
2318 }
2319
2320 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002321 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002322
2323 /* Encode via the codec registry */
2324 v = PyCodec_Encode(unicode, encoding, errors);
2325 if (v == NULL)
2326 goto onError;
2327 return v;
2328
Benjamin Peterson29060642009-01-31 22:14:21 +00002329 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002330 return NULL;
2331}
2332
Victor Stinnerad158722010-10-27 00:25:46 +00002333PyObject *
2334PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002335{
Victor Stinner99b95382011-07-04 14:23:54 +02002336#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002337 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2338 PyUnicode_GET_SIZE(unicode),
2339 NULL);
2340#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002342#else
Victor Stinner793b5312011-04-27 00:24:21 +02002343 PyInterpreterState *interp = PyThreadState_GET()->interp;
2344 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2345 cannot use it to encode and decode filenames before it is loaded. Load
2346 the Python codec requires to encode at least its own filename. Use the C
2347 version of the locale codec until the codec registry is initialized and
2348 the Python codec is loaded.
2349
2350 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2351 cannot only rely on it: check also interp->fscodec_initialized for
2352 subinterpreters. */
2353 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002354 return PyUnicode_AsEncodedString(unicode,
2355 Py_FileSystemDefaultEncoding,
2356 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002357 }
2358 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002359 /* locale encoding with surrogateescape */
2360 wchar_t *wchar;
2361 char *bytes;
2362 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002363 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002364
2365 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2366 if (wchar == NULL)
2367 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002368 bytes = _Py_wchar2char(wchar, &error_pos);
2369 if (bytes == NULL) {
2370 if (error_pos != (size_t)-1) {
2371 char *errmsg = strerror(errno);
2372 PyObject *exc = NULL;
2373 if (errmsg == NULL)
2374 errmsg = "Py_wchar2char() failed";
2375 raise_encode_exception(&exc,
2376 "filesystemencoding",
2377 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2378 error_pos, error_pos+1,
2379 errmsg);
2380 Py_XDECREF(exc);
2381 }
2382 else
2383 PyErr_NoMemory();
2384 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002385 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002386 }
2387 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002388
2389 bytes_obj = PyBytes_FromString(bytes);
2390 PyMem_Free(bytes);
2391 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002392 }
Victor Stinnerad158722010-10-27 00:25:46 +00002393#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002394}
2395
Alexander Belopolsky40018472011-02-26 01:02:56 +00002396PyObject *
2397PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002398 const char *encoding,
2399 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400{
2401 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002402 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002403
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 if (!PyUnicode_Check(unicode)) {
2405 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 }
Fred Drakee4315f52000-05-09 19:53:39 +00002408
Victor Stinner2f283c22011-03-02 01:21:46 +00002409 if (encoding == NULL) {
2410 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002412 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002414 }
Fred Drakee4315f52000-05-09 19:53:39 +00002415
2416 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002417 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002418 if ((strcmp(lower, "utf-8") == 0) ||
2419 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002420 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002421 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002425 }
Victor Stinner37296e82010-06-10 13:36:23 +00002426 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002427 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002428 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002430#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002431 else if (strcmp(lower, "mbcs") == 0)
2432 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2433 PyUnicode_GET_SIZE(unicode),
2434 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002435#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002436 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439
2440 /* Encode via the codec registry */
2441 v = PyCodec_Encode(unicode, encoding, errors);
2442 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002443 return NULL;
2444
2445 /* The normal path */
2446 if (PyBytes_Check(v))
2447 return v;
2448
2449 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002450 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002451 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002452 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002453
2454 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2455 "encoder %s returned bytearray instead of bytes",
2456 encoding);
2457 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002458 Py_DECREF(v);
2459 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002460 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002461
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002462 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2463 Py_DECREF(v);
2464 return b;
2465 }
2466
2467 PyErr_Format(PyExc_TypeError,
2468 "encoder did not return a bytes object (type=%.400s)",
2469 Py_TYPE(v)->tp_name);
2470 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002471 return NULL;
2472}
2473
Alexander Belopolsky40018472011-02-26 01:02:56 +00002474PyObject *
2475PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002476 const char *encoding,
2477 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002478{
2479 PyObject *v;
2480
2481 if (!PyUnicode_Check(unicode)) {
2482 PyErr_BadArgument();
2483 goto onError;
2484 }
2485
2486 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002487 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002488
2489 /* Encode via the codec registry */
2490 v = PyCodec_Encode(unicode, encoding, errors);
2491 if (v == NULL)
2492 goto onError;
2493 if (!PyUnicode_Check(v)) {
2494 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002495 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002496 Py_TYPE(v)->tp_name);
2497 Py_DECREF(v);
2498 goto onError;
2499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002501
Benjamin Peterson29060642009-01-31 22:14:21 +00002502 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 return NULL;
2504}
2505
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002506PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002507PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002508 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002509 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2510}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002511
Christian Heimes5894ba72007-11-04 11:43:14 +00002512PyObject*
2513PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2514{
Victor Stinner99b95382011-07-04 14:23:54 +02002515#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002516 return PyUnicode_DecodeMBCS(s, size, NULL);
2517#elif defined(__APPLE__)
2518 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2519#else
Victor Stinner793b5312011-04-27 00:24:21 +02002520 PyInterpreterState *interp = PyThreadState_GET()->interp;
2521 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2522 cannot use it to encode and decode filenames before it is loaded. Load
2523 the Python codec requires to encode at least its own filename. Use the C
2524 version of the locale codec until the codec registry is initialized and
2525 the Python codec is loaded.
2526
2527 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2528 cannot only rely on it: check also interp->fscodec_initialized for
2529 subinterpreters. */
2530 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002531 return PyUnicode_Decode(s, size,
2532 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002533 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002534 }
2535 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002536 /* locale encoding with surrogateescape */
2537 wchar_t *wchar;
2538 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002539 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002540
2541 if (s[size] != '\0' || size != strlen(s)) {
2542 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2543 return NULL;
2544 }
2545
Victor Stinner168e1172010-10-16 23:16:16 +00002546 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002547 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002548 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002549
Victor Stinner168e1172010-10-16 23:16:16 +00002550 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002551 PyMem_Free(wchar);
2552 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002553 }
Victor Stinnerad158722010-10-27 00:25:46 +00002554#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002555}
2556
Martin v. Löwis011e8422009-05-05 04:43:17 +00002557
2558int
2559PyUnicode_FSConverter(PyObject* arg, void* addr)
2560{
2561 PyObject *output = NULL;
2562 Py_ssize_t size;
2563 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002564 if (arg == NULL) {
2565 Py_DECREF(*(PyObject**)addr);
2566 return 1;
2567 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002568 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002569 output = arg;
2570 Py_INCREF(output);
2571 }
2572 else {
2573 arg = PyUnicode_FromObject(arg);
2574 if (!arg)
2575 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002576 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002577 Py_DECREF(arg);
2578 if (!output)
2579 return 0;
2580 if (!PyBytes_Check(output)) {
2581 Py_DECREF(output);
2582 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2583 return 0;
2584 }
2585 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002586 size = PyBytes_GET_SIZE(output);
2587 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002588 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002589 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002590 Py_DECREF(output);
2591 return 0;
2592 }
2593 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002594 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002595}
2596
2597
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002598int
2599PyUnicode_FSDecoder(PyObject* arg, void* addr)
2600{
2601 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002602 if (arg == NULL) {
2603 Py_DECREF(*(PyObject**)addr);
2604 return 1;
2605 }
2606 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 if (PyUnicode_READY(arg))
2608 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002609 output = arg;
2610 Py_INCREF(output);
2611 }
2612 else {
2613 arg = PyBytes_FromObject(arg);
2614 if (!arg)
2615 return 0;
2616 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2617 PyBytes_GET_SIZE(arg));
2618 Py_DECREF(arg);
2619 if (!output)
2620 return 0;
2621 if (!PyUnicode_Check(output)) {
2622 Py_DECREF(output);
2623 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2624 return 0;
2625 }
2626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2628 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002629 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2630 Py_DECREF(output);
2631 return 0;
2632 }
2633 *(PyObject**)addr = output;
2634 return Py_CLEANUP_SUPPORTED;
2635}
2636
2637
Martin v. Löwis5b222132007-06-10 09:51:05 +00002638char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002640{
Christian Heimesf3863112007-11-22 07:46:41 +00002641 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2643
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002644 if (!PyUnicode_Check(unicode)) {
2645 PyErr_BadArgument();
2646 return NULL;
2647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002649 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650
2651 if (_PyUnicode_UTF8(unicode) == NULL) {
2652 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2653 if (bytes == NULL)
2654 return NULL;
2655 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2656 if (u->_base.utf8 == NULL) {
2657 Py_DECREF(bytes);
2658 return NULL;
2659 }
2660 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2661 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2662 Py_DECREF(bytes);
2663 }
2664
2665 if (psize)
2666 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2667 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002668}
2669
2670char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2674}
2675
2676#ifdef Py_DEBUG
2677int unicode_as_unicode_calls = 0;
2678#endif
2679
2680
2681Py_UNICODE *
2682PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2683{
2684 PyUnicodeObject *u;
2685 const unsigned char *one_byte;
2686#if SIZEOF_WCHAR_T == 4
2687 const Py_UCS2 *two_bytes;
2688#else
2689 const Py_UCS4 *four_bytes;
2690 const Py_UCS4 *ucs4_end;
2691 Py_ssize_t num_surrogates;
2692#endif
2693 wchar_t *w;
2694 wchar_t *wchar_end;
2695
2696 if (!PyUnicode_Check(unicode)) {
2697 PyErr_BadArgument();
2698 return NULL;
2699 }
2700 u = (PyUnicodeObject*)unicode;
2701 if (_PyUnicode_WSTR(u) == NULL) {
2702 /* Non-ASCII compact unicode object */
2703 assert(_PyUnicode_KIND(u) != 0);
2704 assert(PyUnicode_IS_READY(u));
2705
2706#ifdef Py_DEBUG
2707 ++unicode_as_unicode_calls;
2708#endif
2709
2710 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2711#if SIZEOF_WCHAR_T == 2
2712 four_bytes = PyUnicode_4BYTE_DATA(u);
2713 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2714 num_surrogates = 0;
2715
2716 for (; four_bytes < ucs4_end; ++four_bytes) {
2717 if (*four_bytes > 0xFFFF)
2718 ++num_surrogates;
2719 }
2720
2721 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2722 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2723 if (!_PyUnicode_WSTR(u)) {
2724 PyErr_NoMemory();
2725 return NULL;
2726 }
2727 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2728
2729 w = _PyUnicode_WSTR(u);
2730 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2731 four_bytes = PyUnicode_4BYTE_DATA(u);
2732 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2733 if (*four_bytes > 0xFFFF) {
2734 /* encode surrogate pair in this case */
2735 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2736 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2737 }
2738 else
2739 *w = *four_bytes;
2740
2741 if (w > wchar_end) {
2742 assert(0 && "Miscalculated string end");
2743 }
2744 }
2745 *w = 0;
2746#else
2747 /* sizeof(wchar_t) == 4 */
2748 Py_FatalError("Impossible unicode object state, wstr and str "
2749 "should share memory already.");
2750 return NULL;
2751#endif
2752 }
2753 else {
2754 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2755 (_PyUnicode_LENGTH(u) + 1));
2756 if (!_PyUnicode_WSTR(u)) {
2757 PyErr_NoMemory();
2758 return NULL;
2759 }
2760 if (!PyUnicode_IS_COMPACT_ASCII(u))
2761 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2762 w = _PyUnicode_WSTR(u);
2763 wchar_end = w + _PyUnicode_LENGTH(u);
2764
2765 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2766 one_byte = PyUnicode_1BYTE_DATA(u);
2767 for (; w < wchar_end; ++one_byte, ++w)
2768 *w = *one_byte;
2769 /* null-terminate the wstr */
2770 *w = 0;
2771 }
2772 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2773#if SIZEOF_WCHAR_T == 4
2774 two_bytes = PyUnicode_2BYTE_DATA(u);
2775 for (; w < wchar_end; ++two_bytes, ++w)
2776 *w = *two_bytes;
2777 /* null-terminate the wstr */
2778 *w = 0;
2779#else
2780 /* sizeof(wchar_t) == 2 */
2781 PyObject_FREE(_PyUnicode_WSTR(u));
2782 _PyUnicode_WSTR(u) = NULL;
2783 Py_FatalError("Impossible unicode object state, wstr "
2784 "and str should share memory already.");
2785 return NULL;
2786#endif
2787 }
2788 else {
2789 assert(0 && "This should never happen.");
2790 }
2791 }
2792 }
2793 if (size != NULL)
2794 *size = PyUnicode_WSTR_LENGTH(u);
2795 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002796}
2797
Alexander Belopolsky40018472011-02-26 01:02:56 +00002798Py_UNICODE *
2799PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802}
2803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804
Alexander Belopolsky40018472011-02-26 01:02:56 +00002805Py_ssize_t
2806PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807{
2808 if (!PyUnicode_Check(unicode)) {
2809 PyErr_BadArgument();
2810 goto onError;
2811 }
2812 return PyUnicode_GET_SIZE(unicode);
2813
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 return -1;
2816}
2817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818Py_ssize_t
2819PyUnicode_GetLength(PyObject *unicode)
2820{
2821 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2822 PyErr_BadArgument();
2823 return -1;
2824 }
2825
2826 return PyUnicode_GET_LENGTH(unicode);
2827}
2828
2829Py_UCS4
2830PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2831{
2832 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2833 return PyErr_BadArgument();
2834 return (Py_UCS4)-1;
2835 }
2836 return PyUnicode_READ_CHAR(unicode, index);
2837}
2838
2839int
2840PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2841{
2842 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2843 return PyErr_BadArgument();
2844 return -1;
2845 }
2846
2847 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2848 index, ch);
2849 return 0;
2850}
2851
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2857
Victor Stinner554f3f02010-06-16 23:33:54 +00002858/* create or adjust a UnicodeDecodeError */
2859static void
2860make_decode_exception(PyObject **exceptionObject,
2861 const char *encoding,
2862 const char *input, Py_ssize_t length,
2863 Py_ssize_t startpos, Py_ssize_t endpos,
2864 const char *reason)
2865{
2866 if (*exceptionObject == NULL) {
2867 *exceptionObject = PyUnicodeDecodeError_Create(
2868 encoding, input, length, startpos, endpos, reason);
2869 }
2870 else {
2871 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2872 goto onError;
2873 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2874 goto onError;
2875 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2876 goto onError;
2877 }
2878 return;
2879
2880onError:
2881 Py_DECREF(*exceptionObject);
2882 *exceptionObject = NULL;
2883}
2884
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002885/* error handling callback helper:
2886 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002887 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002888 and adjust various state variables.
2889 return 0 on success, -1 on error
2890*/
2891
Alexander Belopolsky40018472011-02-26 01:02:56 +00002892static int
2893unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002894 const char *encoding, const char *reason,
2895 const char **input, const char **inend, Py_ssize_t *startinpos,
2896 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2897 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002898{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900
2901 PyObject *restuple = NULL;
2902 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002903 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002904 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002905 Py_ssize_t requiredsize;
2906 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002908 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002909 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 int res = -1;
2911
2912 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 *errorHandler = PyCodec_LookupError(errors);
2914 if (*errorHandler == NULL)
2915 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 }
2917
Victor Stinner554f3f02010-06-16 23:33:54 +00002918 make_decode_exception(exceptionObject,
2919 encoding,
2920 *input, *inend - *input,
2921 *startinpos, *endinpos,
2922 reason);
2923 if (*exceptionObject == NULL)
2924 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002925
2926 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2927 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002930 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 }
2933 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002935
2936 /* Copy back the bytes variables, which might have been modified by the
2937 callback */
2938 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2939 if (!inputobj)
2940 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002941 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002943 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002944 *input = PyBytes_AS_STRING(inputobj);
2945 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002946 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002947 /* we can DECREF safely, as the exception has another reference,
2948 so the object won't go away. */
2949 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002952 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002953 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2955 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957
2958 /* need more space? (at least enough for what we
2959 have+the replacement+the rest of the string (starting
2960 at the new input position), so we won't have to check space
2961 when there are no errors in the rest of the string) */
2962 repptr = PyUnicode_AS_UNICODE(repunicode);
2963 repsize = PyUnicode_GET_SIZE(repunicode);
2964 requiredsize = *outpos + repsize + insize-newpos;
2965 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 if (requiredsize<2*outsize)
2967 requiredsize = 2*outsize;
2968 if (_PyUnicode_Resize(output, requiredsize) < 0)
2969 goto onError;
2970 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002971 }
2972 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002973 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002974 Py_UNICODE_COPY(*outptr, repptr, repsize);
2975 *outptr += repsize;
2976 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002977
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 /* we made it! */
2979 res = 0;
2980
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 Py_XDECREF(restuple);
2983 return res;
2984}
2985
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 sections of UTF-7.  Each of them may
   evaluate its argument more than once. */

/* IS_BASE64: true if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   The result is only meaningful when IS_BASE64(c) holds; any character
   that is not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met in a UTF-7 string outside a
 * base-64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3020
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC 2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3054
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character (evaluated several times; no side effects)
 *   directO  - non-zero if Set O characters are written unencoded
 *   directWS - non-zero if whitespace is written unencoded
 *
 * NUL and anything outside ASCII are never encoded directly.  The flag
 * arguments are parenthesized so that compound expressions such as
 * "a || b" keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 Py_ssize_t size,
3070 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003071{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003072 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3073}
3074
Antoine Pitrou244651a2009-05-04 18:56:13 +00003075/* The decoder. The only state we preserve is our read position,
3076 * i.e. how many characters we have consumed. So if we end in the
3077 * middle of a shift sequence we have to back off the read position
3078 * and the output to the beginning of the sequence, otherwise we lose
3079 * all the shift state (seen bits, number of bits seen, high
3080 * surrogate). */
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 Py_ssize_t size,
3085 const char *errors,
3086 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003089 Py_ssize_t startinpos;
3090 Py_ssize_t endinpos;
3091 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003092 const char *e;
3093 PyUnicodeObject *unicode;
3094 Py_UNICODE *p;
3095 const char *errmsg = "";
3096 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003097 Py_UNICODE *shiftOutStart;
3098 unsigned int base64bits = 0;
3099 unsigned long base64buffer = 0;
3100 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 PyObject *errorHandler = NULL;
3102 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003103
3104 unicode = _PyUnicode_New(size);
3105 if (!unicode)
3106 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003107 if (size == 0) {
3108 if (consumed)
3109 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003110 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003111 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003114 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003115 e = s + size;
3116
3117 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003119 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003120 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003121
Antoine Pitrou244651a2009-05-04 18:56:13 +00003122 if (inShift) { /* in a base-64 section */
3123 if (IS_BASE64(ch)) { /* consume a base-64 character */
3124 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3125 base64bits += 6;
3126 s++;
3127 if (base64bits >= 16) {
3128 /* we have enough bits for a UTF-16 value */
3129 Py_UNICODE outCh = (Py_UNICODE)
3130 (base64buffer >> (base64bits-16));
3131 base64bits -= 16;
3132 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3133 if (surrogate) {
3134 /* expecting a second surrogate */
3135 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3136#ifdef Py_UNICODE_WIDE
3137 *p++ = (((surrogate & 0x3FF)<<10)
3138 | (outCh & 0x3FF)) + 0x10000;
3139#else
3140 *p++ = surrogate;
3141 *p++ = outCh;
3142#endif
3143 surrogate = 0;
3144 }
3145 else {
3146 surrogate = 0;
3147 errmsg = "second surrogate missing";
3148 goto utf7Error;
3149 }
3150 }
3151 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3152 /* first surrogate */
3153 surrogate = outCh;
3154 }
3155 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3156 errmsg = "unexpected second surrogate";
3157 goto utf7Error;
3158 }
3159 else {
3160 *p++ = outCh;
3161 }
3162 }
3163 }
3164 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003165 inShift = 0;
3166 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003167 if (surrogate) {
3168 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003169 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003170 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003171 if (base64bits > 0) { /* left-over bits */
3172 if (base64bits >= 6) {
3173 /* We've seen at least one base-64 character */
3174 errmsg = "partial character in shift sequence";
3175 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003176 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003177 else {
3178 /* Some bits remain; they should be zero */
3179 if (base64buffer != 0) {
3180 errmsg = "non-zero padding bits in shift sequence";
3181 goto utf7Error;
3182 }
3183 }
3184 }
3185 if (ch != '-') {
3186 /* '-' is absorbed; other terminating
3187 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003188 *p++ = ch;
3189 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003190 }
3191 }
3192 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003194 s++; /* consume '+' */
3195 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003196 s++;
3197 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003198 }
3199 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003200 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003201 shiftOutStart = p;
3202 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003203 }
3204 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003205 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003206 *p++ = ch;
3207 s++;
3208 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003209 else {
3210 startinpos = s-starts;
3211 s++;
3212 errmsg = "unexpected special character";
3213 goto utf7Error;
3214 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003215 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003216utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 outpos = p-PyUnicode_AS_UNICODE(unicode);
3218 endinpos = s-starts;
3219 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 errors, &errorHandler,
3221 "utf7", errmsg,
3222 &starts, &e, &startinpos, &endinpos, &exc, &s,
3223 &unicode, &outpos, &p))
3224 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003225 }
3226
Antoine Pitrou244651a2009-05-04 18:56:13 +00003227 /* end of string */
3228
3229 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3230 /* if we're in an inconsistent state, that's an error */
3231 if (surrogate ||
3232 (base64bits >= 6) ||
3233 (base64bits > 0 && base64buffer != 0)) {
3234 outpos = p-PyUnicode_AS_UNICODE(unicode);
3235 endinpos = size;
3236 if (unicode_decode_call_errorhandler(
3237 errors, &errorHandler,
3238 "utf7", "unterminated shift sequence",
3239 &starts, &e, &startinpos, &endinpos, &exc, &s,
3240 &unicode, &outpos, &p))
3241 goto onError;
3242 if (s < e)
3243 goto restart;
3244 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003245 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003246
3247 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003248 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003249 if (inShift) {
3250 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003251 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003252 }
3253 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003254 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003255 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003256 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003257
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003258 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003259 goto onError;
3260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261 Py_XDECREF(errorHandler);
3262 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003263 if (PyUnicode_READY(unicode) == -1) {
3264 Py_DECREF(unicode);
3265 return NULL;
3266 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003267 return (PyObject *)unicode;
3268
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 Py_XDECREF(errorHandler);
3271 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272 Py_DECREF(unicode);
3273 return NULL;
3274}
3275
3276
Alexander Belopolsky40018472011-02-26 01:02:56 +00003277PyObject *
3278PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003279 Py_ssize_t size,
3280 int base64SetO,
3281 int base64WhiteSpace,
3282 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003283{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003284 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003285 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003286 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003287 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003288 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003289 unsigned int base64bits = 0;
3290 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003291 char * out;
3292 char * start;
3293
3294 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003296
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003297 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003298 return PyErr_NoMemory();
3299
Antoine Pitrou244651a2009-05-04 18:56:13 +00003300 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003301 if (v == NULL)
3302 return NULL;
3303
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003304 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003305 for (;i < size; ++i) {
3306 Py_UNICODE ch = s[i];
3307
Antoine Pitrou244651a2009-05-04 18:56:13 +00003308 if (inShift) {
3309 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3310 /* shifting out */
3311 if (base64bits) { /* output remaining bits */
3312 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3313 base64buffer = 0;
3314 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003315 }
3316 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003317 /* Characters not in the BASE64 set implicitly unshift the sequence
3318 so no '-' is required, except if the character is itself a '-' */
3319 if (IS_BASE64(ch) || ch == '-') {
3320 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003321 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003322 *out++ = (char) ch;
3323 }
3324 else {
3325 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003326 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003327 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003328 else { /* not in a shift sequence */
3329 if (ch == '+') {
3330 *out++ = '+';
3331 *out++ = '-';
3332 }
3333 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3334 *out++ = (char) ch;
3335 }
3336 else {
3337 *out++ = '+';
3338 inShift = 1;
3339 goto encode_char;
3340 }
3341 }
3342 continue;
3343encode_char:
3344#ifdef Py_UNICODE_WIDE
3345 if (ch >= 0x10000) {
3346 /* code first surrogate */
3347 base64bits += 16;
3348 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3349 while (base64bits >= 6) {
3350 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3351 base64bits -= 6;
3352 }
3353 /* prepare second surrogate */
3354 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3355 }
3356#endif
3357 base64bits += 16;
3358 base64buffer = (base64buffer << 16) | ch;
3359 while (base64bits >= 6) {
3360 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3361 base64bits -= 6;
3362 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003363 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003364 if (base64bits)
3365 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3366 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003367 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003368 if (_PyBytes_Resize(&v, out - start) < 0)
3369 return NULL;
3370 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003371}
3372
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003378
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is indexed by
   the byte value (0..255) and is never written, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3402
Alexander Belopolsky40018472011-02-26 01:02:56 +00003403PyObject *
3404PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003405 Py_ssize_t size,
3406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407{
Walter Dörwald69652032004-09-07 20:24:22 +00003408 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3409}
3410
Antoine Pitrouab868312009-01-10 15:40:25 +00003411/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3412#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3413
3414/* Mask to quickly check whether a C 'long' contains a
3415 non-ASCII, UTF8-encoded char. */
3416#if (SIZEOF_LONG == 8)
3417# define ASCII_CHAR_MASK 0x8080808080808080L
3418#elif (SIZEOF_LONG == 4)
3419# define ASCII_CHAR_MASK 0x80808080L
3420#else
3421# error C 'long' size should be either 4 or 8!
3422#endif
3423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424/* Scans a UTF-8 string and returns the maximum character to be expected,
3425 the size of the decoded unicode string and if any major errors were
3426 encountered.
3427
3428 This function does check basic UTF-8 sanity, it does however NOT CHECK
3429 if the string contains surrogates, and if all continuation bytes are
3430 within the correct ranges, these checks are performed in
3431 PyUnicode_DecodeUTF8Stateful.
3432
3433 If it sets has_errors to 1, it means the value of unicode_size and max_char
3434 will be bogus and you should not rely on useful information in them.
3435 */
3436static Py_UCS4
3437utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3438 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3439 int *has_errors)
3440{
3441 Py_ssize_t n;
3442 Py_ssize_t char_count = 0;
3443 Py_UCS4 max_char = 127, new_max;
3444 Py_UCS4 upper_bound;
3445 const unsigned char *p = (const unsigned char *)s;
3446 const unsigned char *end = p + string_size;
3447 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3448 int err = 0;
3449
3450 for (; p < end && !err; ++p, ++char_count) {
3451 /* Only check value if it's not a ASCII char... */
3452 if (*p < 0x80) {
3453 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3454 an explanation. */
3455 if (!((size_t) p & LONG_PTR_MASK)) {
3456 /* Help register allocation */
3457 register const unsigned char *_p = p;
3458 while (_p < aligned_end) {
3459 unsigned long value = *(unsigned long *) _p;
3460 if (value & ASCII_CHAR_MASK)
3461 break;
3462 _p += SIZEOF_LONG;
3463 char_count += SIZEOF_LONG;
3464 }
3465 p = _p;
3466 if (p == end)
3467 break;
3468 }
3469 }
3470 if (*p >= 0x80) {
3471 n = utf8_code_length[*p];
3472 new_max = max_char;
3473 switch (n) {
3474 /* invalid start byte */
3475 case 0:
3476 err = 1;
3477 break;
3478 case 2:
3479 /* Code points between 0x00FF and 0x07FF inclusive.
3480 Approximate the upper bound of the code point,
3481 if this flips over 255 we can be sure it will be more
3482 than 255 and the string will need 2 bytes per code coint,
3483 if it stays under or equal to 255, we can be sure 1 byte
3484 is enough.
3485 ((*p & 0b00011111) << 6) | 0b00111111 */
3486 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3487 if (max_char < upper_bound)
3488 new_max = upper_bound;
3489 /* Ensure we track at least that we left ASCII space. */
3490 if (new_max < 128)
3491 new_max = 128;
3492 break;
3493 case 3:
3494 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3495 always > 255 and <= 65535 and will always need 2 bytes. */
3496 if (max_char < 65535)
3497 new_max = 65535;
3498 break;
3499 case 4:
3500 /* Code point will be above 0xFFFF for sure in this case. */
3501 new_max = 65537;
3502 break;
3503 /* Internal error, this should be caught by the first if */
3504 case 1:
3505 default:
3506 assert(0 && "Impossible case in utf8_max_char_and_size");
3507 err = 1;
3508 }
3509 /* Instead of number of overall bytes for this code point,
3510 n containts the number of following bytes: */
3511 --n;
3512 /* Check if the follow up chars are all valid continuation bytes */
3513 if (n >= 1) {
3514 const unsigned char *cont;
3515 if ((p + n) >= end) {
3516 if (consumed == 0)
3517 /* incomplete data, non-incremental decoding */
3518 err = 1;
3519 break;
3520 }
3521 for (cont = p + 1; cont < (p + n); ++cont) {
3522 if ((*cont & 0xc0) != 0x80) {
3523 err = 1;
3524 break;
3525 }
3526 }
3527 p += n;
3528 }
3529 else
3530 err = 1;
3531 max_char = new_max;
3532 }
3533 }
3534
3535 if (unicode_size)
3536 *unicode_size = char_count;
3537 if (has_errors)
3538 *has_errors = err;
3539 return max_char;
3540}
3541
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

     kind  - one of the PyUnicode_*_KIND values; evaluated once.  Any
             value other than WCHAR, 1BYTE or 2BYTE is treated as 4 bytes
     buf   - start of the character buffer matching kind
     index - position to write to
     value - the character; it is cast (and thereby truncated) to the
             element type selected by kind */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        if (k_ == PyUnicode_WCHAR_KIND)                                 \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else if (k_ == PyUnicode_1BYTE_KIND)                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND)                            \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        else                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
    } while (0)
3556
Alexander Belopolsky40018472011-02-26 01:02:56 +00003557PyObject *
3558PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559 Py_ssize_t size,
3560 const char *errors,
3561 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003565 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003566 Py_ssize_t startinpos;
3567 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003568 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003570 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 PyObject *errorHandler = NULL;
3572 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003573 Py_UCS4 maxchar = 0;
3574 Py_ssize_t unicode_size;
3575 Py_ssize_t i;
3576 int kind;
3577 void *data;
3578 int has_errors;
3579 Py_UNICODE *error_outptr;
3580#if SIZEOF_WCHAR_T == 2
3581 Py_ssize_t wchar_offset = 0;
3582#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583
Walter Dörwald69652032004-09-07 20:24:22 +00003584 if (size == 0) {
3585 if (consumed)
3586 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003587 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3590 consumed, &has_errors);
3591 if (has_errors) {
3592 unicode = _PyUnicode_New(size);
3593 if (!unicode)
3594 return NULL;
3595 kind = PyUnicode_WCHAR_KIND;
3596 data = PyUnicode_AS_UNICODE(unicode);
3597 assert(data != NULL);
3598 }
3599 else {
3600 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3601 if (!unicode)
3602 return NULL;
3603 /* When the string is ASCII only, just use memcpy and return.
3604 unicode_size may be != size if there is an incomplete UTF-8
3605 sequence at the end of the ASCII block. */
3606 if (maxchar < 128 && size == unicode_size) {
3607 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3608 return (PyObject *)unicode;
3609 }
3610 kind = PyUnicode_KIND(unicode);
3611 data = PyUnicode_DATA(unicode);
3612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003614 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003616 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617
3618 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003619 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620
3621 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003622 /* Fast path for runs of ASCII characters. Given that common UTF-8
3623 input will consist of an overwhelming majority of ASCII
3624 characters, we try to optimize for this case by checking
3625 as many characters as a C 'long' can contain.
3626 First, check if we can do an aligned read, as most CPUs have
3627 a penalty for unaligned reads.
3628 */
3629 if (!((size_t) s & LONG_PTR_MASK)) {
3630 /* Help register allocation */
3631 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003632 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003633 while (_s < aligned_end) {
3634 /* Read a whole long at a time (either 4 or 8 bytes),
3635 and do a fast unrolled copy if it only contains ASCII
3636 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003637 unsigned long value = *(unsigned long *) _s;
3638 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003639 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003640 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3641 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3642 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3643 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003644#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003645 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3646 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3647 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3648 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003649#endif
3650 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003651 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003652 }
3653 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003654 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003655 if (s == e)
3656 break;
3657 ch = (unsigned char)*s;
3658 }
3659 }
3660
3661 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003662 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 s++;
3664 continue;
3665 }
3666
3667 n = utf8_code_length[ch];
3668
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003669 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 if (consumed)
3671 break;
3672 else {
3673 errmsg = "unexpected end of data";
3674 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003675 endinpos = startinpos+1;
3676 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3677 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 goto utf8Error;
3679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681
3682 switch (n) {
3683
3684 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003685 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 startinpos = s-starts;
3687 endinpos = startinpos+1;
3688 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689
3690 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003691 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 startinpos = s-starts;
3693 endinpos = startinpos+1;
3694 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695
3696 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003697 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003698 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003700 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 goto utf8Error;
3702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003704 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 break;
3707
3708 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003709 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3710 will result in surrogates in range d800-dfff. Surrogates are
3711 not valid UTF-8 so they are rejected.
3712 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3713 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003714 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003715 (s[2] & 0xc0) != 0x80 ||
3716 ((unsigned char)s[0] == 0xE0 &&
3717 (unsigned char)s[1] < 0xA0) ||
3718 ((unsigned char)s[0] == 0xED &&
3719 (unsigned char)s[1] > 0x9F)) {
3720 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003722 endinpos = startinpos + 1;
3723
3724 /* if s[1] first two bits are 1 and 0, then the invalid
3725 continuation byte is s[2], so increment endinpos by 1,
3726 if not, s[1] is invalid and endinpos doesn't need to
3727 be incremented. */
3728 if ((s[1] & 0xC0) == 0x80)
3729 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 goto utf8Error;
3731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003733 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003735 break;
3736
3737 case 4:
3738 if ((s[1] & 0xc0) != 0x80 ||
3739 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003740 (s[3] & 0xc0) != 0x80 ||
3741 ((unsigned char)s[0] == 0xF0 &&
3742 (unsigned char)s[1] < 0x90) ||
3743 ((unsigned char)s[0] == 0xF4 &&
3744 (unsigned char)s[1] > 0x8F)) {
3745 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003747 endinpos = startinpos + 1;
3748 if ((s[1] & 0xC0) == 0x80) {
3749 endinpos++;
3750 if ((s[2] & 0xC0) == 0x80)
3751 endinpos++;
3752 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 goto utf8Error;
3754 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003755 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003756 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3757 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 /* If the string is flexible or we have native UCS-4, write
3760 directly.. */
3761 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3762 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 else {
3765 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 /* translate from 10000..10FFFF to 0..FFFF */
3768 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 /* high surrogate = top 10 bits added to D800 */
3771 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3772 (Py_UNICODE)(0xD800 + (ch >> 10)));
3773
3774 /* low surrogate = bottom 10 bits added to DC00 */
3775 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3776 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3777 }
3778#if SIZEOF_WCHAR_T == 2
3779 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003780#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 }
3783 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003785
Benjamin Peterson29060642009-01-31 22:14:21 +00003786 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 /* If this is not yet a resizable string, make it one.. */
3788 if (kind != PyUnicode_WCHAR_KIND) {
3789 const Py_UNICODE *u;
3790 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3791 if (!new_unicode)
3792 goto onError;
3793 u = PyUnicode_AsUnicode((PyObject *)unicode);
3794 if (!u)
3795 goto onError;
3796#if SIZEOF_WCHAR_T == 2
3797 i += wchar_offset;
3798#endif
3799 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3800 Py_DECREF(unicode);
3801 unicode = new_unicode;
3802 kind = 0;
3803 data = PyUnicode_AS_UNICODE(new_unicode);
3804 assert(data != NULL);
3805 }
3806 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 if (unicode_decode_call_errorhandler(
3808 errors, &errorHandler,
3809 "utf8", errmsg,
3810 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 /* Update data because unicode_decode_call_errorhandler might have
3814 re-created or resized the unicode object. */
3815 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 /* Ensure the unicode_size calculation above was correct: */
3819 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3820
Walter Dörwald69652032004-09-07 20:24:22 +00003821 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 /* Adjust length and ready string when it contained errors and
3825 is of the old resizable kind. */
3826 if (kind == PyUnicode_WCHAR_KIND) {
3827 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3828 PyUnicode_READY(unicode) == -1)
3829 goto onError;
3830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_XDECREF(errorHandler);
3833 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 if (PyUnicode_READY(unicode) == -1) {
3835 Py_DECREF(unicode);
3836 return NULL;
3837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 return (PyObject *)unicode;
3839
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 Py_XDECREF(errorHandler);
3842 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 Py_DECREF(unicode);
3844 return NULL;
3845}
3846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003848
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003849#ifdef __APPLE__
3850
3851/* Simplified UTF-8 decoder using surrogateescape error handler,
3852 used to decode the command line arguments on Mac OS X. */
3853
3854wchar_t*
3855_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3856{
3857 int n;
3858 const char *e;
3859 wchar_t *unicode, *p;
3860
3861 /* Note: size will always be longer than the resulting Unicode
3862 character count */
3863 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3864 PyErr_NoMemory();
3865 return NULL;
3866 }
3867 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3868 if (!unicode)
3869 return NULL;
3870
3871 /* Unpack UTF-8 encoded data */
3872 p = unicode;
3873 e = s + size;
3874 while (s < e) {
3875 Py_UCS4 ch = (unsigned char)*s;
3876
3877 if (ch < 0x80) {
3878 *p++ = (wchar_t)ch;
3879 s++;
3880 continue;
3881 }
3882
3883 n = utf8_code_length[ch];
3884 if (s + n > e) {
3885 goto surrogateescape;
3886 }
3887
3888 switch (n) {
3889 case 0:
3890 case 1:
3891 goto surrogateescape;
3892
3893 case 2:
3894 if ((s[1] & 0xc0) != 0x80)
3895 goto surrogateescape;
3896 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3897 assert ((ch > 0x007F) && (ch <= 0x07FF));
3898 *p++ = (wchar_t)ch;
3899 break;
3900
3901 case 3:
3902 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3903 will result in surrogates in range d800-dfff. Surrogates are
3904 not valid UTF-8 so they are rejected.
3905 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3906 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3907 if ((s[1] & 0xc0) != 0x80 ||
3908 (s[2] & 0xc0) != 0x80 ||
3909 ((unsigned char)s[0] == 0xE0 &&
3910 (unsigned char)s[1] < 0xA0) ||
3911 ((unsigned char)s[0] == 0xED &&
3912 (unsigned char)s[1] > 0x9F)) {
3913
3914 goto surrogateescape;
3915 }
3916 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3917 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003919 break;
3920
3921 case 4:
3922 if ((s[1] & 0xc0) != 0x80 ||
3923 (s[2] & 0xc0) != 0x80 ||
3924 (s[3] & 0xc0) != 0x80 ||
3925 ((unsigned char)s[0] == 0xF0 &&
3926 (unsigned char)s[1] < 0x90) ||
3927 ((unsigned char)s[0] == 0xF4 &&
3928 (unsigned char)s[1] > 0x8F)) {
3929 goto surrogateescape;
3930 }
3931 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3932 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3933 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3934
3935#if SIZEOF_WCHAR_T == 4
3936 *p++ = (wchar_t)ch;
3937#else
3938 /* compute and append the two surrogates: */
3939
3940 /* translate from 10000..10FFFF to 0..FFFF */
3941 ch -= 0x10000;
3942
3943 /* high surrogate = top 10 bits added to D800 */
3944 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3945
3946 /* low surrogate = bottom 10 bits added to DC00 */
3947 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3948#endif
3949 break;
3950 }
3951 s += n;
3952 continue;
3953
3954 surrogateescape:
3955 *p++ = 0xDC00 + ch;
3956 s++;
3957 }
3958 *p = L'\0';
3959 return unicode;
3960}
3961
3962#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964/* Primary internal function which creates utf8 encoded bytes objects.
3965
3966 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003967 and allocate exactly as much space needed at the end. Else allocate the
3968 maximum possible needed (4 result bytes per Unicode character), and return
3969 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003970*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003971PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973{
Tim Peters602f7402002-04-27 18:03:26 +00003974#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003975
Guido van Rossum98297ee2007-11-06 21:34:58 +00003976 Py_ssize_t i; /* index into s of next input byte */
3977 PyObject *result; /* result string object */
3978 char *p; /* next free byte in output buffer */
3979 Py_ssize_t nallocated; /* number of result bytes allocated */
3980 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003981 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003982 PyObject *errorHandler = NULL;
3983 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 int kind;
3985 void *data;
3986 Py_ssize_t size;
3987 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3988#if SIZEOF_WCHAR_T == 2
3989 Py_ssize_t wchar_offset = 0;
3990#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 if (!PyUnicode_Check(unicode)) {
3993 PyErr_BadArgument();
3994 return NULL;
3995 }
3996
3997 if (PyUnicode_READY(unicode) == -1)
3998 return NULL;
3999
4000 if (_PyUnicode_UTF8(unicode))
4001 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
4002 _PyUnicode_UTF8_LENGTH(unicode));
4003
4004 kind = PyUnicode_KIND(unicode);
4005 data = PyUnicode_DATA(unicode);
4006 size = PyUnicode_GET_LENGTH(unicode);
4007
Tim Peters602f7402002-04-27 18:03:26 +00004008 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009
Tim Peters602f7402002-04-27 18:03:26 +00004010 if (size <= MAX_SHORT_UNICHARS) {
4011 /* Write into the stack buffer; nallocated can't overflow.
4012 * At the end, we'll allocate exactly as much heap space as it
4013 * turns out we need.
4014 */
4015 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004016 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004017 p = stackbuf;
4018 }
4019 else {
4020 /* Overallocate on the heap, and give the excess back at the end. */
4021 nallocated = size * 4;
4022 if (nallocated / 4 != size) /* overflow! */
4023 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004024 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004025 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004026 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004027 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004028 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004029
Tim Peters602f7402002-04-27 18:03:26 +00004030 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004032
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004033 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004034 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004036
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004038 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004039 *p++ = (char)(0xc0 | (ch >> 6));
4040 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004041 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 Py_ssize_t newpos;
4043 PyObject *rep;
4044 Py_ssize_t repsize, k, startpos;
4045 startpos = i-1;
4046#if SIZEOF_WCHAR_T == 2
4047 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004048#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 rep = unicode_encode_call_errorhandler(
4050 errors, &errorHandler, "utf-8", "surrogates not allowed",
4051 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4052 &exc, startpos, startpos+1, &newpos);
4053 if (!rep)
4054 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 if (PyBytes_Check(rep))
4057 repsize = PyBytes_GET_SIZE(rep);
4058 else
4059 repsize = PyUnicode_GET_SIZE(rep);
4060
4061 if (repsize > 4) {
4062 Py_ssize_t offset;
4063
4064 if (result == NULL)
4065 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004066 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4070 /* integer overflow */
4071 PyErr_NoMemory();
4072 goto error;
4073 }
4074 nallocated += repsize - 4;
4075 if (result != NULL) {
4076 if (_PyBytes_Resize(&result, nallocated) < 0)
4077 goto error;
4078 } else {
4079 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004080 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 goto error;
4082 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4083 }
4084 p = PyBytes_AS_STRING(result) + offset;
4085 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087 if (PyBytes_Check(rep)) {
4088 char *prep = PyBytes_AS_STRING(rep);
4089 for(k = repsize; k > 0; k--)
4090 *p++ = *prep++;
4091 } else /* rep is unicode */ {
4092 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4093 Py_UNICODE c;
4094
4095 for(k=0; k<repsize; k++) {
4096 c = prep[k];
4097 if (0x80 <= c) {
4098 raise_encode_exception(&exc, "utf-8",
4099 PyUnicode_AS_UNICODE(unicode),
4100 size, i-1, i,
4101 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004102 goto error;
4103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004105 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004108 } else if (ch < 0x10000) {
4109 *p++ = (char)(0xe0 | (ch >> 12));
4110 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4111 *p++ = (char)(0x80 | (ch & 0x3f));
4112 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004113 /* Encode UCS4 Unicode ordinals */
4114 *p++ = (char)(0xf0 | (ch >> 18));
4115 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4116 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4117 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118#if SIZEOF_WCHAR_T == 2
4119 wchar_offset++;
4120#endif
Tim Peters602f7402002-04-27 18:03:26 +00004121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004123
Guido van Rossum98297ee2007-11-06 21:34:58 +00004124 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004125 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004126 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004127 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004128 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004129 }
4130 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004131 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004132 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004133 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004134 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004137 Py_XDECREF(errorHandler);
4138 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004139 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004140 error:
4141 Py_XDECREF(errorHandler);
4142 Py_XDECREF(exc);
4143 Py_XDECREF(result);
4144 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004145
Tim Peters602f7402002-04-27 18:03:26 +00004146#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147}
4148
Alexander Belopolsky40018472011-02-26 01:02:56 +00004149PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4151 Py_ssize_t size,
4152 const char *errors)
4153{
4154 PyObject *v, *unicode;
4155
4156 unicode = PyUnicode_FromUnicode(s, size);
4157 if (unicode == NULL)
4158 return NULL;
4159 v = _PyUnicode_AsUTF8String(unicode, errors);
4160 Py_DECREF(unicode);
4161 return v;
4162}
4163
4164PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004165PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168}
4169
Walter Dörwald41980ca2007-08-16 21:55:45 +00004170/* --- UTF-32 Codec ------------------------------------------------------- */
4171
4172PyObject *
4173PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 Py_ssize_t size,
4175 const char *errors,
4176 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004177{
4178 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4179}
4180
4181PyObject *
4182PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 Py_ssize_t size,
4184 const char *errors,
4185 int *byteorder,
4186 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004187{
4188 const char *starts = s;
4189 Py_ssize_t startinpos;
4190 Py_ssize_t endinpos;
4191 Py_ssize_t outpos;
4192 PyUnicodeObject *unicode;
4193 Py_UNICODE *p;
4194#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004195 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004196 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004197#else
4198 const int pairs = 0;
4199#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004200 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004201 int bo = 0; /* assume native ordering by default */
4202 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004203 /* Offsets from q for retrieving bytes in the right order. */
4204#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4205 int iorder[] = {0, 1, 2, 3};
4206#else
4207 int iorder[] = {3, 2, 1, 0};
4208#endif
4209 PyObject *errorHandler = NULL;
4210 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004211
Walter Dörwald41980ca2007-08-16 21:55:45 +00004212 q = (unsigned char *)s;
4213 e = q + size;
4214
4215 if (byteorder)
4216 bo = *byteorder;
4217
4218 /* Check for BOM marks (U+FEFF) in the input and adjust current
4219 byte order setting accordingly. In native mode, the leading BOM
4220 mark is skipped, in all other modes, it is copied to the output
4221 stream as-is (giving a ZWNBSP character). */
4222 if (bo == 0) {
4223 if (size >= 4) {
4224 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 if (bom == 0x0000FEFF) {
4228 q += 4;
4229 bo = -1;
4230 }
4231 else if (bom == 0xFFFE0000) {
4232 q += 4;
4233 bo = 1;
4234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004235#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 if (bom == 0x0000FEFF) {
4237 q += 4;
4238 bo = 1;
4239 }
4240 else if (bom == 0xFFFE0000) {
4241 q += 4;
4242 bo = -1;
4243 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004244#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246 }
4247
4248 if (bo == -1) {
4249 /* force LE */
4250 iorder[0] = 0;
4251 iorder[1] = 1;
4252 iorder[2] = 2;
4253 iorder[3] = 3;
4254 }
4255 else if (bo == 1) {
4256 /* force BE */
4257 iorder[0] = 3;
4258 iorder[1] = 2;
4259 iorder[2] = 1;
4260 iorder[3] = 0;
4261 }
4262
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004263 /* On narrow builds we split characters outside the BMP into two
4264 codepoints => count how much extra space we need. */
4265#ifndef Py_UNICODE_WIDE
4266 for (qq = q; qq < e; qq += 4)
4267 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4268 pairs++;
4269#endif
4270
4271 /* This might be one to much, because of a BOM */
4272 unicode = _PyUnicode_New((size+3)/4+pairs);
4273 if (!unicode)
4274 return NULL;
4275 if (size == 0)
4276 return (PyObject *)unicode;
4277
4278 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004280
Walter Dörwald41980ca2007-08-16 21:55:45 +00004281 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 Py_UCS4 ch;
4283 /* remaining bytes at the end? (size should be divisible by 4) */
4284 if (e-q<4) {
4285 if (consumed)
4286 break;
4287 errmsg = "truncated data";
4288 startinpos = ((const char *)q)-starts;
4289 endinpos = ((const char *)e)-starts;
4290 goto utf32Error;
4291 /* The remaining input chars are ignored if the callback
4292 chooses to skip the input */
4293 }
4294 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4295 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004296
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 if (ch >= 0x110000)
4298 {
4299 errmsg = "codepoint not in range(0x110000)";
4300 startinpos = ((const char *)q)-starts;
4301 endinpos = startinpos+4;
4302 goto utf32Error;
4303 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 if (ch >= 0x10000)
4306 {
4307 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4308 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4309 }
4310 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004311#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 *p++ = ch;
4313 q += 4;
4314 continue;
4315 utf32Error:
4316 outpos = p-PyUnicode_AS_UNICODE(unicode);
4317 if (unicode_decode_call_errorhandler(
4318 errors, &errorHandler,
4319 "utf32", errmsg,
4320 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4321 &unicode, &outpos, &p))
4322 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004323 }
4324
4325 if (byteorder)
4326 *byteorder = bo;
4327
4328 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004330
4331 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004333 goto onError;
4334
4335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004337 if (PyUnicode_READY(unicode) == -1) {
4338 Py_DECREF(unicode);
4339 return NULL;
4340 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004341 return (PyObject *)unicode;
4342
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004344 Py_DECREF(unicode);
4345 Py_XDECREF(errorHandler);
4346 Py_XDECREF(exc);
4347 return NULL;
4348}
4349
4350PyObject *
4351PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 Py_ssize_t size,
4353 const char *errors,
4354 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004356 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004357 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004358 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004360 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004361#else
4362 const int pairs = 0;
4363#endif
4364 /* Offsets from p for storing byte pairs in the right order. */
4365#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4366 int iorder[] = {0, 1, 2, 3};
4367#else
4368 int iorder[] = {3, 2, 1, 0};
4369#endif
4370
Benjamin Peterson29060642009-01-31 22:14:21 +00004371#define STORECHAR(CH) \
4372 do { \
4373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4376 p[iorder[0]] = (CH) & 0xff; \
4377 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004378 } while(0)
4379
4380 /* In narrow builds we can output surrogate pairs as one codepoint,
4381 so we need less space. */
4382#ifndef Py_UNICODE_WIDE
4383 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4386 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004387#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004388 nsize = (size - pairs + (byteorder == 0));
4389 bytesize = nsize * 4;
4390 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004392 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004393 if (v == NULL)
4394 return NULL;
4395
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004396 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004397 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004399 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004400 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004401
4402 if (byteorder == -1) {
4403 /* force LE */
4404 iorder[0] = 0;
4405 iorder[1] = 1;
4406 iorder[2] = 2;
4407 iorder[3] = 3;
4408 }
4409 else if (byteorder == 1) {
4410 /* force BE */
4411 iorder[0] = 3;
4412 iorder[1] = 2;
4413 iorder[2] = 1;
4414 iorder[3] = 0;
4415 }
4416
4417 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004419#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4421 Py_UCS4 ch2 = *s;
4422 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4423 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4424 s++;
4425 size--;
4426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004427 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004428#endif
4429 STORECHAR(ch);
4430 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004431
4432 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004433 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004434#undef STORECHAR
4435}
4436
Alexander Belopolsky40018472011-02-26 01:02:56 +00004437PyObject *
4438PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004439{
4440 if (!PyUnicode_Check(unicode)) {
4441 PyErr_BadArgument();
4442 return NULL;
4443 }
4444 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 PyUnicode_GET_SIZE(unicode),
4446 NULL,
4447 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004448}
4449
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450/* --- UTF-16 Codec ------------------------------------------------------- */
4451
Tim Peters772747b2001-08-09 22:21:55 +00004452PyObject *
4453PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 Py_ssize_t size,
4455 const char *errors,
4456 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457{
Walter Dörwald69652032004-09-07 20:24:22 +00004458 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4459}
4460
Antoine Pitrouab868312009-01-10 15:40:25 +00004461/* Two masks for fast checking of whether a C 'long' may contain
4462 UTF16-encoded surrogate characters. This is an efficient heuristic,
4463 assuming that non-surrogate characters with a code point >= 0x8000 are
4464 rare in most input.
4465 FAST_CHAR_MASK is used when the input is in native byte ordering,
4466 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004467*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004468#if (SIZEOF_LONG == 8)
4469# define FAST_CHAR_MASK 0x8000800080008000L
4470# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4471#elif (SIZEOF_LONG == 4)
4472# define FAST_CHAR_MASK 0x80008000L
4473# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4474#else
4475# error C 'long' size should be either 4 or 8!
4476#endif
4477
Walter Dörwald69652032004-09-07 20:24:22 +00004478PyObject *
4479PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 Py_ssize_t size,
4481 const char *errors,
4482 int *byteorder,
4483 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004484{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004486 Py_ssize_t startinpos;
4487 Py_ssize_t endinpos;
4488 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 PyUnicodeObject *unicode;
4490 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004491 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004492 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004493 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004494 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004495 /* Offsets from q for retrieving byte pairs in the right order. */
4496#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4497 int ihi = 1, ilo = 0;
4498#else
4499 int ihi = 0, ilo = 1;
4500#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 PyObject *errorHandler = NULL;
4502 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503
4504 /* Note: size will always be longer than the resulting Unicode
4505 character count */
4506 unicode = _PyUnicode_New(size);
4507 if (!unicode)
4508 return NULL;
4509 if (size == 0)
4510 return (PyObject *)unicode;
4511
4512 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004514 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004515 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516
4517 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004518 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004520 /* Check for BOM marks (U+FEFF) in the input and adjust current
4521 byte order setting accordingly. In native mode, the leading BOM
4522 mark is skipped, in all other modes, it is copied to the output
4523 stream as-is (giving a ZWNBSP character). */
4524 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004525 if (size >= 2) {
4526 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004527#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 if (bom == 0xFEFF) {
4529 q += 2;
4530 bo = -1;
4531 }
4532 else if (bom == 0xFFFE) {
4533 q += 2;
4534 bo = 1;
4535 }
Tim Petersced69f82003-09-16 20:30:58 +00004536#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 if (bom == 0xFEFF) {
4538 q += 2;
4539 bo = 1;
4540 }
4541 else if (bom == 0xFFFE) {
4542 q += 2;
4543 bo = -1;
4544 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004545#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548
Tim Peters772747b2001-08-09 22:21:55 +00004549 if (bo == -1) {
4550 /* force LE */
4551 ihi = 1;
4552 ilo = 0;
4553 }
4554 else if (bo == 1) {
4555 /* force BE */
4556 ihi = 0;
4557 ilo = 1;
4558 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004559#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4560 native_ordering = ilo < ihi;
4561#else
4562 native_ordering = ilo > ihi;
4563#endif
Tim Peters772747b2001-08-09 22:21:55 +00004564
Antoine Pitrouab868312009-01-10 15:40:25 +00004565 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004566 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004568 /* First check for possible aligned read of a C 'long'. Unaligned
4569 reads are more expensive, better to defer to another iteration. */
4570 if (!((size_t) q & LONG_PTR_MASK)) {
4571 /* Fast path for runs of non-surrogate chars. */
4572 register const unsigned char *_q = q;
4573 Py_UNICODE *_p = p;
4574 if (native_ordering) {
4575 /* Native ordering is simple: as long as the input cannot
4576 possibly contain a surrogate char, do an unrolled copy
4577 of several 16-bit code points to the target object.
4578 The non-surrogate check is done on several input bytes
4579 at a time (as many as a C 'long' can contain). */
4580 while (_q < aligned_end) {
4581 unsigned long data = * (unsigned long *) _q;
4582 if (data & FAST_CHAR_MASK)
4583 break;
4584 _p[0] = ((unsigned short *) _q)[0];
4585 _p[1] = ((unsigned short *) _q)[1];
4586#if (SIZEOF_LONG == 8)
4587 _p[2] = ((unsigned short *) _q)[2];
4588 _p[3] = ((unsigned short *) _q)[3];
4589#endif
4590 _q += SIZEOF_LONG;
4591 _p += SIZEOF_LONG / 2;
4592 }
4593 }
4594 else {
4595 /* Byteswapped ordering is similar, but we must decompose
4596 the copy bytewise, and take care of zero'ing out the
4597 upper bytes if the target object is in 32-bit units
4598 (that is, in UCS-4 builds). */
4599 while (_q < aligned_end) {
4600 unsigned long data = * (unsigned long *) _q;
4601 if (data & SWAPPED_FAST_CHAR_MASK)
4602 break;
4603 /* Zero upper bytes in UCS-4 builds */
4604#if (Py_UNICODE_SIZE > 2)
4605 _p[0] = 0;
4606 _p[1] = 0;
4607#if (SIZEOF_LONG == 8)
4608 _p[2] = 0;
4609 _p[3] = 0;
4610#endif
4611#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004612 /* Issue #4916; UCS-4 builds on big endian machines must
4613 fill the two last bytes of each 4-byte unit. */
4614#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4615# define OFF 2
4616#else
4617# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004618#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004619 ((unsigned char *) _p)[OFF + 1] = _q[0];
4620 ((unsigned char *) _p)[OFF + 0] = _q[1];
4621 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4622 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4623#if (SIZEOF_LONG == 8)
4624 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4625 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4626 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4627 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4628#endif
4629#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004630 _q += SIZEOF_LONG;
4631 _p += SIZEOF_LONG / 2;
4632 }
4633 }
4634 p = _p;
4635 q = _q;
4636 if (q >= e)
4637 break;
4638 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640
Benjamin Peterson14339b62009-01-31 16:36:08 +00004641 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004642
4643 if (ch < 0xD800 || ch > 0xDFFF) {
4644 *p++ = ch;
4645 continue;
4646 }
4647
4648 /* UTF-16 code pair: */
4649 if (q > e) {
4650 errmsg = "unexpected end of data";
4651 startinpos = (((const char *)q) - 2) - starts;
4652 endinpos = ((const char *)e) + 1 - starts;
4653 goto utf16Error;
4654 }
4655 if (0xD800 <= ch && ch <= 0xDBFF) {
4656 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4657 q += 2;
4658 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004659#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 *p++ = ch;
4661 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004662#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004664#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 continue;
4666 }
4667 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004668 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 startinpos = (((const char *)q)-4)-starts;
4670 endinpos = startinpos+2;
4671 goto utf16Error;
4672 }
4673
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 errmsg = "illegal encoding";
4676 startinpos = (((const char *)q)-2)-starts;
4677 endinpos = startinpos+2;
4678 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004679
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 utf16Error:
4681 outpos = p - PyUnicode_AS_UNICODE(unicode);
4682 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004683 errors,
4684 &errorHandler,
4685 "utf16", errmsg,
4686 &starts,
4687 (const char **)&e,
4688 &startinpos,
4689 &endinpos,
4690 &exc,
4691 (const char **)&q,
4692 &unicode,
4693 &outpos,
4694 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004697 /* remaining byte at the end? (size should be even) */
4698 if (e == q) {
4699 if (!consumed) {
4700 errmsg = "truncated data";
4701 startinpos = ((const char *)q) - starts;
4702 endinpos = ((const char *)e) + 1 - starts;
4703 outpos = p - PyUnicode_AS_UNICODE(unicode);
4704 if (unicode_decode_call_errorhandler(
4705 errors,
4706 &errorHandler,
4707 "utf16", errmsg,
4708 &starts,
4709 (const char **)&e,
4710 &startinpos,
4711 &endinpos,
4712 &exc,
4713 (const char **)&q,
4714 &unicode,
4715 &outpos,
4716 &p))
4717 goto onError;
4718 /* The remaining input chars are ignored if the callback
4719 chooses to skip the input */
4720 }
4721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
4723 if (byteorder)
4724 *byteorder = bo;
4725
Walter Dörwald69652032004-09-07 20:24:22 +00004726 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 goto onError;
4732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(errorHandler);
4734 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735 if (PyUnicode_READY(unicode) == -1) {
4736 Py_DECREF(unicode);
4737 return NULL;
4738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 return (PyObject *)unicode;
4740
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004743 Py_XDECREF(errorHandler);
4744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 return NULL;
4746}
4747
Antoine Pitrouab868312009-01-10 15:40:25 +00004748#undef FAST_CHAR_MASK
4749#undef SWAPPED_FAST_CHAR_MASK
4750
Tim Peters772747b2001-08-09 22:21:55 +00004751PyObject *
4752PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004753 Py_ssize_t size,
4754 const char *errors,
4755 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004757 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004758 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004759 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004760#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004761 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004762#else
4763 const int pairs = 0;
4764#endif
Tim Peters772747b2001-08-09 22:21:55 +00004765 /* Offsets from p for storing byte pairs in the right order. */
4766#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4767 int ihi = 1, ilo = 0;
4768#else
4769 int ihi = 0, ilo = 1;
4770#endif
4771
Benjamin Peterson29060642009-01-31 22:14:21 +00004772#define STORECHAR(CH) \
4773 do { \
4774 p[ihi] = ((CH) >> 8) & 0xff; \
4775 p[ilo] = (CH) & 0xff; \
4776 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004777 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004779#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004780 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 if (s[i] >= 0x10000)
4782 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004783#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004784 /* 2 * (size + pairs + (byteorder == 0)) */
4785 if (size > PY_SSIZE_T_MAX ||
4786 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004788 nsize = size + pairs + (byteorder == 0);
4789 bytesize = nsize * 2;
4790 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004792 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 if (v == NULL)
4794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004796 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004799 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004800 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004801
4802 if (byteorder == -1) {
4803 /* force LE */
4804 ihi = 1;
4805 ilo = 0;
4806 }
4807 else if (byteorder == 1) {
4808 /* force BE */
4809 ihi = 0;
4810 ilo = 1;
4811 }
4812
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004813 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 Py_UNICODE ch = *s++;
4815 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004816#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 if (ch >= 0x10000) {
4818 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4819 ch = 0xD800 | ((ch-0x10000) >> 10);
4820 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004821#endif
Tim Peters772747b2001-08-09 22:21:55 +00004822 STORECHAR(ch);
4823 if (ch2)
4824 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004826
4827 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004828 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004829#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830}
4831
Alexander Belopolsky40018472011-02-26 01:02:56 +00004832PyObject *
4833PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834{
4835 if (!PyUnicode_Check(unicode)) {
4836 PyErr_BadArgument();
4837 return NULL;
4838 }
4839 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 PyUnicode_GET_SIZE(unicode),
4841 NULL,
4842 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843}
4844
4845/* --- Unicode Escape Codec ----------------------------------------------- */
4846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4848 if all the escapes in the string make it still a valid ASCII string.
4849 Returns -1 if any escapes were found which cause the string to
4850 pop out of ASCII range. Otherwise returns the length of the
4851 required buffer to hold the string.
4852 */
4853Py_ssize_t
4854length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4855{
4856 const unsigned char *p = (const unsigned char *)s;
4857 const unsigned char *end = p + size;
4858 Py_ssize_t length = 0;
4859
4860 if (size < 0)
4861 return -1;
4862
4863 for (; p < end; ++p) {
4864 if (*p > 127) {
4865 /* Non-ASCII */
4866 return -1;
4867 }
4868 else if (*p != '\\') {
4869 /* Normal character */
4870 ++length;
4871 }
4872 else {
4873 /* Backslash-escape, check next char */
4874 ++p;
4875 /* Escape sequence reaches till end of string or
4876 non-ASCII follow-up. */
4877 if (p >= end || *p > 127)
4878 return -1;
4879 switch (*p) {
4880 case '\n':
4881 /* backslash + \n result in zero characters */
4882 break;
4883 case '\\': case '\'': case '\"':
4884 case 'b': case 'f': case 't':
4885 case 'n': case 'r': case 'v': case 'a':
4886 ++length;
4887 break;
4888 case '0': case '1': case '2': case '3':
4889 case '4': case '5': case '6': case '7':
4890 case 'x': case 'u': case 'U': case 'N':
4891 /* these do not guarantee ASCII characters */
4892 return -1;
4893 default:
4894 /* count the backslash + the other character */
4895 length += 2;
4896 }
4897 }
4898 }
4899 return length;
4900}
4901
4902/* Similar to PyUnicode_WRITE but either write into wstr field
4903 or treat string as ASCII. */
4904#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
4905 do { \
4906 if ((kind) != PyUnicode_WCHAR_KIND) \
4907 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
4908 else \
4909 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
4910 } while (0)
4911
4912#define WRITE_WSTR(buf, index, value) \
4913 assert(kind == PyUnicode_WCHAR_KIND), \
4914 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
4915
4916
Fredrik Lundh06d12682001-01-24 07:59:11 +00004917static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004918
Alexander Belopolsky40018472011-02-26 01:02:56 +00004919PyObject *
4920PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004921 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004925 Py_ssize_t startinpos;
4926 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004931 char* message;
4932 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 PyObject *errorHandler = NULL;
4934 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004935 Py_ssize_t ascii_length;
4936 Py_ssize_t i;
4937 int kind;
4938 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004940 ascii_length = length_of_escaped_ascii_string(s, size);
4941
4942 /* After length_of_escaped_ascii_string() there are two alternatives,
4943 either the string is pure ASCII with named escapes like \n, etc.
4944 and we determined it's exact size (common case)
4945 or it contains \x, \u, ... escape sequences. then we create a
4946 legacy wchar string and resize it at the end of this function. */
4947 if (ascii_length >= 0) {
4948 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4949 if (!v)
4950 goto onError;
4951 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4952 kind = PyUnicode_1BYTE_KIND;
4953 data = PyUnicode_DATA(v);
4954 }
4955 else {
4956 /* Escaped strings will always be longer than the resulting
4957 Unicode string, so we start with size here and then reduce the
4958 length after conversion to the true value.
4959 (but if the error callback returns a long replacement string
4960 we'll have to allocate more space) */
4961 v = _PyUnicode_New(size);
4962 if (!v)
4963 goto onError;
4964 kind = PyUnicode_WCHAR_KIND;
4965 data = PyUnicode_AS_UNICODE(v);
4966 }
4967
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 if (size == 0)
4969 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004970 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004972
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 while (s < end) {
4974 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004975 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004978 if (kind == PyUnicode_WCHAR_KIND) {
4979 assert(i < _PyUnicode_WSTR_LENGTH(v));
4980 }
4981 else {
4982 /* The only case in which i == ascii_length is a backslash
4983 followed by a newline. */
4984 assert(i <= ascii_length);
4985 }
4986
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 /* Non-escape characters are interpreted as Unicode ordinals */
4988 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004989 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 continue;
4991 }
4992
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004993 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 /* \ - Escapes */
4995 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004996 c = *s++;
4997 if (s > end)
4998 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004999
5000 if (kind == PyUnicode_WCHAR_KIND) {
5001 assert(i < _PyUnicode_WSTR_LENGTH(v));
5002 }
5003 else {
5004 /* The only case in which i == ascii_length is a backslash
5005 followed by a newline. */
5006 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5007 }
5008
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005009 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005013 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5014 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5015 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5016 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5017 /* FF */
5018 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5019 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5020 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5021 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5022 /* VT */
5023 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5024 /* BEL, not classic C */
5025 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 case '0': case '1': case '2': case '3':
5029 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005030 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005031 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005032 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005033 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005034 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 break;
5038
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 /* hex escapes */
5040 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005042 digits = 2;
5043 message = "truncated \\xXX escape";
5044 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005048 digits = 4;
5049 message = "truncated \\uXXXX escape";
5050 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005053 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005054 digits = 8;
5055 message = "truncated \\UXXXXXXXX escape";
5056 hexescape:
5057 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005058 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 if (s+digits>end) {
5060 endinpos = size;
5061 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 errors, &errorHandler,
5063 "unicodeescape", "end of string in escape sequence",
5064 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005065 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005066 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 goto nextByte;
5069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005070 for (j = 0; j < digits; ++j) {
5071 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005072 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073 endinpos = (s+j+1)-starts;
5074 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 errors, &errorHandler,
5077 "unicodeescape", message,
5078 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005080 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005081 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005083 }
5084 chr = (chr<<4) & ~0xF;
5085 if (c >= '0' && c <= '9')
5086 chr += c - '0';
5087 else if (c >= 'a' && c <= 'f')
5088 chr += 10 + c - 'a';
5089 else
5090 chr += 10 + c - 'A';
5091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005093 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 /* _decoding_error will have already written into the
5095 target buffer. */
5096 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005097 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005098 /* when we get here, chr is a 32-bit unicode character */
5099 if (chr <= 0xffff)
5100 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005101 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005102 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005103 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005104 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005105#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005106 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005107#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005108 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5110 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005111#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005112 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005114 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 errors, &errorHandler,
5117 "unicodeescape", "illegal Unicode character",
5118 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005119 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005120 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005122 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005123 break;
5124
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005126 case 'N':
5127 message = "malformed \\N character escape";
5128 if (ucnhash_CAPI == NULL) {
5129 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5131 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005132 if (ucnhash_CAPI == NULL)
5133 goto ucnhashError;
5134 }
5135 if (*s == '{') {
5136 const char *start = s+1;
5137 /* look for the closing brace */
5138 while (*s != '}' && s < end)
5139 s++;
5140 if (s > start && s < end && *s == '}') {
5141 /* found a name. look it up in the unicode database */
5142 message = "unknown Unicode character name";
5143 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005144 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5145 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005146 goto store;
5147 }
5148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 errors, &errorHandler,
5153 "unicodeescape", message,
5154 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005156 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005158 break;
5159
5160 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005161 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005162 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 message = "\\ at end of string";
5164 s--;
5165 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 errors, &errorHandler,
5169 "unicodeescape", message,
5170 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005171 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005172 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005174 }
5175 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5177 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005178 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005179 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005184 /* Ensure the length prediction worked in case of ASCII strings */
5185 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5186
5187 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5188 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005190 Py_XDECREF(errorHandler);
5191 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005195 PyErr_SetString(
5196 PyExc_UnicodeError,
5197 "\\N escapes not supported (can't load unicodedata module)"
5198 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005199 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 Py_XDECREF(errorHandler);
5201 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005202 return NULL;
5203
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 Py_XDECREF(errorHandler);
5207 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 return NULL;
5209}
5210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005211#undef WRITE_ASCII_OR_WSTR
5212#undef WRITE_WSTR
5213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214/* Return a Unicode-Escape string version of the Unicode object.
5215
5216 If quotes is true, the string is enclosed in u"" or u'' quotes as
5217 appropriate.
5218
5219*/
5220
Walter Dörwald79e913e2007-05-12 11:08:06 +00005221static const char *hexdigits = "0123456789abcdef";
5222
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyObject *
5224PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005227 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005230#ifdef Py_UNICODE_WIDE
5231 const Py_ssize_t expandsize = 10;
5232#else
5233 const Py_ssize_t expandsize = 6;
5234#endif
5235
Thomas Wouters89f507f2006-12-13 04:49:30 +00005236 /* XXX(nnorwitz): rather than over-allocating, it would be
5237 better to choose a different scheme. Perhaps scan the
5238 first N-chars of the string and allocate based on that size.
5239 */
5240 /* Initial allocation is based on the longest-possible unichr
5241 escape.
5242
5243 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5244 unichr, so in this case it's the longest unichr escape. In
5245 narrow (UTF-16) builds this is five chars per source unichr
5246 since there are two unichrs in the surrogate pair, so in narrow
5247 (UTF-16) builds it's not the longest unichr escape.
5248
5249 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5250 so in the narrow (UTF-16) build case it's the longest unichr
5251 escape.
5252 */
5253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005254 if (size == 0)
5255 return PyBytes_FromStringAndSize(NULL, 0);
5256
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005257 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005260 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 2
5262 + expandsize*size
5263 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 if (repr == NULL)
5265 return NULL;
5266
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005267 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 while (size-- > 0) {
5270 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005271
Walter Dörwald79e913e2007-05-12 11:08:06 +00005272 /* Escape backslashes */
5273 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 *p++ = '\\';
5275 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005276 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005277 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005278
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005279#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005280 /* Map 21-bit characters to '\U00xxxxxx' */
5281 else if (ch >= 0x10000) {
5282 *p++ = '\\';
5283 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005284 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5285 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5286 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5287 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5288 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5289 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5290 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5291 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005292 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005293 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005294#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5296 else if (ch >= 0xD800 && ch < 0xDC00) {
5297 Py_UNICODE ch2;
5298 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 ch2 = *s++;
5301 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005302 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5304 *p++ = '\\';
5305 *p++ = 'U';
5306 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5307 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5308 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5309 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5310 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5311 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5312 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5313 *p++ = hexdigits[ucs & 0x0000000F];
5314 continue;
5315 }
5316 /* Fall through: isolated surrogates are copied as-is */
5317 s--;
5318 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005319 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005320#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005321
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005323 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 *p++ = '\\';
5325 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005326 *p++ = hexdigits[(ch >> 12) & 0x000F];
5327 *p++ = hexdigits[(ch >> 8) & 0x000F];
5328 *p++ = hexdigits[(ch >> 4) & 0x000F];
5329 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005331
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005332 /* Map special whitespace to '\t', \n', '\r' */
5333 else if (ch == '\t') {
5334 *p++ = '\\';
5335 *p++ = 't';
5336 }
5337 else if (ch == '\n') {
5338 *p++ = '\\';
5339 *p++ = 'n';
5340 }
5341 else if (ch == '\r') {
5342 *p++ = '\\';
5343 *p++ = 'r';
5344 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005345
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005346 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005347 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005349 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005350 *p++ = hexdigits[(ch >> 4) & 0x000F];
5351 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005352 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 /* Copy everything else as-is */
5355 else
5356 *p++ = (char) ch;
5357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 assert(p - PyBytes_AS_STRING(repr) > 0);
5360 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5361 return NULL;
5362 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363}
5364
Alexander Belopolsky40018472011-02-26 01:02:56 +00005365PyObject *
5366PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005368 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 if (!PyUnicode_Check(unicode)) {
5370 PyErr_BadArgument();
5371 return NULL;
5372 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005373 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5374 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005375 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376}
5377
5378/* --- Raw Unicode Escape Codec ------------------------------------------- */
5379
Alexander Belopolsky40018472011-02-26 01:02:56 +00005380PyObject *
5381PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005382 Py_ssize_t size,
5383 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 Py_ssize_t startinpos;
5387 Py_ssize_t endinpos;
5388 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005390 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 const char *end;
5392 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 PyObject *errorHandler = NULL;
5394 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 /* Escaped strings will always be longer than the resulting
5397 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 length after conversion to the true value. (But decoding error
5399 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 v = _PyUnicode_New(size);
5401 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 end = s + size;
5407 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 unsigned char c;
5409 Py_UCS4 x;
5410 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005411 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 /* Non-escape characters are interpreted as Unicode ordinals */
5414 if (*s != '\\') {
5415 *p++ = (unsigned char)*s++;
5416 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005417 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 startinpos = s-starts;
5419
5420 /* \u-escapes are only interpreted iff the number of leading
5421 backslashes if odd */
5422 bs = s;
5423 for (;s < end;) {
5424 if (*s != '\\')
5425 break;
5426 *p++ = (unsigned char)*s++;
5427 }
5428 if (((s - bs) & 1) == 0 ||
5429 s >= end ||
5430 (*s != 'u' && *s != 'U')) {
5431 continue;
5432 }
5433 p--;
5434 count = *s=='u' ? 4 : 8;
5435 s++;
5436
5437 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5438 outpos = p-PyUnicode_AS_UNICODE(v);
5439 for (x = 0, i = 0; i < count; ++i, ++s) {
5440 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005441 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 endinpos = s-starts;
5443 if (unicode_decode_call_errorhandler(
5444 errors, &errorHandler,
5445 "rawunicodeescape", "truncated \\uXXXX",
5446 &starts, &end, &startinpos, &endinpos, &exc, &s,
5447 &v, &outpos, &p))
5448 goto onError;
5449 goto nextByte;
5450 }
5451 x = (x<<4) & ~0xF;
5452 if (c >= '0' && c <= '9')
5453 x += c - '0';
5454 else if (c >= 'a' && c <= 'f')
5455 x += 10 + c - 'a';
5456 else
5457 x += 10 + c - 'A';
5458 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005459 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 /* UCS-2 character */
5461 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005462 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 /* UCS-4 character. Either store directly, or as
5464 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005465#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005467#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 x -= 0x10000L;
5469 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5470 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005471#endif
5472 } else {
5473 endinpos = s-starts;
5474 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005475 if (unicode_decode_call_errorhandler(
5476 errors, &errorHandler,
5477 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 &starts, &end, &startinpos, &endinpos, &exc, &s,
5479 &v, &outpos, &p))
5480 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005481 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 nextByte:
5483 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005485 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 Py_XDECREF(errorHandler);
5488 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 if (PyUnicode_READY(v) == -1) {
5490 Py_DECREF(v);
5491 return NULL;
5492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005494
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 Py_XDECREF(errorHandler);
5498 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 return NULL;
5500}
5501
Alexander Belopolsky40018472011-02-26 01:02:56 +00005502PyObject *
5503PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005504 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 char *p;
5508 char *q;
5509
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005510#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005511 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005512#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005513 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005514#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005515
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005516 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005518
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005519 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 if (repr == NULL)
5521 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005522 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005523 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005525 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 while (size-- > 0) {
5527 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005528#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 /* Map 32-bit characters to '\Uxxxxxxxx' */
5530 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005531 *p++ = '\\';
5532 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005533 *p++ = hexdigits[(ch >> 28) & 0xf];
5534 *p++ = hexdigits[(ch >> 24) & 0xf];
5535 *p++ = hexdigits[(ch >> 20) & 0xf];
5536 *p++ = hexdigits[(ch >> 16) & 0xf];
5537 *p++ = hexdigits[(ch >> 12) & 0xf];
5538 *p++ = hexdigits[(ch >> 8) & 0xf];
5539 *p++ = hexdigits[(ch >> 4) & 0xf];
5540 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005541 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005542 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005543#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5545 if (ch >= 0xD800 && ch < 0xDC00) {
5546 Py_UNICODE ch2;
5547 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005548
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 ch2 = *s++;
5550 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005551 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5553 *p++ = '\\';
5554 *p++ = 'U';
5555 *p++ = hexdigits[(ucs >> 28) & 0xf];
5556 *p++ = hexdigits[(ucs >> 24) & 0xf];
5557 *p++ = hexdigits[(ucs >> 20) & 0xf];
5558 *p++ = hexdigits[(ucs >> 16) & 0xf];
5559 *p++ = hexdigits[(ucs >> 12) & 0xf];
5560 *p++ = hexdigits[(ucs >> 8) & 0xf];
5561 *p++ = hexdigits[(ucs >> 4) & 0xf];
5562 *p++ = hexdigits[ucs & 0xf];
5563 continue;
5564 }
5565 /* Fall through: isolated surrogates are copied as-is */
5566 s--;
5567 size++;
5568 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005569#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* Map 16-bit characters to '\uxxxx' */
5571 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 *p++ = '\\';
5573 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005574 *p++ = hexdigits[(ch >> 12) & 0xf];
5575 *p++ = hexdigits[(ch >> 8) & 0xf];
5576 *p++ = hexdigits[(ch >> 4) & 0xf];
5577 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 /* Copy everything else as-is */
5580 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 *p++ = (char) ch;
5582 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005583 size = p - q;
5584
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005585 assert(size > 0);
5586 if (_PyBytes_Resize(&repr, size) < 0)
5587 return NULL;
5588 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589}
5590
Alexander Belopolsky40018472011-02-26 01:02:56 +00005591PyObject *
5592PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005594 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005596 PyErr_BadArgument();
5597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005599 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5600 PyUnicode_GET_SIZE(unicode));
5601
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005602 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603}
5604
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005605/* --- Unicode Internal Codec ------------------------------------------- */
5606
Alexander Belopolsky40018472011-02-26 01:02:56 +00005607PyObject *
5608_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005609 Py_ssize_t size,
5610 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611{
5612 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005613 Py_ssize_t startinpos;
5614 Py_ssize_t endinpos;
5615 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005616 PyUnicodeObject *v;
5617 Py_UNICODE *p;
5618 const char *end;
5619 const char *reason;
5620 PyObject *errorHandler = NULL;
5621 PyObject *exc = NULL;
5622
Neal Norwitzd43069c2006-01-08 01:12:10 +00005623#ifdef Py_UNICODE_WIDE
5624 Py_UNICODE unimax = PyUnicode_GetMax();
5625#endif
5626
Thomas Wouters89f507f2006-12-13 04:49:30 +00005627 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005628 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5629 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5632 as string was created with the old API. */
5633 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005635 p = PyUnicode_AS_UNICODE(v);
5636 end = s + size;
5637
5638 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005639 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005640 /* We have to sanity check the raw data, otherwise doom looms for
5641 some malformed UCS-4 data. */
5642 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005643#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005644 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005645#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005646 end-s < Py_UNICODE_SIZE
5647 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005649 startinpos = s - starts;
5650 if (end-s < Py_UNICODE_SIZE) {
5651 endinpos = end-starts;
5652 reason = "truncated input";
5653 }
5654 else {
5655 endinpos = s - starts + Py_UNICODE_SIZE;
5656 reason = "illegal code point (> 0x10FFFF)";
5657 }
5658 outpos = p - PyUnicode_AS_UNICODE(v);
5659 if (unicode_decode_call_errorhandler(
5660 errors, &errorHandler,
5661 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005662 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005663 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005664 goto onError;
5665 }
5666 }
5667 else {
5668 p++;
5669 s += Py_UNICODE_SIZE;
5670 }
5671 }
5672
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005673 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005674 goto onError;
5675 Py_XDECREF(errorHandler);
5676 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 if (PyUnicode_READY(v) == -1) {
5678 Py_DECREF(v);
5679 return NULL;
5680 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005681 return (PyObject *)v;
5682
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005684 Py_XDECREF(v);
5685 Py_XDECREF(errorHandler);
5686 Py_XDECREF(exc);
5687 return NULL;
5688}
5689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690/* --- Latin-1 Codec ------------------------------------------------------ */
5691
Alexander Belopolsky40018472011-02-26 01:02:56 +00005692PyObject *
5693PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005694 Py_ssize_t size,
5695 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005698 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699}
5700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005702static void
5703make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005704 const char *encoding,
5705 const Py_UNICODE *unicode, Py_ssize_t size,
5706 Py_ssize_t startpos, Py_ssize_t endpos,
5707 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 *exceptionObject = PyUnicodeEncodeError_Create(
5711 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
5713 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5715 goto onError;
5716 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5717 goto onError;
5718 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5719 goto onError;
5720 return;
5721 onError:
5722 Py_DECREF(*exceptionObject);
5723 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
5725}
5726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005728static void
5729raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005730 const char *encoding,
5731 const Py_UNICODE *unicode, Py_ssize_t size,
5732 Py_ssize_t startpos, Py_ssize_t endpos,
5733 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734{
5735 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739}
5740
5741/* error handling callback helper:
5742 build arguments, call the callback and check the arguments,
5743 put the result into newpos and return the replacement string, which
5744 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005745static PyObject *
5746unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005747 PyObject **errorHandler,
5748 const char *encoding, const char *reason,
5749 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5750 Py_ssize_t startpos, Py_ssize_t endpos,
5751 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005753 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754
5755 PyObject *restuple;
5756 PyObject *resunicode;
5757
5758 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 }
5763
5764 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768
5769 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005774 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 Py_DECREF(restuple);
5776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005778 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 &resunicode, newpos)) {
5780 Py_DECREF(restuple);
5781 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005783 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5784 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5785 Py_DECREF(restuple);
5786 return NULL;
5787 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005790 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5792 Py_DECREF(restuple);
5793 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_INCREF(resunicode);
5796 Py_DECREF(restuple);
5797 return resunicode;
5798}
5799
Alexander Belopolsky40018472011-02-26 01:02:56 +00005800static PyObject *
5801unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005802 Py_ssize_t size,
5803 const char *errors,
5804 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805{
5806 /* output object */
5807 PyObject *res;
5808 /* pointers to the beginning and end+1 of input */
5809 const Py_UNICODE *startp = p;
5810 const Py_UNICODE *endp = p + size;
5811 /* pointer to the beginning of the unencodable characters */
5812 /* const Py_UNICODE *badp = NULL; */
5813 /* pointer into the output */
5814 char *str;
5815 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005816 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005817 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5818 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 PyObject *errorHandler = NULL;
5820 PyObject *exc = NULL;
5821 /* the following variable is used for caching string comparisons
5822 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5823 int known_errorHandler = -1;
5824
5825 /* allocate enough for a simple encoding without
5826 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005827 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005828 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005829 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005831 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 ressize = size;
5834
5835 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 /* can we encode this? */
5839 if (c<limit) {
5840 /* no overflow check, because we know that the space is enough */
5841 *str++ = (char)c;
5842 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005843 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 else {
5845 Py_ssize_t unicodepos = p-startp;
5846 Py_ssize_t requiredsize;
5847 PyObject *repunicode;
5848 Py_ssize_t repsize;
5849 Py_ssize_t newpos;
5850 Py_ssize_t respos;
5851 Py_UNICODE *uni2;
5852 /* startpos for collecting unencodable chars */
5853 const Py_UNICODE *collstart = p;
5854 const Py_UNICODE *collend = p;
5855 /* find all unecodable characters */
5856 while ((collend < endp) && ((*collend)>=limit))
5857 ++collend;
5858 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5859 if (known_errorHandler==-1) {
5860 if ((errors==NULL) || (!strcmp(errors, "strict")))
5861 known_errorHandler = 1;
5862 else if (!strcmp(errors, "replace"))
5863 known_errorHandler = 2;
5864 else if (!strcmp(errors, "ignore"))
5865 known_errorHandler = 3;
5866 else if (!strcmp(errors, "xmlcharrefreplace"))
5867 known_errorHandler = 4;
5868 else
5869 known_errorHandler = 0;
5870 }
5871 switch (known_errorHandler) {
5872 case 1: /* strict */
5873 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5874 goto onError;
5875 case 2: /* replace */
5876 while (collstart++<collend)
5877 *str++ = '?'; /* fall through */
5878 case 3: /* ignore */
5879 p = collend;
5880 break;
5881 case 4: /* xmlcharrefreplace */
5882 respos = str - PyBytes_AS_STRING(res);
5883 /* determine replacement size (temporarily (mis)uses p) */
5884 for (p = collstart, repsize = 0; p < collend; ++p) {
5885 if (*p<10)
5886 repsize += 2+1+1;
5887 else if (*p<100)
5888 repsize += 2+2+1;
5889 else if (*p<1000)
5890 repsize += 2+3+1;
5891 else if (*p<10000)
5892 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005893#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 else
5895 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 else if (*p<100000)
5898 repsize += 2+5+1;
5899 else if (*p<1000000)
5900 repsize += 2+6+1;
5901 else
5902 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005903#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 }
5905 requiredsize = respos+repsize+(endp-collend);
5906 if (requiredsize > ressize) {
5907 if (requiredsize<2*ressize)
5908 requiredsize = 2*ressize;
5909 if (_PyBytes_Resize(&res, requiredsize))
5910 goto onError;
5911 str = PyBytes_AS_STRING(res) + respos;
5912 ressize = requiredsize;
5913 }
5914 /* generate replacement (temporarily (mis)uses p) */
5915 for (p = collstart; p < collend; ++p) {
5916 str += sprintf(str, "&#%d;", (int)*p);
5917 }
5918 p = collend;
5919 break;
5920 default:
5921 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5922 encoding, reason, startp, size, &exc,
5923 collstart-startp, collend-startp, &newpos);
5924 if (repunicode == NULL)
5925 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005926 if (PyBytes_Check(repunicode)) {
5927 /* Directly copy bytes result to output. */
5928 repsize = PyBytes_Size(repunicode);
5929 if (repsize > 1) {
5930 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005931 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005932 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5933 Py_DECREF(repunicode);
5934 goto onError;
5935 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005936 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005937 ressize += repsize-1;
5938 }
5939 memcpy(str, PyBytes_AsString(repunicode), repsize);
5940 str += repsize;
5941 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005942 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005943 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 /* need more space? (at least enough for what we
5946 have+the replacement+the rest of the string, so
5947 we won't have to check space for encodable characters) */
5948 respos = str - PyBytes_AS_STRING(res);
5949 repsize = PyUnicode_GET_SIZE(repunicode);
5950 requiredsize = respos+repsize+(endp-collend);
5951 if (requiredsize > ressize) {
5952 if (requiredsize<2*ressize)
5953 requiredsize = 2*ressize;
5954 if (_PyBytes_Resize(&res, requiredsize)) {
5955 Py_DECREF(repunicode);
5956 goto onError;
5957 }
5958 str = PyBytes_AS_STRING(res) + respos;
5959 ressize = requiredsize;
5960 }
5961 /* check if there is anything unencodable in the replacement
5962 and copy it to the output */
5963 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5964 c = *uni2;
5965 if (c >= limit) {
5966 raise_encode_exception(&exc, encoding, startp, size,
5967 unicodepos, unicodepos+1, reason);
5968 Py_DECREF(repunicode);
5969 goto onError;
5970 }
5971 *str = (char)c;
5972 }
5973 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005974 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005976 }
5977 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 /* Resize if we allocated to much */
5979 size = str - PyBytes_AS_STRING(res);
5980 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005981 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 if (_PyBytes_Resize(&res, size) < 0)
5983 goto onError;
5984 }
5985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005986 Py_XDECREF(errorHandler);
5987 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005988 return res;
5989
5990 onError:
5991 Py_XDECREF(res);
5992 Py_XDECREF(errorHandler);
5993 Py_XDECREF(exc);
5994 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005995}
5996
Alexander Belopolsky40018472011-02-26 01:02:56 +00005997PyObject *
5998PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005999 Py_ssize_t size,
6000 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
Alexander Belopolsky40018472011-02-26 01:02:56 +00006005PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006006_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
6008 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 PyErr_BadArgument();
6010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 if (PyUnicode_READY(unicode) == -1)
6013 return NULL;
6014 /* Fast path: if it is a one-byte string, construct
6015 bytes object directly. */
6016 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6017 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6018 PyUnicode_GET_LENGTH(unicode));
6019 /* Non-Latin-1 characters present. Defer to above function to
6020 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023 errors);
6024}
6025
6026PyObject*
6027PyUnicode_AsLatin1String(PyObject *unicode)
6028{
6029 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030}
6031
6032/* --- 7-bit ASCII Codec -------------------------------------------------- */
6033
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034PyObject *
6035PyUnicode_DecodeASCII(const char *s,
6036 Py_ssize_t size,
6037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 PyUnicodeObject *v;
6041 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006042 Py_ssize_t startinpos;
6043 Py_ssize_t endinpos;
6044 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006046 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 PyObject *errorHandler = NULL;
6048 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006049 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 if (size == 1 && *(unsigned char*)s < 128)
6053 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6054
6055 /* Fast path. Assume the input actually *is* ASCII, and allocate
6056 a single-block Unicode object with that assumption. If there is
6057 an error, drop the object and start over. */
6058 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6059 if (v == NULL)
6060 goto onError;
6061 d = PyUnicode_1BYTE_DATA(v);
6062 for (i = 0; i < size; i++) {
6063 unsigned char ch = ((unsigned char*)s)[i];
6064 if (ch < 128)
6065 d[i] = ch;
6066 else
6067 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006069 if (i == size)
6070 return (PyObject*)v;
6071 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006072
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 v = _PyUnicode_New(size);
6074 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 e = s + size;
6080 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 register unsigned char c = (unsigned char)*s;
6082 if (c < 128) {
6083 *p++ = c;
6084 ++s;
6085 }
6086 else {
6087 startinpos = s-starts;
6088 endinpos = startinpos + 1;
6089 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6090 if (unicode_decode_call_errorhandler(
6091 errors, &errorHandler,
6092 "ascii", "ordinal not in range(128)",
6093 &starts, &e, &startinpos, &endinpos, &exc, &s,
6094 &v, &outpos, &p))
6095 goto onError;
6096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6100 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 Py_XDECREF(errorHandler);
6102 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006103 if (PyUnicode_READY(v) == -1) {
6104 Py_DECREF(v);
6105 return NULL;
6106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 Py_XDECREF(errorHandler);
6112 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 return NULL;
6114}
6115
Alexander Belopolsky40018472011-02-26 01:02:56 +00006116PyObject *
6117PyUnicode_EncodeASCII(const Py_UNICODE *p,
6118 Py_ssize_t size,
6119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122}
6123
Alexander Belopolsky40018472011-02-26 01:02:56 +00006124PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006125_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126{
6127 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 PyErr_BadArgument();
6129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006131 if (PyUnicode_READY(unicode) == -1)
6132 return NULL;
6133 /* Fast path: if it is an ASCII-only string, construct bytes object
6134 directly. Else defer to above function to raise the exception. */
6135 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6136 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6137 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006140 errors);
6141}
6142
6143PyObject *
6144PyUnicode_AsASCIIString(PyObject *unicode)
6145{
6146 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147}
6148
Victor Stinner99b95382011-07-04 14:23:54 +02006149#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006150
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006151/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006152
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006153#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006154#define NEED_RETRY
6155#endif
6156
6157/* XXX This code is limited to "true" double-byte encodings, as
6158 a) it assumes an incomplete character consists of a single byte, and
6159 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006161
/* Return non-zero if the byte at s[offset] is a DBCS lead byte, i.e. the
   first byte of a double-byte character rather than a trail byte that
   merely has a lead-byte value.  CharPrev() is used to find where the
   preceding character starts. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6173
6174/*
6175 * Decode MBCS string into unicode object. If 'final' is set, converts
6176 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6177 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006178static int
6179decode_mbcs(PyUnicodeObject **v,
6180 const char *s, /* MBCS string */
6181 int size, /* sizeof MBCS string */
6182 int final,
6183 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006184{
6185 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006186 Py_ssize_t n;
6187 DWORD usize;
6188 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006189
6190 assert(size >= 0);
6191
Victor Stinner554f3f02010-06-16 23:33:54 +00006192 /* check and handle 'errors' arg */
6193 if (errors==NULL || strcmp(errors, "strict")==0)
6194 flags = MB_ERR_INVALID_CHARS;
6195 else if (strcmp(errors, "ignore")==0)
6196 flags = 0;
6197 else {
6198 PyErr_Format(PyExc_ValueError,
6199 "mbcs encoding does not support errors='%s'",
6200 errors);
6201 return -1;
6202 }
6203
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006204 /* Skip trailing lead-byte unless 'final' is set */
6205 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006207
6208 /* First get the size of the result */
6209 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006210 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6211 if (usize==0)
6212 goto mbcs_decode_error;
6213 } else
6214 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006215
6216 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 /* Create unicode object */
6218 *v = _PyUnicode_New(usize);
6219 if (*v == NULL)
6220 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006221 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006222 }
6223 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 /* Extend unicode object */
6225 n = PyUnicode_GET_SIZE(*v);
6226 if (_PyUnicode_Resize(v, n + usize) < 0)
6227 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006228 }
6229
6230 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006231 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006233 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6234 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006237 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006238
6239mbcs_decode_error:
6240 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6241 we raise a UnicodeDecodeError - else it is a 'generic'
6242 windows error
6243 */
6244 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6245 /* Ideally, we should get reason from FormatMessage - this
6246 is the Windows 2000 English version of the message
6247 */
6248 PyObject *exc = NULL;
6249 const char *reason = "No mapping for the Unicode character exists "
6250 "in the target multi-byte code page.";
6251 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6252 if (exc != NULL) {
6253 PyCodec_StrictErrors(exc);
6254 Py_DECREF(exc);
6255 }
6256 } else {
6257 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6258 }
6259 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006260}
6261
Alexander Belopolsky40018472011-02-26 01:02:56 +00006262PyObject *
6263PyUnicode_DecodeMBCSStateful(const char *s,
6264 Py_ssize_t size,
6265 const char *errors,
6266 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006267{
6268 PyUnicodeObject *v = NULL;
6269 int done;
6270
6271 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006273
6274#ifdef NEED_RETRY
6275 retry:
6276 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006277 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006278 else
6279#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006280 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006281
6282 if (done < 0) {
6283 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006285 }
6286
6287 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006289
6290#ifdef NEED_RETRY
6291 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 s += done;
6293 size -= done;
6294 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006295 }
6296#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006297 if (PyUnicode_READY(v) == -1) {
6298 Py_DECREF(v);
6299 return NULL;
6300 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006301 return (PyObject *)v;
6302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
6305PyUnicode_DecodeMBCS(const char *s,
6306 Py_ssize_t size,
6307 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006308{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006309 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6310}
6311
6312/*
6313 * Convert unicode into string object (MBCS).
6314 * Returns 0 if succeed, -1 otherwise.
6315 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316static int
6317encode_mbcs(PyObject **repr,
6318 const Py_UNICODE *p, /* unicode */
6319 int size, /* size of unicode */
6320 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006321{
Victor Stinner554f3f02010-06-16 23:33:54 +00006322 BOOL usedDefaultChar = FALSE;
6323 BOOL *pusedDefaultChar;
6324 int mbcssize;
6325 Py_ssize_t n;
6326 PyObject *exc = NULL;
6327 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006328
6329 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006330
Victor Stinner554f3f02010-06-16 23:33:54 +00006331 /* check and handle 'errors' arg */
6332 if (errors==NULL || strcmp(errors, "strict")==0) {
6333 flags = WC_NO_BEST_FIT_CHARS;
6334 pusedDefaultChar = &usedDefaultChar;
6335 } else if (strcmp(errors, "replace")==0) {
6336 flags = 0;
6337 pusedDefaultChar = NULL;
6338 } else {
6339 PyErr_Format(PyExc_ValueError,
6340 "mbcs encoding does not support errors='%s'",
6341 errors);
6342 return -1;
6343 }
6344
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006345 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006346 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006347 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6348 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 if (mbcssize == 0) {
6350 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6351 return -1;
6352 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006353 /* If we used a default char, then we failed! */
6354 if (pusedDefaultChar && *pusedDefaultChar)
6355 goto mbcs_encode_error;
6356 } else {
6357 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006358 }
6359
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006360 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 /* Create string object */
6362 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6363 if (*repr == NULL)
6364 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006365 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006366 }
6367 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 /* Extend string object */
6369 n = PyBytes_Size(*repr);
6370 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6371 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006372 }
6373
6374 /* Do the conversion */
6375 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006377 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6378 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6380 return -1;
6381 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006382 if (pusedDefaultChar && *pusedDefaultChar)
6383 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006385 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006386
6387mbcs_encode_error:
6388 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6389 Py_XDECREF(exc);
6390 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006391}
6392
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
6394PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6395 Py_ssize_t size,
6396 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006397{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006398 PyObject *repr = NULL;
6399 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006400
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006401#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006403 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006404 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006405 else
6406#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006407 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006408
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006409 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 Py_XDECREF(repr);
6411 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006412 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006413
6414#ifdef NEED_RETRY
6415 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 p += INT_MAX;
6417 size -= INT_MAX;
6418 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006419 }
6420#endif
6421
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006422 return repr;
6423}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
6426PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006427{
6428 if (!PyUnicode_Check(unicode)) {
6429 PyErr_BadArgument();
6430 return NULL;
6431 }
6432 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 PyUnicode_GET_SIZE(unicode),
6434 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006435}
6436
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006437#undef NEED_RETRY
6438
Victor Stinner99b95382011-07-04 14:23:54 +02006439#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441/* --- Character Mapping Codec -------------------------------------------- */
6442
Alexander Belopolsky40018472011-02-26 01:02:56 +00006443PyObject *
6444PyUnicode_DecodeCharmap(const char *s,
6445 Py_ssize_t size,
6446 PyObject *mapping,
6447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006450 Py_ssize_t startinpos;
6451 Py_ssize_t endinpos;
6452 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 PyUnicodeObject *v;
6455 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 PyObject *errorHandler = NULL;
6458 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006459 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 /* Default to Latin-1 */
6463 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465
6466 v = _PyUnicode_New(size);
6467 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006473 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 mapstring = PyUnicode_AS_UNICODE(mapping);
6475 maplen = PyUnicode_GET_SIZE(mapping);
6476 while (s < e) {
6477 unsigned char ch = *s;
6478 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 if (ch < maplen)
6481 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 if (x == 0xfffe) {
6484 /* undefined mapping */
6485 outpos = p-PyUnicode_AS_UNICODE(v);
6486 startinpos = s-starts;
6487 endinpos = startinpos+1;
6488 if (unicode_decode_call_errorhandler(
6489 errors, &errorHandler,
6490 "charmap", "character maps to <undefined>",
6491 &starts, &e, &startinpos, &endinpos, &exc, &s,
6492 &v, &outpos, &p)) {
6493 goto onError;
6494 }
6495 continue;
6496 }
6497 *p++ = x;
6498 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006499 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006500 }
6501 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 while (s < e) {
6503 unsigned char ch = *s;
6504 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006505
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6507 w = PyLong_FromLong((long)ch);
6508 if (w == NULL)
6509 goto onError;
6510 x = PyObject_GetItem(mapping, w);
6511 Py_DECREF(w);
6512 if (x == NULL) {
6513 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6514 /* No mapping found means: mapping is undefined. */
6515 PyErr_Clear();
6516 x = Py_None;
6517 Py_INCREF(x);
6518 } else
6519 goto onError;
6520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006521
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 /* Apply mapping */
6523 if (PyLong_Check(x)) {
6524 long value = PyLong_AS_LONG(x);
6525 if (value < 0 || value > 65535) {
6526 PyErr_SetString(PyExc_TypeError,
6527 "character mapping must be in range(65536)");
6528 Py_DECREF(x);
6529 goto onError;
6530 }
6531 *p++ = (Py_UNICODE)value;
6532 }
6533 else if (x == Py_None) {
6534 /* undefined mapping */
6535 outpos = p-PyUnicode_AS_UNICODE(v);
6536 startinpos = s-starts;
6537 endinpos = startinpos+1;
6538 if (unicode_decode_call_errorhandler(
6539 errors, &errorHandler,
6540 "charmap", "character maps to <undefined>",
6541 &starts, &e, &startinpos, &endinpos, &exc, &s,
6542 &v, &outpos, &p)) {
6543 Py_DECREF(x);
6544 goto onError;
6545 }
6546 Py_DECREF(x);
6547 continue;
6548 }
6549 else if (PyUnicode_Check(x)) {
6550 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006551
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 if (targetsize == 1)
6553 /* 1-1 mapping */
6554 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006555
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 else if (targetsize > 1) {
6557 /* 1-n mapping */
6558 if (targetsize > extrachars) {
6559 /* resize first */
6560 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6561 Py_ssize_t needed = (targetsize - extrachars) + \
6562 (targetsize << 2);
6563 extrachars += needed;
6564 /* XXX overflow detection missing */
6565 if (_PyUnicode_Resize(&v,
6566 PyUnicode_GET_SIZE(v) + needed) < 0) {
6567 Py_DECREF(x);
6568 goto onError;
6569 }
6570 p = PyUnicode_AS_UNICODE(v) + oldpos;
6571 }
6572 Py_UNICODE_COPY(p,
6573 PyUnicode_AS_UNICODE(x),
6574 targetsize);
6575 p += targetsize;
6576 extrachars -= targetsize;
6577 }
6578 /* 1-0 mapping: skip the character */
6579 }
6580 else {
6581 /* wrong return value */
6582 PyErr_SetString(PyExc_TypeError,
6583 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584 Py_DECREF(x);
6585 goto onError;
6586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 Py_DECREF(x);
6588 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 }
6591 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6593 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006596 if (PyUnicode_READY(v) == -1) {
6597 Py_DECREF(v);
6598 return NULL;
6599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006601
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 Py_XDECREF(v);
6606 return NULL;
6607}
6608
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006609/* Charmap encoding: the lookup table */
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 PyObject_HEAD
6613 unsigned char level1[32];
6614 int count2, count3;
6615 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006616};
6617
6618static PyObject*
6619encoding_map_size(PyObject *obj, PyObject* args)
6620{
6621 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006624}
6625
6626static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006627 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 PyDoc_STR("Return the size (in bytes) of this object") },
6629 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006630};
6631
6632static void
6633encoding_map_dealloc(PyObject* o)
6634{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006635 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006636}
6637
6638static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006639 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 "EncodingMap", /*tp_name*/
6641 sizeof(struct encoding_map), /*tp_basicsize*/
6642 0, /*tp_itemsize*/
6643 /* methods */
6644 encoding_map_dealloc, /*tp_dealloc*/
6645 0, /*tp_print*/
6646 0, /*tp_getattr*/
6647 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006648 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 0, /*tp_repr*/
6650 0, /*tp_as_number*/
6651 0, /*tp_as_sequence*/
6652 0, /*tp_as_mapping*/
6653 0, /*tp_hash*/
6654 0, /*tp_call*/
6655 0, /*tp_str*/
6656 0, /*tp_getattro*/
6657 0, /*tp_setattro*/
6658 0, /*tp_as_buffer*/
6659 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6660 0, /*tp_doc*/
6661 0, /*tp_traverse*/
6662 0, /*tp_clear*/
6663 0, /*tp_richcompare*/
6664 0, /*tp_weaklistoffset*/
6665 0, /*tp_iter*/
6666 0, /*tp_iternext*/
6667 encoding_map_methods, /*tp_methods*/
6668 0, /*tp_members*/
6669 0, /*tp_getset*/
6670 0, /*tp_base*/
6671 0, /*tp_dict*/
6672 0, /*tp_descr_get*/
6673 0, /*tp_descr_set*/
6674 0, /*tp_dictoffset*/
6675 0, /*tp_init*/
6676 0, /*tp_alloc*/
6677 0, /*tp_new*/
6678 0, /*tp_free*/
6679 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006680};
6681
6682PyObject*
6683PyUnicode_BuildEncodingMap(PyObject* string)
6684{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006685 PyObject *result;
6686 struct encoding_map *mresult;
6687 int i;
6688 int need_dict = 0;
6689 unsigned char level1[32];
6690 unsigned char level2[512];
6691 unsigned char *mlevel1, *mlevel2, *mlevel3;
6692 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006693 int kind;
6694 void *data;
6695 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006697 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006698 PyErr_BadArgument();
6699 return NULL;
6700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006701 kind = PyUnicode_KIND(string);
6702 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006703 memset(level1, 0xFF, sizeof level1);
6704 memset(level2, 0xFF, sizeof level2);
6705
6706 /* If there isn't a one-to-one mapping of NULL to \0,
6707 or if there are non-BMP characters, we need to use
6708 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006709 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006710 need_dict = 1;
6711 for (i = 1; i < 256; i++) {
6712 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006713 ch = PyUnicode_READ(kind, data, i);
6714 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006715 need_dict = 1;
6716 break;
6717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006718 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006719 /* unmapped character */
6720 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006721 l1 = ch >> 11;
6722 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006723 if (level1[l1] == 0xFF)
6724 level1[l1] = count2++;
6725 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006726 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006727 }
6728
6729 if (count2 >= 0xFF || count3 >= 0xFF)
6730 need_dict = 1;
6731
6732 if (need_dict) {
6733 PyObject *result = PyDict_New();
6734 PyObject *key, *value;
6735 if (!result)
6736 return NULL;
6737 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006739 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006740 if (!key || !value)
6741 goto failed1;
6742 if (PyDict_SetItem(result, key, value) == -1)
6743 goto failed1;
6744 Py_DECREF(key);
6745 Py_DECREF(value);
6746 }
6747 return result;
6748 failed1:
6749 Py_XDECREF(key);
6750 Py_XDECREF(value);
6751 Py_DECREF(result);
6752 return NULL;
6753 }
6754
6755 /* Create a three-level trie */
6756 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6757 16*count2 + 128*count3 - 1);
6758 if (!result)
6759 return PyErr_NoMemory();
6760 PyObject_Init(result, &EncodingMapType);
6761 mresult = (struct encoding_map*)result;
6762 mresult->count2 = count2;
6763 mresult->count3 = count3;
6764 mlevel1 = mresult->level1;
6765 mlevel2 = mresult->level23;
6766 mlevel3 = mresult->level23 + 16*count2;
6767 memcpy(mlevel1, level1, 32);
6768 memset(mlevel2, 0xFF, 16*count2);
6769 memset(mlevel3, 0, 128*count3);
6770 count3 = 0;
6771 for (i = 1; i < 256; i++) {
6772 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006773 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006774 /* unmapped character */
6775 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006776 o1 = PyUnicode_READ(kind, data, i)>>11;
6777 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006778 i2 = 16*mlevel1[o1] + o2;
6779 if (mlevel2[i2] == 0xFF)
6780 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006781 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006782 i3 = 128*mlevel2[i2] + o3;
6783 mlevel3[i3] = i;
6784 }
6785 return result;
6786}
6787
6788static int
6789encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6790{
6791 struct encoding_map *map = (struct encoding_map*)mapping;
6792 int l1 = c>>11;
6793 int l2 = (c>>7) & 0xF;
6794 int l3 = c & 0x7F;
6795 int i;
6796
6797#ifdef Py_UNICODE_WIDE
6798 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006800 }
6801#endif
6802 if (c == 0)
6803 return 0;
6804 /* level 1*/
6805 i = map->level1[l1];
6806 if (i == 0xFF) {
6807 return -1;
6808 }
6809 /* level 2*/
6810 i = map->level23[16*i+l2];
6811 if (i == 0xFF) {
6812 return -1;
6813 }
6814 /* level 3 */
6815 i = map->level23[16*map->count2 + 128*i + l3];
6816 if (i == 0) {
6817 return -1;
6818 }
6819 return i;
6820}
6821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822/* Lookup the character ch in the mapping. If the character
6823 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006824 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825static PyObject *
6826charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827{
Christian Heimes217cfd12007-12-02 14:31:20 +00006828 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 PyObject *x;
6830
6831 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833 x = PyObject_GetItem(mapping, w);
6834 Py_DECREF(w);
6835 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6837 /* No mapping found means: mapping is undefined. */
6838 PyErr_Clear();
6839 x = Py_None;
6840 Py_INCREF(x);
6841 return x;
6842 } else
6843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006845 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006847 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 long value = PyLong_AS_LONG(x);
6849 if (value < 0 || value > 255) {
6850 PyErr_SetString(PyExc_TypeError,
6851 "character mapping must be in range(256)");
6852 Py_DECREF(x);
6853 return NULL;
6854 }
6855 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006857 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 /* wrong return value */
6861 PyErr_Format(PyExc_TypeError,
6862 "character mapping must return integer, bytes or None, not %.400s",
6863 x->ob_type->tp_name);
6864 Py_DECREF(x);
6865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 }
6867}
6868
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006869static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006870charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006871{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6873 /* exponentially overallocate to minimize reallocations */
6874 if (requiredsize < 2*outsize)
6875 requiredsize = 2*outsize;
6876 if (_PyBytes_Resize(outobj, requiredsize))
6877 return -1;
6878 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006879}
6880
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006885 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886 space is available. Return a new reference to the object that
6887 was put in the output buffer, or Py_None, if the mapping was undefined
6888 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006889 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890static charmapencode_result
6891charmapencode_output(Py_UNICODE c, PyObject *mapping,
6892 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006894 PyObject *rep;
6895 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006896 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897
Christian Heimes90aa7642007-12-19 02:45:37 +00006898 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006899 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006901 if (res == -1)
6902 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 if (outsize<requiredsize)
6904 if (charmapencode_resize(outobj, outpos, requiredsize))
6905 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006906 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 outstart[(*outpos)++] = (char)res;
6908 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006909 }
6910
6911 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006914 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 Py_DECREF(rep);
6916 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006917 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 if (PyLong_Check(rep)) {
6919 Py_ssize_t requiredsize = *outpos+1;
6920 if (outsize<requiredsize)
6921 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6922 Py_DECREF(rep);
6923 return enc_EXCEPTION;
6924 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006925 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 else {
6929 const char *repchars = PyBytes_AS_STRING(rep);
6930 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6931 Py_ssize_t requiredsize = *outpos+repsize;
6932 if (outsize<requiredsize)
6933 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6934 Py_DECREF(rep);
6935 return enc_EXCEPTION;
6936 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006937 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 memcpy(outstart + *outpos, repchars, repsize);
6939 *outpos += repsize;
6940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006942 Py_DECREF(rep);
6943 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944}
6945
6946/* handle an error in PyUnicode_EncodeCharmap
6947 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006948static int
6949charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006950 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006951 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006952 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006953 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954{
6955 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006956 Py_ssize_t repsize;
6957 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 Py_UNICODE *uni2;
6959 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006960 Py_ssize_t collstartpos = *inpos;
6961 Py_ssize_t collendpos = *inpos+1;
6962 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 char *encoding = "charmap";
6964 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006965 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967 /* find all unencodable characters */
6968 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006969 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006970 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 int res = encoding_map_lookup(p[collendpos], mapping);
6972 if (res != -1)
6973 break;
6974 ++collendpos;
6975 continue;
6976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006977
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 rep = charmapencode_lookup(p[collendpos], mapping);
6979 if (rep==NULL)
6980 return -1;
6981 else if (rep!=Py_None) {
6982 Py_DECREF(rep);
6983 break;
6984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006987 }
6988 /* cache callback name lookup
6989 * (if not done yet, i.e. it's the first error) */
6990 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 if ((errors==NULL) || (!strcmp(errors, "strict")))
6992 *known_errorHandler = 1;
6993 else if (!strcmp(errors, "replace"))
6994 *known_errorHandler = 2;
6995 else if (!strcmp(errors, "ignore"))
6996 *known_errorHandler = 3;
6997 else if (!strcmp(errors, "xmlcharrefreplace"))
6998 *known_errorHandler = 4;
6999 else
7000 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001 }
7002 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007003 case 1: /* strict */
7004 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7005 return -1;
7006 case 2: /* replace */
7007 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 x = charmapencode_output('?', mapping, res, respos);
7009 if (x==enc_EXCEPTION) {
7010 return -1;
7011 }
7012 else if (x==enc_FAILED) {
7013 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7014 return -1;
7015 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007016 }
7017 /* fall through */
7018 case 3: /* ignore */
7019 *inpos = collendpos;
7020 break;
7021 case 4: /* xmlcharrefreplace */
7022 /* generate replacement (temporarily (mis)uses p) */
7023 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 char buffer[2+29+1+1];
7025 char *cp;
7026 sprintf(buffer, "&#%d;", (int)p[collpos]);
7027 for (cp = buffer; *cp; ++cp) {
7028 x = charmapencode_output(*cp, mapping, res, respos);
7029 if (x==enc_EXCEPTION)
7030 return -1;
7031 else if (x==enc_FAILED) {
7032 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7033 return -1;
7034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035 }
7036 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007037 *inpos = collendpos;
7038 break;
7039 default:
7040 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 encoding, reason, p, size, exceptionObject,
7042 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007043 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007045 if (PyBytes_Check(repunicode)) {
7046 /* Directly copy bytes result to output. */
7047 Py_ssize_t outsize = PyBytes_Size(*res);
7048 Py_ssize_t requiredsize;
7049 repsize = PyBytes_Size(repunicode);
7050 requiredsize = *respos + repsize;
7051 if (requiredsize > outsize)
7052 /* Make room for all additional bytes. */
7053 if (charmapencode_resize(res, respos, requiredsize)) {
7054 Py_DECREF(repunicode);
7055 return -1;
7056 }
7057 memcpy(PyBytes_AsString(*res) + *respos,
7058 PyBytes_AsString(repunicode), repsize);
7059 *respos += repsize;
7060 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007061 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007062 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007063 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007064 /* generate replacement */
7065 repsize = PyUnicode_GET_SIZE(repunicode);
7066 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 x = charmapencode_output(*uni2, mapping, res, respos);
7068 if (x==enc_EXCEPTION) {
7069 return -1;
7070 }
7071 else if (x==enc_FAILED) {
7072 Py_DECREF(repunicode);
7073 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7074 return -1;
7075 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007076 }
7077 *inpos = newpos;
7078 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 }
7080 return 0;
7081}
7082
Alexander Belopolsky40018472011-02-26 01:02:56 +00007083PyObject *
7084PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7085 Py_ssize_t size,
7086 PyObject *mapping,
7087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007089 /* output object */
7090 PyObject *res = NULL;
7091 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007092 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007093 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007094 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 PyObject *errorHandler = NULL;
7096 PyObject *exc = NULL;
7097 /* the following variable is used for caching string comparisons
7098 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7099 * 3=ignore, 4=xmlcharrefreplace */
7100 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101
7102 /* Default to Latin-1 */
7103 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106 /* allocate enough for a simple encoding without
7107 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007108 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109 if (res == NULL)
7110 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007111 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007114 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 /* try to encode it */
7116 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7117 if (x==enc_EXCEPTION) /* error */
7118 goto onError;
7119 if (x==enc_FAILED) { /* unencodable character */
7120 if (charmap_encoding_error(p, size, &inpos, mapping,
7121 &exc,
7122 &known_errorHandler, &errorHandler, errors,
7123 &res, &respos)) {
7124 goto onError;
7125 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007126 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 else
7128 /* done with this character => adjust input position */
7129 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007132 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007133 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007134 if (_PyBytes_Resize(&res, respos) < 0)
7135 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137 Py_XDECREF(exc);
7138 Py_XDECREF(errorHandler);
7139 return res;
7140
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 Py_XDECREF(res);
7143 Py_XDECREF(exc);
7144 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return NULL;
7146}
7147
Alexander Belopolsky40018472011-02-26 01:02:56 +00007148PyObject *
7149PyUnicode_AsCharmapString(PyObject *unicode,
7150 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151{
7152 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 PyErr_BadArgument();
7154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 }
7156 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 PyUnicode_GET_SIZE(unicode),
7158 mapping,
7159 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160}
7161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163static void
7164make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007165 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007166 Py_ssize_t startpos, Py_ssize_t endpos,
7167 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170 *exceptionObject = _PyUnicodeTranslateError_Create(
7171 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 }
7173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7175 goto onError;
7176 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7177 goto onError;
7178 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7179 goto onError;
7180 return;
7181 onError:
7182 Py_DECREF(*exceptionObject);
7183 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 }
7185}
7186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007188static void
7189raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007190 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007191 Py_ssize_t startpos, Py_ssize_t endpos,
7192 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193{
7194 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007195 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007196 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198}
7199
7200/* error handling callback helper:
7201 build arguments, call the callback and check the arguments,
7202 put the result into newpos and return the replacement string, which
7203 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007204static PyObject *
7205unicode_translate_call_errorhandler(const char *errors,
7206 PyObject **errorHandler,
7207 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007208 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007209 Py_ssize_t startpos, Py_ssize_t endpos,
7210 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007212 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007214 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 PyObject *restuple;
7216 PyObject *resunicode;
7217
7218 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 }
7223
7224 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007225 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228
7229 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007234 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 Py_DECREF(restuple);
7236 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007237 }
7238 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 &resunicode, &i_newpos)) {
7240 Py_DECREF(restuple);
7241 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007244 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 else
7246 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007247 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7249 Py_DECREF(restuple);
7250 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 Py_INCREF(resunicode);
7253 Py_DECREF(restuple);
7254 return resunicode;
7255}
7256
7257/* Lookup the character ch in the mapping and put the result in result,
7258 which must be decrefed by the caller.
7259 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007260static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262{
Christian Heimes217cfd12007-12-02 14:31:20 +00007263 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264 PyObject *x;
7265
7266 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007268 x = PyObject_GetItem(mapping, w);
7269 Py_DECREF(w);
7270 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7272 /* No mapping found means: use 1:1 mapping. */
7273 PyErr_Clear();
7274 *result = NULL;
7275 return 0;
7276 } else
7277 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 }
7279 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 *result = x;
7281 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007283 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 long value = PyLong_AS_LONG(x);
7285 long max = PyUnicode_GetMax();
7286 if (value < 0 || value > max) {
7287 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007288 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 Py_DECREF(x);
7290 return -1;
7291 }
7292 *result = x;
7293 return 0;
7294 }
7295 else if (PyUnicode_Check(x)) {
7296 *result = x;
7297 return 0;
7298 }
7299 else {
7300 /* wrong return value */
7301 PyErr_SetString(PyExc_TypeError,
7302 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007303 Py_DECREF(x);
7304 return -1;
7305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306}
7307/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 if not reallocate and adjust various state variables.
7309 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007310static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007311charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007315 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 /* exponentially overallocate to minimize reallocations */
7317 if (requiredsize < 2 * oldsize)
7318 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007319 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7320 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007322 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 }
7324 return 0;
7325}
7326/* lookup the character, put the result in the output string and adjust
7327 various state variables. Return a new reference to the object that
7328 was put in the output buffer in *result, or Py_None, if the mapping was
7329 undefined (in which case no character was written).
7330 The called must decref result.
7331 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007332static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7334 PyObject *mapping, Py_UCS4 **output,
7335 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007336 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007338 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7339 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007343 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007344 }
7345 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007347 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007349 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350 }
7351 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007352 Py_ssize_t repsize;
7353 if (PyUnicode_READY(*res) == -1)
7354 return -1;
7355 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 if (repsize==1) {
7357 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007358 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 }
7360 else if (repsize!=0) {
7361 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007362 Py_ssize_t requiredsize = *opos +
7363 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007365 Py_ssize_t i;
7366 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368 for(i = 0; i < repsize; i++)
7369 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007371 }
7372 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007374 return 0;
7375}
7376
Alexander Belopolsky40018472011-02-26 01:02:56 +00007377PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007378_PyUnicode_TranslateCharmap(PyObject *input,
7379 PyObject *mapping,
7380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007382 /* input object */
7383 char *idata;
7384 Py_ssize_t size, i;
7385 int kind;
7386 /* output buffer */
7387 Py_UCS4 *output = NULL;
7388 Py_ssize_t osize;
7389 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007390 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392 char *reason = "character maps to <undefined>";
7393 PyObject *errorHandler = NULL;
7394 PyObject *exc = NULL;
7395 /* the following variable is used for caching string comparisons
7396 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7397 * 3=ignore, 4=xmlcharrefreplace */
7398 int known_errorHandler = -1;
7399
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 PyErr_BadArgument();
7402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007405 if (PyUnicode_READY(input) == -1)
7406 return NULL;
7407 idata = (char*)PyUnicode_DATA(input);
7408 kind = PyUnicode_KIND(input);
7409 size = PyUnicode_GET_LENGTH(input);
7410 i = 0;
7411
7412 if (size == 0) {
7413 Py_INCREF(input);
7414 return input;
7415 }
7416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007417 /* allocate enough for a simple 1:1 translation without
7418 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007419 osize = size;
7420 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7421 opos = 0;
7422 if (output == NULL) {
7423 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007427 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 /* try to encode it */
7429 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007430 if (charmaptranslate_output(input, i, mapping,
7431 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 Py_XDECREF(x);
7433 goto onError;
7434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007437 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 else { /* untranslatable character */
7439 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7440 Py_ssize_t repsize;
7441 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007442 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 Py_ssize_t collstart = i;
7445 Py_ssize_t collend = i+1;
7446 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007449 while (collend < size) {
7450 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 goto onError;
7452 Py_XDECREF(x);
7453 if (x!=Py_None)
7454 break;
7455 ++collend;
7456 }
7457 /* cache callback name lookup
7458 * (if not done yet, i.e. it's the first error) */
7459 if (known_errorHandler==-1) {
7460 if ((errors==NULL) || (!strcmp(errors, "strict")))
7461 known_errorHandler = 1;
7462 else if (!strcmp(errors, "replace"))
7463 known_errorHandler = 2;
7464 else if (!strcmp(errors, "ignore"))
7465 known_errorHandler = 3;
7466 else if (!strcmp(errors, "xmlcharrefreplace"))
7467 known_errorHandler = 4;
7468 else
7469 known_errorHandler = 0;
7470 }
7471 switch (known_errorHandler) {
7472 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 raise_translate_exception(&exc, input, collstart,
7474 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007475 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 case 2: /* replace */
7477 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 for (coll = collstart; coll<collend; coll++)
7479 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 /* fall through */
7481 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 break;
7484 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007485 /* generate replacement (temporarily (mis)uses i) */
7486 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 char buffer[2+29+1+1];
7488 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7490 if (charmaptranslate_makespace(&output, &osize,
7491 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 goto onError;
7493 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007494 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 break;
7498 default:
7499 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007500 reason, input, &exc,
7501 collstart, collend, &newpos);
7502 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 goto onError;
7504 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007505 repsize = PyUnicode_GET_LENGTH(repunicode);
7506 if (charmaptranslate_makespace(&output, &osize,
7507 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 Py_DECREF(repunicode);
7509 goto onError;
7510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007511 for (uni2 = 0; repsize-->0; ++uni2)
7512 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7513 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007516 }
7517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007518 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7519 if (!res)
7520 goto onError;
7521 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007522 Py_XDECREF(exc);
7523 Py_XDECREF(errorHandler);
7524 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007527 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528 Py_XDECREF(exc);
7529 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 return NULL;
7531}
7532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007533/* Deprecated. Use PyUnicode_Translate instead. */
7534PyObject *
7535PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7536 Py_ssize_t size,
7537 PyObject *mapping,
7538 const char *errors)
7539{
7540 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7541 if (!unicode)
7542 return NULL;
7543 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7544}
7545
Alexander Belopolsky40018472011-02-26 01:02:56 +00007546PyObject *
7547PyUnicode_Translate(PyObject *str,
7548 PyObject *mapping,
7549 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550{
7551 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007552
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 str = PyUnicode_FromObject(str);
7554 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 Py_DECREF(str);
7558 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007559
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 Py_XDECREF(str);
7562 return NULL;
7563}
Tim Petersced69f82003-09-16 20:30:58 +00007564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007565static Py_UCS4
7566fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7567{
7568 /* No need to call PyUnicode_READY(self) because this function is only
7569 called as a callback from fixup() which does it already. */
7570 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7571 const int kind = PyUnicode_KIND(self);
7572 void *data = PyUnicode_DATA(self);
7573 Py_UCS4 maxchar = 0, ch, fixed;
7574 Py_ssize_t i;
7575
7576 for (i = 0; i < len; ++i) {
7577 ch = PyUnicode_READ(kind, data, i);
7578 fixed = 0;
7579 if (ch > 127) {
7580 if (Py_UNICODE_ISSPACE(ch))
7581 fixed = ' ';
7582 else {
7583 const int decimal = Py_UNICODE_TODECIMAL(ch);
7584 if (decimal >= 0)
7585 fixed = '0' + decimal;
7586 }
7587 if (fixed != 0) {
7588 if (fixed > maxchar)
7589 maxchar = fixed;
7590 PyUnicode_WRITE(kind, data, i, fixed);
7591 }
7592 else if (ch > maxchar)
7593 maxchar = ch;
7594 }
7595 else if (ch > maxchar)
7596 maxchar = ch;
7597 }
7598
7599 return maxchar;
7600}
7601
7602PyObject *
7603_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7604{
7605 if (!PyUnicode_Check(unicode)) {
7606 PyErr_BadInternalCall();
7607 return NULL;
7608 }
7609 if (PyUnicode_READY(unicode) == -1)
7610 return NULL;
7611 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7612 /* If the string is already ASCII, just return the same string */
7613 Py_INCREF(unicode);
7614 return unicode;
7615 }
7616 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7617}
7618
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007619PyObject *
7620PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7621 Py_ssize_t length)
7622{
7623 PyObject *result;
7624 Py_UNICODE *p; /* write pointer into result */
7625 Py_ssize_t i;
7626 /* Copy to a new string */
7627 result = (PyObject *)_PyUnicode_New(length);
7628 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7629 if (result == NULL)
7630 return result;
7631 p = PyUnicode_AS_UNICODE(result);
7632 /* Iterate over code points */
7633 for (i = 0; i < length; i++) {
7634 Py_UNICODE ch =s[i];
7635 if (ch > 127) {
7636 int decimal = Py_UNICODE_TODECIMAL(ch);
7637 if (decimal >= 0)
7638 p[i] = '0' + decimal;
7639 }
7640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007641 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7642 Py_DECREF(result);
7643 return NULL;
7644 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007645 return result;
7646}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007647/* --- Decimal Encoder ---------------------------------------------------- */
7648
Alexander Belopolsky40018472011-02-26 01:02:56 +00007649int
7650PyUnicode_EncodeDecimal(Py_UNICODE *s,
7651 Py_ssize_t length,
7652 char *output,
7653 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007654{
7655 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656 PyObject *errorHandler = NULL;
7657 PyObject *exc = NULL;
7658 const char *encoding = "decimal";
7659 const char *reason = "invalid decimal Unicode string";
7660 /* the following variable is used for caching string comparisons
7661 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7662 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007663
7664 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 PyErr_BadArgument();
7666 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007667 }
7668
7669 p = s;
7670 end = s + length;
7671 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 register Py_UNICODE ch = *p;
7673 int decimal;
7674 PyObject *repunicode;
7675 Py_ssize_t repsize;
7676 Py_ssize_t newpos;
7677 Py_UNICODE *uni2;
7678 Py_UNICODE *collstart;
7679 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007680
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007682 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 ++p;
7684 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 decimal = Py_UNICODE_TODECIMAL(ch);
7687 if (decimal >= 0) {
7688 *output++ = '0' + decimal;
7689 ++p;
7690 continue;
7691 }
7692 if (0 < ch && ch < 256) {
7693 *output++ = (char)ch;
7694 ++p;
7695 continue;
7696 }
7697 /* All other characters are considered unencodable */
7698 collstart = p;
7699 collend = p+1;
7700 while (collend < end) {
7701 if ((0 < *collend && *collend < 256) ||
7702 !Py_UNICODE_ISSPACE(*collend) ||
7703 Py_UNICODE_TODECIMAL(*collend))
7704 break;
7705 }
7706 /* cache callback name lookup
7707 * (if not done yet, i.e. it's the first error) */
7708 if (known_errorHandler==-1) {
7709 if ((errors==NULL) || (!strcmp(errors, "strict")))
7710 known_errorHandler = 1;
7711 else if (!strcmp(errors, "replace"))
7712 known_errorHandler = 2;
7713 else if (!strcmp(errors, "ignore"))
7714 known_errorHandler = 3;
7715 else if (!strcmp(errors, "xmlcharrefreplace"))
7716 known_errorHandler = 4;
7717 else
7718 known_errorHandler = 0;
7719 }
7720 switch (known_errorHandler) {
7721 case 1: /* strict */
7722 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7723 goto onError;
7724 case 2: /* replace */
7725 for (p = collstart; p < collend; ++p)
7726 *output++ = '?';
7727 /* fall through */
7728 case 3: /* ignore */
7729 p = collend;
7730 break;
7731 case 4: /* xmlcharrefreplace */
7732 /* generate replacement (temporarily (mis)uses p) */
7733 for (p = collstart; p < collend; ++p)
7734 output += sprintf(output, "&#%d;", (int)*p);
7735 p = collend;
7736 break;
7737 default:
7738 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7739 encoding, reason, s, length, &exc,
7740 collstart-s, collend-s, &newpos);
7741 if (repunicode == NULL)
7742 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007743 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007744 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007745 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7746 Py_DECREF(repunicode);
7747 goto onError;
7748 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 /* generate replacement */
7750 repsize = PyUnicode_GET_SIZE(repunicode);
7751 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7752 Py_UNICODE ch = *uni2;
7753 if (Py_UNICODE_ISSPACE(ch))
7754 *output++ = ' ';
7755 else {
7756 decimal = Py_UNICODE_TODECIMAL(ch);
7757 if (decimal >= 0)
7758 *output++ = '0' + decimal;
7759 else if (0 < ch && ch < 256)
7760 *output++ = (char)ch;
7761 else {
7762 Py_DECREF(repunicode);
7763 raise_encode_exception(&exc, encoding,
7764 s, length, collstart-s, collend-s, reason);
7765 goto onError;
7766 }
7767 }
7768 }
7769 p = s + newpos;
7770 Py_DECREF(repunicode);
7771 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007772 }
7773 /* 0-terminate the output string */
7774 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007775 Py_XDECREF(exc);
7776 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007777 return 0;
7778
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007780 Py_XDECREF(exc);
7781 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007782 return -1;
7783}
7784
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785/* --- Helpers ------------------------------------------------------------ */
7786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787#include "stringlib/ucs1lib.h"
7788#include "stringlib/fastsearch.h"
7789#include "stringlib/partition.h"
7790#include "stringlib/split.h"
7791#include "stringlib/count.h"
7792#include "stringlib/find.h"
7793#include "stringlib/localeutil.h"
7794#include "stringlib/undef.h"
7795
7796#include "stringlib/ucs2lib.h"
7797#include "stringlib/fastsearch.h"
7798#include "stringlib/partition.h"
7799#include "stringlib/split.h"
7800#include "stringlib/count.h"
7801#include "stringlib/find.h"
7802#include "stringlib/localeutil.h"
7803#include "stringlib/undef.h"
7804
7805#include "stringlib/ucs4lib.h"
7806#include "stringlib/fastsearch.h"
7807#include "stringlib/partition.h"
7808#include "stringlib/split.h"
7809#include "stringlib/count.h"
7810#include "stringlib/find.h"
7811#include "stringlib/localeutil.h"
7812#include "stringlib/undef.h"
7813
7814static Py_ssize_t
7815any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7816 const Py_UCS1*, Py_ssize_t,
7817 Py_ssize_t, Py_ssize_t),
7818 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7819 const Py_UCS2*, Py_ssize_t,
7820 Py_ssize_t, Py_ssize_t),
7821 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7822 const Py_UCS4*, Py_ssize_t,
7823 Py_ssize_t, Py_ssize_t),
7824 PyObject* s1, PyObject* s2,
7825 Py_ssize_t start,
7826 Py_ssize_t end)
7827{
7828 int kind1, kind2, kind;
7829 void *buf1, *buf2;
7830 Py_ssize_t len1, len2, result;
7831
7832 kind1 = PyUnicode_KIND(s1);
7833 kind2 = PyUnicode_KIND(s2);
7834 kind = kind1 > kind2 ? kind1 : kind2;
7835 buf1 = PyUnicode_DATA(s1);
7836 buf2 = PyUnicode_DATA(s2);
7837 if (kind1 != kind)
7838 buf1 = _PyUnicode_AsKind(s1, kind);
7839 if (!buf1)
7840 return -2;
7841 if (kind2 != kind)
7842 buf2 = _PyUnicode_AsKind(s2, kind);
7843 if (!buf2) {
7844 if (kind1 != kind) PyMem_Free(buf1);
7845 return -2;
7846 }
7847 len1 = PyUnicode_GET_LENGTH(s1);
7848 len2 = PyUnicode_GET_LENGTH(s2);
7849
7850 switch(kind) {
7851 case PyUnicode_1BYTE_KIND:
7852 result = ucs1(buf1, len1, buf2, len2, start, end);
7853 break;
7854 case PyUnicode_2BYTE_KIND:
7855 result = ucs2(buf1, len1, buf2, len2, start, end);
7856 break;
7857 case PyUnicode_4BYTE_KIND:
7858 result = ucs4(buf1, len1, buf2, len2, start, end);
7859 break;
7860 default:
7861 assert(0); result = -2;
7862 }
7863
7864 if (kind1 != kind)
7865 PyMem_Free(buf1);
7866 if (kind2 != kind)
7867 PyMem_Free(buf2);
7868
7869 return result;
7870}
7871
7872Py_ssize_t
7873_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7874 Py_ssize_t n_buffer,
7875 void *digits, Py_ssize_t n_digits,
7876 Py_ssize_t min_width,
7877 const char *grouping,
7878 const char *thousands_sep)
7879{
7880 switch(kind) {
7881 case PyUnicode_1BYTE_KIND:
7882 return _PyUnicode_ucs1_InsertThousandsGrouping(
7883 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7884 min_width, grouping, thousands_sep);
7885 case PyUnicode_2BYTE_KIND:
7886 return _PyUnicode_ucs2_InsertThousandsGrouping(
7887 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7888 min_width, grouping, thousands_sep);
7889 case PyUnicode_4BYTE_KIND:
7890 return _PyUnicode_ucs4_InsertThousandsGrouping(
7891 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7892 min_width, grouping, thousands_sep);
7893 }
7894 assert(0);
7895 return -1;
7896}
7897
7898
Eric Smith8c663262007-08-25 02:26:07 +00007899#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007900#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007901
Thomas Wouters477c8d52006-05-27 19:21:47 +00007902#include "stringlib/count.h"
7903#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007904
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added
   to it and is then clamped to 0.  `start` and `end` must be modifiable
   lvalues.  `len` may be any expression (it is parenthesized in the
   expansion) but is evaluated more than once, so it must be free of side
   effects.  The expansion is a sequence of statements, not a single
   expression. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > (len))                            \
        end = (len);                            \
    else if (end < 0) {                         \
        end += (len);                           \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += (len);                         \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007919
Alexander Belopolsky40018472011-02-26 01:02:56 +00007920Py_ssize_t
7921PyUnicode_Count(PyObject *str,
7922 PyObject *substr,
7923 Py_ssize_t start,
7924 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007926 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007927 PyUnicodeObject* str_obj;
7928 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 int kind1, kind2, kind;
7930 void *buf1 = NULL, *buf2 = NULL;
7931 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007932
Thomas Wouters477c8d52006-05-27 19:21:47 +00007933 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007936 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02007937 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 Py_DECREF(str_obj);
7939 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 }
Tim Petersced69f82003-09-16 20:30:58 +00007941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007942 kind1 = PyUnicode_KIND(str_obj);
7943 kind2 = PyUnicode_KIND(sub_obj);
7944 kind = kind1 > kind2 ? kind1 : kind2;
7945 buf1 = PyUnicode_DATA(str_obj);
7946 if (kind1 != kind)
7947 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7948 if (!buf1)
7949 goto onError;
7950 buf2 = PyUnicode_DATA(sub_obj);
7951 if (kind2 != kind)
7952 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7953 if (!buf2)
7954 goto onError;
7955 len1 = PyUnicode_GET_LENGTH(str_obj);
7956 len2 = PyUnicode_GET_LENGTH(sub_obj);
7957
7958 ADJUST_INDICES(start, end, len1);
7959 switch(kind) {
7960 case PyUnicode_1BYTE_KIND:
7961 result = ucs1lib_count(
7962 ((Py_UCS1*)buf1) + start, end - start,
7963 buf2, len2, PY_SSIZE_T_MAX
7964 );
7965 break;
7966 case PyUnicode_2BYTE_KIND:
7967 result = ucs2lib_count(
7968 ((Py_UCS2*)buf1) + start, end - start,
7969 buf2, len2, PY_SSIZE_T_MAX
7970 );
7971 break;
7972 case PyUnicode_4BYTE_KIND:
7973 result = ucs4lib_count(
7974 ((Py_UCS4*)buf1) + start, end - start,
7975 buf2, len2, PY_SSIZE_T_MAX
7976 );
7977 break;
7978 default:
7979 assert(0); result = 0;
7980 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007981
7982 Py_DECREF(sub_obj);
7983 Py_DECREF(str_obj);
7984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 if (kind1 != kind)
7986 PyMem_Free(buf1);
7987 if (kind2 != kind)
7988 PyMem_Free(buf2);
7989
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007991 onError:
7992 Py_DECREF(sub_obj);
7993 Py_DECREF(str_obj);
7994 if (kind1 != kind && buf1)
7995 PyMem_Free(buf1);
7996 if (kind2 != kind && buf2)
7997 PyMem_Free(buf2);
7998 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999}
8000
Alexander Belopolsky40018472011-02-26 01:02:56 +00008001Py_ssize_t
8002PyUnicode_Find(PyObject *str,
8003 PyObject *sub,
8004 Py_ssize_t start,
8005 Py_ssize_t end,
8006 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008011 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008013 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008014 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 Py_DECREF(str);
8016 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 }
Tim Petersced69f82003-09-16 20:30:58 +00008018
Thomas Wouters477c8d52006-05-27 19:21:47 +00008019 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008020 result = any_find_slice(
8021 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8022 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008023 );
8024 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008025 result = any_find_slice(
8026 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8027 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008028 );
8029
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008031 Py_DECREF(sub);
8032
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 return result;
8034}
8035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036Py_ssize_t
8037PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8038 Py_ssize_t start, Py_ssize_t end,
8039 int direction)
8040{
8041 char *result;
8042 int kind;
8043 if (PyUnicode_READY(str) == -1)
8044 return -2;
8045 if (end > PyUnicode_GET_LENGTH(str))
8046 end = PyUnicode_GET_LENGTH(str);
8047 kind = PyUnicode_KIND(str);
8048 result = findchar(PyUnicode_1BYTE_DATA(str)
8049 + PyUnicode_KIND_SIZE(kind, start),
8050 kind,
8051 end-start, ch, direction);
8052 if (!result)
8053 return -1;
8054 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8055}
8056
Alexander Belopolsky40018472011-02-26 01:02:56 +00008057static int
8058tailmatch(PyUnicodeObject *self,
8059 PyUnicodeObject *substring,
8060 Py_ssize_t start,
8061 Py_ssize_t end,
8062 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008064 int kind_self;
8065 int kind_sub;
8066 void *data_self;
8067 void *data_sub;
8068 Py_ssize_t offset;
8069 Py_ssize_t i;
8070 Py_ssize_t end_sub;
8071
8072 if (PyUnicode_READY(self) == -1 ||
8073 PyUnicode_READY(substring) == -1)
8074 return 0;
8075
8076 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 return 1;
8078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8080 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 kind_self = PyUnicode_KIND(self);
8085 data_self = PyUnicode_DATA(self);
8086 kind_sub = PyUnicode_KIND(substring);
8087 data_sub = PyUnicode_DATA(substring);
8088 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8089
8090 if (direction > 0)
8091 offset = end;
8092 else
8093 offset = start;
8094
8095 if (PyUnicode_READ(kind_self, data_self, offset) ==
8096 PyUnicode_READ(kind_sub, data_sub, 0) &&
8097 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8098 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8099 /* If both are of the same kind, memcmp is sufficient */
8100 if (kind_self == kind_sub) {
8101 return ! memcmp((char *)data_self +
8102 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8103 data_sub,
8104 PyUnicode_GET_LENGTH(substring) *
8105 PyUnicode_CHARACTER_SIZE(substring));
8106 }
8107 /* otherwise we have to compare each character by first accesing it */
8108 else {
8109 /* We do not need to compare 0 and len(substring)-1 because
8110 the if statement above ensured already that they are equal
8111 when we end up here. */
8112 // TODO: honor direction and do a forward or backwards search
8113 for (i = 1; i < end_sub; ++i) {
8114 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8115 PyUnicode_READ(kind_sub, data_sub, i))
8116 return 0;
8117 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 }
8121
8122 return 0;
8123}
8124
Alexander Belopolsky40018472011-02-26 01:02:56 +00008125Py_ssize_t
8126PyUnicode_Tailmatch(PyObject *str,
8127 PyObject *substr,
8128 Py_ssize_t start,
8129 Py_ssize_t end,
8130 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008132 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008133
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 str = PyUnicode_FromObject(str);
8135 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 substr = PyUnicode_FromObject(substr);
8138 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 Py_DECREF(str);
8140 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 }
Tim Petersced69f82003-09-16 20:30:58 +00008142
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 (PyUnicodeObject *)substr,
8145 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 Py_DECREF(str);
8147 Py_DECREF(substr);
8148 return result;
8149}
8150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151/* Apply fixfct filter to the Unicode object self and return a
8152 reference to the modified object */
8153
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154static PyObject *
8155fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 PyObject *u;
8159 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 if (PyUnicode_READY(self) == -1)
8162 return NULL;
8163 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8164 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8165 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8170 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172 /* fix functions return the new maximum character in a string,
8173 if the kind of the resulting unicode object does not change,
8174 everything is fine. Otherwise we need to change the string kind
8175 and re-run the fix function. */
8176 maxchar_new = fixfct((PyUnicodeObject*)u);
8177 if (maxchar_new == 0)
8178 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8179 else if (maxchar_new <= 127)
8180 maxchar_new = 127;
8181 else if (maxchar_new <= 255)
8182 maxchar_new = 255;
8183 else if (maxchar_new <= 65535)
8184 maxchar_new = 65535;
8185 else
8186 maxchar_new = 1114111; /* 0x10ffff */
8187
8188 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 /* fixfct should return TRUE if it modified the buffer. If
8190 FALSE, return a reference to the original buffer instead
8191 (to save space, not time) */
8192 Py_INCREF(self);
8193 Py_DECREF(u);
8194 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 else if (maxchar_new == maxchar_old) {
8197 return u;
8198 }
8199 else {
8200 /* In case the maximum character changed, we need to
8201 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008202 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 if (v == NULL) {
8204 Py_DECREF(u);
8205 return NULL;
8206 }
8207 if (maxchar_new > maxchar_old) {
8208 /* If the maxchar increased so that the kind changed, not all
8209 characters are representable anymore and we need to fix the
8210 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008211 if (PyUnicode_CopyCharacters(v, 0,
8212 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008213 PyUnicode_GET_LENGTH(self)) < 0)
8214 {
8215 Py_DECREF(u);
8216 return NULL;
8217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 maxchar_old = fixfct((PyUnicodeObject*)v);
8219 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8220 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008221 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008222 if (PyUnicode_CopyCharacters(v, 0,
8223 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008224 PyUnicode_GET_LENGTH(self)) < 0)
8225 {
8226 Py_DECREF(u);
8227 return NULL;
8228 }
8229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230
8231 Py_DECREF(u);
8232 return v;
8233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234}
8235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 /* No need to call PyUnicode_READY(self) because this function is only
8240 called as a callback from fixup() which does it already. */
8241 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8242 const int kind = PyUnicode_KIND(self);
8243 void *data = PyUnicode_DATA(self);
8244 int touched = 0;
8245 Py_UCS4 maxchar = 0;
8246 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 for (i = 0; i < len; ++i) {
8249 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8250 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8251 if (up != ch) {
8252 if (up > maxchar)
8253 maxchar = up;
8254 PyUnicode_WRITE(kind, data, i, up);
8255 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 else if (ch > maxchar)
8258 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 }
8260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 if (touched)
8262 return maxchar;
8263 else
8264 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265}
8266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8271 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8272 const int kind = PyUnicode_KIND(self);
8273 void *data = PyUnicode_DATA(self);
8274 int touched = 0;
8275 Py_UCS4 maxchar = 0;
8276 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 for(i = 0; i < len; ++i) {
8279 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8280 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8281 if (lo != ch) {
8282 if (lo > maxchar)
8283 maxchar = lo;
8284 PyUnicode_WRITE(kind, data, i, lo);
8285 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 else if (ch > maxchar)
8288 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
8290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 if (touched)
8292 return maxchar;
8293 else
8294 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295}
8296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008298fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8301 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8302 const int kind = PyUnicode_KIND(self);
8303 void *data = PyUnicode_DATA(self);
8304 int touched = 0;
8305 Py_UCS4 maxchar = 0;
8306 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 for(i = 0; i < len; ++i) {
8309 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8310 Py_UCS4 nu = 0;
8311
8312 if (Py_UNICODE_ISUPPER(ch))
8313 nu = Py_UNICODE_TOLOWER(ch);
8314 else if (Py_UNICODE_ISLOWER(ch))
8315 nu = Py_UNICODE_TOUPPER(ch);
8316
8317 if (nu != 0) {
8318 if (nu > maxchar)
8319 maxchar = nu;
8320 PyUnicode_WRITE(kind, data, i, nu);
8321 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 else if (ch > maxchar)
8324 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 }
8326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 if (touched)
8328 return maxchar;
8329 else
8330 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331}
8332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008334fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8337 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8338 const int kind = PyUnicode_KIND(self);
8339 void *data = PyUnicode_DATA(self);
8340 int touched = 0;
8341 Py_UCS4 maxchar = 0;
8342 Py_ssize_t i = 0;
8343 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008344
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008345 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347
8348 ch = PyUnicode_READ(kind, data, i);
8349 if (!Py_UNICODE_ISUPPER(ch)) {
8350 maxchar = Py_UNICODE_TOUPPER(ch);
8351 PyUnicode_WRITE(kind, data, i, maxchar);
8352 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 ++i;
8355 for(; i < len; ++i) {
8356 ch = PyUnicode_READ(kind, data, i);
8357 if (!Py_UNICODE_ISLOWER(ch)) {
8358 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8359 if (lo > maxchar)
8360 maxchar = lo;
8361 PyUnicode_WRITE(kind, data, i, lo);
8362 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 else if (ch > maxchar)
8365 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367
8368 if (touched)
8369 return maxchar;
8370 else
8371 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372}
8373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008375fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8378 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8379 const int kind = PyUnicode_KIND(self);
8380 void *data = PyUnicode_DATA(self);
8381 Py_UCS4 maxchar = 0;
8382 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 int previous_is_cased;
8384
8385 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 if (len == 1) {
8387 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8388 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8389 if (ti != ch) {
8390 PyUnicode_WRITE(kind, data, i, ti);
8391 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 }
8393 else
8394 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 for(; i < len; ++i) {
8398 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8399 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008400
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 nu = Py_UNICODE_TOTITLE(ch);
8405
8406 if (nu > maxchar)
8407 maxchar = nu;
8408 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008409
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 if (Py_UNICODE_ISLOWER(ch) ||
8411 Py_UNICODE_ISUPPER(ch) ||
8412 Py_UNICODE_ISTITLE(ch))
8413 previous_is_cased = 1;
8414 else
8415 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418}
8419
Tim Peters8ce9f162004-08-27 01:49:32 +00008420PyObject *
8421PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008426 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008427 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8428 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008429 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 Py_ssize_t sz, i, res_offset;
8431 Py_UCS4 maxchar = 0;
8432 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
Tim Peters05eba1f2004-08-27 21:32:02 +00008434 fseq = PySequence_Fast(seq, "");
8435 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008437 }
8438
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008439 /* NOTE: the following code can't call back into Python code,
8440 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008441 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008442
Tim Peters05eba1f2004-08-27 21:32:02 +00008443 seqlen = PySequence_Fast_GET_SIZE(fseq);
8444 /* If empty sequence, return u"". */
8445 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008448 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008449 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008450 /* If singleton sequence with an exact Unicode, return that. */
8451 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 item = items[0];
8453 if (PyUnicode_CheckExact(item)) {
8454 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 goto Done;
8457 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008458 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008459 else {
8460 /* Set up sep and seplen */
8461 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 /* fall back to a blank space separator */
8463 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008464 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008466 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008467 else {
8468 if (!PyUnicode_Check(separator)) {
8469 PyErr_Format(PyExc_TypeError,
8470 "separator: expected str instance,"
8471 " %.80s found",
8472 Py_TYPE(separator)->tp_name);
8473 goto onError;
8474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 if (PyUnicode_READY(separator) == -1)
8476 goto onError;
8477 sep = separator;
8478 seplen = PyUnicode_GET_LENGTH(separator);
8479 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8480 /* inc refcount to keep this code path symetric with the
8481 above case of a blank separator */
8482 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008483 }
8484 }
8485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008486 /* There are at least two things to join, or else we have a subclass
8487 * of str in the sequence.
8488 * Do a pre-pass to figure out the total amount of space we'll
8489 * need (sz), and see whether all argument are strings.
8490 */
8491 sz = 0;
8492 for (i = 0; i < seqlen; i++) {
8493 const Py_ssize_t old_sz = sz;
8494 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (!PyUnicode_Check(item)) {
8496 PyErr_Format(PyExc_TypeError,
8497 "sequence item %zd: expected str instance,"
8498 " %.80s found",
8499 i, Py_TYPE(item)->tp_name);
8500 goto onError;
8501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 if (PyUnicode_READY(item) == -1)
8503 goto onError;
8504 sz += PyUnicode_GET_LENGTH(item);
8505 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8506 if (item_maxchar > maxchar)
8507 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008508 if (i != 0)
8509 sz += seplen;
8510 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8511 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008513 goto onError;
8514 }
8515 }
Tim Petersced69f82003-09-16 20:30:58 +00008516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008518 if (res == NULL)
8519 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008520
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008521 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008523 Py_ssize_t itemlen;
8524 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 /* Copy item, and maybe the separator. */
8527 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008528 if (PyUnicode_CopyCharacters(res, res_offset,
8529 sep, 0, seplen) < 0)
8530 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008533 if (PyUnicode_CopyCharacters(res, res_offset,
8534 item, 0, itemlen) < 0)
8535 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008539
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008541 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 Py_XDECREF(sep);
8543 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008546 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008548 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 return NULL;
8550}
8551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552#define FILL(kind, data, value, start, length) \
8553 do { \
8554 Py_ssize_t i_ = 0; \
8555 assert(kind != PyUnicode_WCHAR_KIND); \
8556 switch ((kind)) { \
8557 case PyUnicode_1BYTE_KIND: { \
8558 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8559 memset(to_, (unsigned char)value, length); \
8560 break; \
8561 } \
8562 case PyUnicode_2BYTE_KIND: { \
8563 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8564 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8565 break; \
8566 } \
8567 default: { \
8568 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8569 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8570 break; \
8571 } \
8572 } \
8573 } while (0)
8574
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575static PyUnicodeObject *
8576pad(PyUnicodeObject *self,
8577 Py_ssize_t left,
8578 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 PyObject *u;
8582 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008583 int kind;
8584 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
8586 if (left < 0)
8587 left = 0;
8588 if (right < 0)
8589 right = 0;
8590
Tim Peters7a29bd52001-09-12 03:03:31 +00008591 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 Py_INCREF(self);
8593 return self;
8594 }
8595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8597 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008598 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8599 return NULL;
8600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8602 if (fill > maxchar)
8603 maxchar = fill;
8604 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008605 if (!u)
8606 return NULL;
8607
8608 kind = PyUnicode_KIND(u);
8609 data = PyUnicode_DATA(u);
8610 if (left)
8611 FILL(kind, data, fill, 0, left);
8612 if (right)
8613 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008614 if (PyUnicode_CopyCharacters(u, left,
8615 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008616 _PyUnicode_LENGTH(self)) < 0)
8617 {
8618 Py_DECREF(u);
8619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 }
8621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
Alexander Belopolsky40018472011-02-26 01:02:56 +00008626PyObject *
8627PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
8631 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 switch(PyUnicode_KIND(string)) {
8636 case PyUnicode_1BYTE_KIND:
8637 list = ucs1lib_splitlines(
8638 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8639 PyUnicode_GET_LENGTH(string), keepends);
8640 break;
8641 case PyUnicode_2BYTE_KIND:
8642 list = ucs2lib_splitlines(
8643 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8644 PyUnicode_GET_LENGTH(string), keepends);
8645 break;
8646 case PyUnicode_4BYTE_KIND:
8647 list = ucs4lib_splitlines(
8648 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8649 PyUnicode_GET_LENGTH(string), keepends);
8650 break;
8651 default:
8652 assert(0);
8653 list = 0;
8654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 Py_DECREF(string);
8656 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659static PyObject *
8660split(PyUnicodeObject *self,
8661 PyUnicodeObject *substring,
8662 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 int kind1, kind2, kind;
8665 void *buf1, *buf2;
8666 Py_ssize_t len1, len2;
8667 PyObject* out;
8668
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008670 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 if (PyUnicode_READY(self) == -1)
8673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 if (substring == NULL)
8676 switch(PyUnicode_KIND(self)) {
8677 case PyUnicode_1BYTE_KIND:
8678 return ucs1lib_split_whitespace(
8679 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8680 PyUnicode_GET_LENGTH(self), maxcount
8681 );
8682 case PyUnicode_2BYTE_KIND:
8683 return ucs2lib_split_whitespace(
8684 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8685 PyUnicode_GET_LENGTH(self), maxcount
8686 );
8687 case PyUnicode_4BYTE_KIND:
8688 return ucs4lib_split_whitespace(
8689 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8690 PyUnicode_GET_LENGTH(self), maxcount
8691 );
8692 default:
8693 assert(0);
8694 return NULL;
8695 }
8696
8697 if (PyUnicode_READY(substring) == -1)
8698 return NULL;
8699
8700 kind1 = PyUnicode_KIND(self);
8701 kind2 = PyUnicode_KIND(substring);
8702 kind = kind1 > kind2 ? kind1 : kind2;
8703 buf1 = PyUnicode_DATA(self);
8704 buf2 = PyUnicode_DATA(substring);
8705 if (kind1 != kind)
8706 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8707 if (!buf1)
8708 return NULL;
8709 if (kind2 != kind)
8710 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8711 if (!buf2) {
8712 if (kind1 != kind) PyMem_Free(buf1);
8713 return NULL;
8714 }
8715 len1 = PyUnicode_GET_LENGTH(self);
8716 len2 = PyUnicode_GET_LENGTH(substring);
8717
8718 switch(kind) {
8719 case PyUnicode_1BYTE_KIND:
8720 out = ucs1lib_split(
8721 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8722 break;
8723 case PyUnicode_2BYTE_KIND:
8724 out = ucs2lib_split(
8725 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8726 break;
8727 case PyUnicode_4BYTE_KIND:
8728 out = ucs4lib_split(
8729 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8730 break;
8731 default:
8732 out = NULL;
8733 }
8734 if (kind1 != kind)
8735 PyMem_Free(buf1);
8736 if (kind2 != kind)
8737 PyMem_Free(buf2);
8738 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741static PyObject *
8742rsplit(PyUnicodeObject *self,
8743 PyUnicodeObject *substring,
8744 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 int kind1, kind2, kind;
8747 void *buf1, *buf2;
8748 Py_ssize_t len1, len2;
8749 PyObject* out;
8750
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008751 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008752 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 if (PyUnicode_READY(self) == -1)
8755 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 if (substring == NULL)
8758 switch(PyUnicode_KIND(self)) {
8759 case PyUnicode_1BYTE_KIND:
8760 return ucs1lib_rsplit_whitespace(
8761 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8762 PyUnicode_GET_LENGTH(self), maxcount
8763 );
8764 case PyUnicode_2BYTE_KIND:
8765 return ucs2lib_rsplit_whitespace(
8766 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8767 PyUnicode_GET_LENGTH(self), maxcount
8768 );
8769 case PyUnicode_4BYTE_KIND:
8770 return ucs4lib_rsplit_whitespace(
8771 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8772 PyUnicode_GET_LENGTH(self), maxcount
8773 );
8774 default:
8775 assert(0);
8776 return NULL;
8777 }
8778
8779 if (PyUnicode_READY(substring) == -1)
8780 return NULL;
8781
8782 kind1 = PyUnicode_KIND(self);
8783 kind2 = PyUnicode_KIND(substring);
8784 kind = kind1 > kind2 ? kind1 : kind2;
8785 buf1 = PyUnicode_DATA(self);
8786 buf2 = PyUnicode_DATA(substring);
8787 if (kind1 != kind)
8788 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8789 if (!buf1)
8790 return NULL;
8791 if (kind2 != kind)
8792 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8793 if (!buf2) {
8794 if (kind1 != kind) PyMem_Free(buf1);
8795 return NULL;
8796 }
8797 len1 = PyUnicode_GET_LENGTH(self);
8798 len2 = PyUnicode_GET_LENGTH(substring);
8799
8800 switch(kind) {
8801 case PyUnicode_1BYTE_KIND:
8802 out = ucs1lib_rsplit(
8803 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8804 break;
8805 case PyUnicode_2BYTE_KIND:
8806 out = ucs2lib_rsplit(
8807 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8808 break;
8809 case PyUnicode_4BYTE_KIND:
8810 out = ucs4lib_rsplit(
8811 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8812 break;
8813 default:
8814 out = NULL;
8815 }
8816 if (kind1 != kind)
8817 PyMem_Free(buf1);
8818 if (kind2 != kind)
8819 PyMem_Free(buf2);
8820 return out;
8821}
8822
8823static Py_ssize_t
8824anylib_find(int kind, void *buf1, Py_ssize_t len1,
8825 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8826{
8827 switch(kind) {
8828 case PyUnicode_1BYTE_KIND:
8829 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8830 case PyUnicode_2BYTE_KIND:
8831 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8832 case PyUnicode_4BYTE_KIND:
8833 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8834 }
8835 assert(0);
8836 return -1;
8837}
8838
8839static Py_ssize_t
8840anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8841 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8842{
8843 switch(kind) {
8844 case PyUnicode_1BYTE_KIND:
8845 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8846 case PyUnicode_2BYTE_KIND:
8847 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8848 case PyUnicode_4BYTE_KIND:
8849 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8850 }
8851 assert(0);
8852 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008853}
8854
Alexander Belopolsky40018472011-02-26 01:02:56 +00008855static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856replace(PyObject *self, PyObject *str1,
8857 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 PyObject *u;
8860 char *sbuf = PyUnicode_DATA(self);
8861 char *buf1 = PyUnicode_DATA(str1);
8862 char *buf2 = PyUnicode_DATA(str2);
8863 int srelease = 0, release1 = 0, release2 = 0;
8864 int skind = PyUnicode_KIND(self);
8865 int kind1 = PyUnicode_KIND(str1);
8866 int kind2 = PyUnicode_KIND(str2);
8867 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8868 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8869 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870
8871 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008874 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 if (skind < kind1)
8877 /* substring too wide to be present */
8878 goto nothing;
8879
8880 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008881 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008882 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008884 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008886 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 Py_UCS4 u1, u2, maxchar;
8888 int mayshrink, rkind;
8889 u1 = PyUnicode_READ_CHAR(str1, 0);
8890 if (!findchar(sbuf, PyUnicode_KIND(self),
8891 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008892 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 u2 = PyUnicode_READ_CHAR(str2, 0);
8894 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8895 /* Replacing u1 with u2 may cause a maxchar reduction in the
8896 result string. */
8897 mayshrink = maxchar > 127;
8898 if (u2 > maxchar) {
8899 maxchar = u2;
8900 mayshrink = 0;
8901 }
8902 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008903 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008905 if (PyUnicode_CopyCharacters(u, 0,
8906 (PyObject*)self, 0, slen) < 0)
8907 {
8908 Py_DECREF(u);
8909 return NULL;
8910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 rkind = PyUnicode_KIND(u);
8912 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8913 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008914 if (--maxcount < 0)
8915 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 if (mayshrink) {
8919 PyObject *tmp = u;
8920 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8921 PyUnicode_GET_LENGTH(tmp));
8922 Py_DECREF(tmp);
8923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 int rkind = skind;
8926 char *res;
8927 if (kind1 < rkind) {
8928 /* widen substring */
8929 buf1 = _PyUnicode_AsKind(str1, rkind);
8930 if (!buf1) goto error;
8931 release1 = 1;
8932 }
8933 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008934 if (i < 0)
8935 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 if (rkind > kind2) {
8937 /* widen replacement */
8938 buf2 = _PyUnicode_AsKind(str2, rkind);
8939 if (!buf2) goto error;
8940 release2 = 1;
8941 }
8942 else if (rkind < kind2) {
8943 /* widen self and buf1 */
8944 rkind = kind2;
8945 if (release1) PyMem_Free(buf1);
8946 sbuf = _PyUnicode_AsKind(self, rkind);
8947 if (!sbuf) goto error;
8948 srelease = 1;
8949 buf1 = _PyUnicode_AsKind(str1, rkind);
8950 if (!buf1) goto error;
8951 release1 = 1;
8952 }
8953 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8954 if (!res) {
8955 PyErr_NoMemory();
8956 goto error;
8957 }
8958 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008959 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8961 buf2,
8962 PyUnicode_KIND_SIZE(rkind, len2));
8963 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008964
8965 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8967 slen-i,
8968 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008969 if (i == -1)
8970 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8972 buf2,
8973 PyUnicode_KIND_SIZE(rkind, len2));
8974 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976
8977 u = PyUnicode_FromKindAndData(rkind, res, slen);
8978 PyMem_Free(res);
8979 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 Py_ssize_t n, i, j, ires;
8984 Py_ssize_t product, new_size;
8985 int rkind = skind;
8986 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 if (kind1 < rkind) {
8989 buf1 = _PyUnicode_AsKind(str1, rkind);
8990 if (!buf1) goto error;
8991 release1 = 1;
8992 }
8993 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008994 if (n == 0)
8995 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 if (kind2 < rkind) {
8997 buf2 = _PyUnicode_AsKind(str2, rkind);
8998 if (!buf2) goto error;
8999 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 else if (kind2 > rkind) {
9002 rkind = kind2;
9003 sbuf = _PyUnicode_AsKind(self, rkind);
9004 if (!sbuf) goto error;
9005 srelease = 1;
9006 if (release1) PyMem_Free(buf1);
9007 buf1 = _PyUnicode_AsKind(str1, rkind);
9008 if (!buf1) goto error;
9009 release1 = 1;
9010 }
9011 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9012 PyUnicode_GET_LENGTH(str1))); */
9013 product = n * (len2-len1);
9014 if ((product / (len2-len1)) != n) {
9015 PyErr_SetString(PyExc_OverflowError,
9016 "replace string is too long");
9017 goto error;
9018 }
9019 new_size = slen + product;
9020 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9021 PyErr_SetString(PyExc_OverflowError,
9022 "replace string is too long");
9023 goto error;
9024 }
9025 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9026 if (!res)
9027 goto error;
9028 ires = i = 0;
9029 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009030 while (n-- > 0) {
9031 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 j = anylib_find(rkind,
9033 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9034 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009035 if (j == -1)
9036 break;
9037 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9040 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9041 PyUnicode_KIND_SIZE(rkind, j-i));
9042 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009043 }
9044 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 if (len2 > 0) {
9046 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9047 buf2,
9048 PyUnicode_KIND_SIZE(rkind, len2));
9049 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009054 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9056 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9057 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058 } else {
9059 /* interleave */
9060 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9062 buf2,
9063 PyUnicode_KIND_SIZE(rkind, len2));
9064 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065 if (--n <= 0)
9066 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9068 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9069 PyUnicode_KIND_SIZE(rkind, 1));
9070 ires++;
9071 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9074 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9075 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009078 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 if (srelease)
9081 PyMem_FREE(sbuf);
9082 if (release1)
9083 PyMem_FREE(buf1);
9084 if (release2)
9085 PyMem_FREE(buf2);
9086 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009087
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (srelease)
9091 PyMem_FREE(sbuf);
9092 if (release1)
9093 PyMem_FREE(buf1);
9094 if (release2)
9095 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009096 if (PyUnicode_CheckExact(self)) {
9097 Py_INCREF(self);
9098 return (PyObject *) self;
9099 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009100 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 error:
9102 if (srelease && sbuf)
9103 PyMem_FREE(sbuf);
9104 if (release1 && buf1)
9105 PyMem_FREE(buf1);
9106 if (release2 && buf2)
9107 PyMem_FREE(buf2);
9108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109}
9110
9111/* --- Unicode Object Methods --------------------------------------------- */
9112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009113PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115\n\
9116Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009117characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118
9119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009120unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 return fixup(self, fixtitle);
9123}
9124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009125PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127\n\
9128Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009129have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130
9131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009132unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 return fixup(self, fixcapitalize);
9135}
9136
#if 0
/* Disabled: this whole region is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, replace every word by its fixcapitalize'd
   version and join the list again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so drop it first. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9174
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009175/* Argument converter. Coerces to a single unicode character */
9176
9177static int
9178convert_uc(PyObject *obj, void *addr)
9179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009181 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009182
Benjamin Peterson14339b62009-01-31 16:36:08 +00009183 uniobj = PyUnicode_FromObject(obj);
9184 if (uniobj == NULL) {
9185 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009187 return 0;
9188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009190 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 Py_DECREF(uniobj);
9193 return 0;
9194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 Py_DECREF(uniobj);
9197 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009198}
9199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009200PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009203Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009204done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205
9206static PyObject *
9207unicode_center(PyUnicodeObject *self, PyObject *args)
9208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009209 Py_ssize_t marg, left;
9210 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 Py_UCS4 fillchar = ' ';
9212
Victor Stinnere9a29352011-10-01 02:14:59 +02009213 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215
Victor Stinnere9a29352011-10-01 02:14:59 +02009216 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 return NULL;
9218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 Py_INCREF(self);
9221 return (PyObject*) self;
9222 }
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 left = marg / 2 + (marg & width & 1);
9226
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009227 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228}
9229
Marc-André Lemburge5034372000-08-08 08:04:29 +00009230#if 0
9231
9232/* This code should go into some future Unicode collation support
9233 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009234 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009235
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009236/* speedy UTF-16 code point order comparison */
9237/* gleaned from: */
9238/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009240static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009241{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009242 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009243 0, 0, 0, 0, 0, 0, 0, 0,
9244 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009245 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009246};
9247
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248static int
9249unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9250{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009251 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 Py_UNICODE *s1 = str1->str;
9254 Py_UNICODE *s2 = str2->str;
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 len1 = str1->_base._base.length;
9257 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009260 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009261
9262 c1 = *s1++;
9263 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009264
Benjamin Peterson29060642009-01-31 22:14:21 +00009265 if (c1 > (1<<11) * 26)
9266 c1 += utf16Fixup[c1>>11];
9267 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009268 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009269 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009270
9271 if (c1 != c2)
9272 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009273
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009274 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 }
9276
9277 return (len1 < len2) ? -1 : (len1 != len2);
9278}
9279
Marc-André Lemburge5034372000-08-08 08:04:29 +00009280#else
9281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282/* This function assumes that str1 and str2 are readied by the caller. */
9283
Marc-André Lemburge5034372000-08-08 08:04:29 +00009284static int
9285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 int kind1, kind2;
9288 void *data1, *data2;
9289 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 kind1 = PyUnicode_KIND(str1);
9292 kind2 = PyUnicode_KIND(str2);
9293 data1 = PyUnicode_DATA(str1);
9294 data2 = PyUnicode_DATA(str2);
9295 len1 = PyUnicode_GET_LENGTH(str1);
9296 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 for (i = 0; i < len1 && i < len2; ++i) {
9299 Py_UCS4 c1, c2;
9300 c1 = PyUnicode_READ(kind1, data1, i);
9301 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009302
9303 if (c1 != c2)
9304 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009305 }
9306
9307 return (len1 < len2) ? -1 : (len1 != len2);
9308}
9309
9310#endif
9311
Alexander Belopolsky40018472011-02-26 01:02:56 +00009312int
9313PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9316 if (PyUnicode_READY(left) == -1 ||
9317 PyUnicode_READY(right) == -1)
9318 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009319 return unicode_compare((PyUnicodeObject *)left,
9320 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009322 PyErr_Format(PyExc_TypeError,
9323 "Can't compare %.100s and %.100s",
9324 left->ob_type->tp_name,
9325 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 return -1;
9327}
9328
Martin v. Löwis5b222132007-06-10 09:51:05 +00009329int
9330PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 Py_ssize_t i;
9333 int kind;
9334 void *data;
9335 Py_UCS4 chr;
9336
Martin v. Löwis5b222132007-06-10 09:51:05 +00009337 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 if (PyUnicode_READY(uni) == -1)
9339 return -1;
9340 kind = PyUnicode_KIND(uni);
9341 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009342 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9344 if (chr != str[i])
9345 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009346 /* This check keeps Python strings that end in '\0' from comparing equal
9347 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009350 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009352 return 0;
9353}
9354
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009355
/* Map a C truth value to Py_True or Py_False.  The expansion does not
   touch the reference count; the user of the macro must Py_INCREF the
   result before returning it. */
#define TEST_COND(cond) \
    ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359PyObject *
9360PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009361{
9362 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009363
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009364 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9365 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (PyUnicode_READY(left) == -1 ||
9367 PyUnicode_READY(right) == -1)
9368 return NULL;
9369 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9370 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009371 if (op == Py_EQ) {
9372 Py_INCREF(Py_False);
9373 return Py_False;
9374 }
9375 if (op == Py_NE) {
9376 Py_INCREF(Py_True);
9377 return Py_True;
9378 }
9379 }
9380 if (left == right)
9381 result = 0;
9382 else
9383 result = unicode_compare((PyUnicodeObject *)left,
9384 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009385
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009386 /* Convert the return value to a Boolean */
9387 switch (op) {
9388 case Py_EQ:
9389 v = TEST_COND(result == 0);
9390 break;
9391 case Py_NE:
9392 v = TEST_COND(result != 0);
9393 break;
9394 case Py_LE:
9395 v = TEST_COND(result <= 0);
9396 break;
9397 case Py_GE:
9398 v = TEST_COND(result >= 0);
9399 break;
9400 case Py_LT:
9401 v = TEST_COND(result == -1);
9402 break;
9403 case Py_GT:
9404 v = TEST_COND(result == 1);
9405 break;
9406 default:
9407 PyErr_BadArgument();
9408 return NULL;
9409 }
9410 Py_INCREF(v);
9411 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413
Brian Curtindfc80e32011-08-10 20:28:54 -05009414 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009415}
9416
Alexander Belopolsky40018472011-02-26 01:02:56 +00009417int
9418PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009419{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009420 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 int kind1, kind2, kind;
9422 void *buf1, *buf2;
9423 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009424 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009425
9426 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009427 sub = PyUnicode_FromObject(element);
9428 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 PyErr_Format(PyExc_TypeError,
9430 "'in <string>' requires string as left operand, not %s",
9431 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 if (PyUnicode_READY(sub) == -1)
9435 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009436
Thomas Wouters477c8d52006-05-27 19:21:47 +00009437 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009438 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009439 Py_DECREF(sub);
9440 return -1;
9441 }
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 kind1 = PyUnicode_KIND(str);
9444 kind2 = PyUnicode_KIND(sub);
9445 kind = kind1 > kind2 ? kind1 : kind2;
9446 buf1 = PyUnicode_DATA(str);
9447 buf2 = PyUnicode_DATA(sub);
9448 if (kind1 != kind)
9449 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9450 if (!buf1) {
9451 Py_DECREF(sub);
9452 return -1;
9453 }
9454 if (kind2 != kind)
9455 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9456 if (!buf2) {
9457 Py_DECREF(sub);
9458 if (kind1 != kind) PyMem_Free(buf1);
9459 return -1;
9460 }
9461 len1 = PyUnicode_GET_LENGTH(str);
9462 len2 = PyUnicode_GET_LENGTH(sub);
9463
9464 switch(kind) {
9465 case PyUnicode_1BYTE_KIND:
9466 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9467 break;
9468 case PyUnicode_2BYTE_KIND:
9469 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9470 break;
9471 case PyUnicode_4BYTE_KIND:
9472 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9473 break;
9474 default:
9475 result = -1;
9476 assert(0);
9477 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478
9479 Py_DECREF(str);
9480 Py_DECREF(sub);
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 if (kind1 != kind)
9483 PyMem_Free(buf1);
9484 if (kind2 != kind)
9485 PyMem_Free(buf2);
9486
Guido van Rossum403d68b2000-03-13 15:55:09 +00009487 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009488}
9489
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490/* Concat to string or Unicode object giving a new Unicode object. */
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492PyObject *
9493PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 PyObject *u = NULL, *v = NULL, *w;
9496 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497
9498 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505
9506 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009517 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 w = PyUnicode_New(
9521 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9522 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009525 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9526 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009527 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009528 v, 0,
9529 PyUnicode_GET_LENGTH(v)) < 0)
9530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 Py_DECREF(u);
9532 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 Py_XDECREF(u);
9537 Py_XDECREF(v);
9538 return NULL;
9539}
9540
Walter Dörwald1ab83302007-05-18 17:15:44 +00009541void
9542PyUnicode_Append(PyObject **pleft, PyObject *right)
9543{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009544 PyObject *new;
9545 if (*pleft == NULL)
9546 return;
9547 if (right == NULL || !PyUnicode_Check(*pleft)) {
9548 Py_DECREF(*pleft);
9549 *pleft = NULL;
9550 return;
9551 }
9552 new = PyUnicode_Concat(*pleft, right);
9553 Py_DECREF(*pleft);
9554 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009555}
9556
9557void
9558PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9559{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009560 PyUnicode_Append(pleft, right);
9561 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009562}
9563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009564PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009565 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009567Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009568string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570
9571static PyObject *
9572unicode_count(PyUnicodeObject *self, PyObject *args)
9573{
9574 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009575 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009576 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 int kind1, kind2, kind;
9579 void *buf1, *buf2;
9580 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581
Jesus Ceaac451502011-04-20 17:09:23 +02009582 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9583 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 kind1 = PyUnicode_KIND(self);
9587 kind2 = PyUnicode_KIND(substring);
9588 kind = kind1 > kind2 ? kind1 : kind2;
9589 buf1 = PyUnicode_DATA(self);
9590 buf2 = PyUnicode_DATA(substring);
9591 if (kind1 != kind)
9592 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9593 if (!buf1) {
9594 Py_DECREF(substring);
9595 return NULL;
9596 }
9597 if (kind2 != kind)
9598 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9599 if (!buf2) {
9600 Py_DECREF(substring);
9601 if (kind1 != kind) PyMem_Free(buf1);
9602 return NULL;
9603 }
9604 len1 = PyUnicode_GET_LENGTH(self);
9605 len2 = PyUnicode_GET_LENGTH(substring);
9606
9607 ADJUST_INDICES(start, end, len1);
9608 switch(kind) {
9609 case PyUnicode_1BYTE_KIND:
9610 iresult = ucs1lib_count(
9611 ((Py_UCS1*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
9614 break;
9615 case PyUnicode_2BYTE_KIND:
9616 iresult = ucs2lib_count(
9617 ((Py_UCS2*)buf1) + start, end - start,
9618 buf2, len2, PY_SSIZE_T_MAX
9619 );
9620 break;
9621 case PyUnicode_4BYTE_KIND:
9622 iresult = ucs4lib_count(
9623 ((Py_UCS4*)buf1) + start, end - start,
9624 buf2, len2, PY_SSIZE_T_MAX
9625 );
9626 break;
9627 default:
9628 assert(0); iresult = 0;
9629 }
9630
9631 result = PyLong_FromSsize_t(iresult);
9632
9633 if (kind1 != kind)
9634 PyMem_Free(buf1);
9635 if (kind2 != kind)
9636 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
9638 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return result;
9641}
9642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009643PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009644 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009646Encode S using the codec registered for encoding. Default encoding\n\
9647is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009648handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009649a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9650'xmlcharrefreplace' as well as any other name registered with\n\
9651codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652
9653static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009654unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009656 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 char *encoding = NULL;
9658 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009659
Benjamin Peterson308d6372009-09-18 21:42:35 +00009660 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9661 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009663 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009664}
9665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009666PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668\n\
9669Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009670If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671
9672static PyObject*
9673unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9674{
9675 Py_UNICODE *e;
9676 Py_UNICODE *p;
9677 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009678 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 PyUnicodeObject *u;
9681 int tabsize = 8;
9682
9683 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9687 return NULL;
9688
Thomas Wouters7e474022000-07-16 12:04:32 +00009689 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009690 i = 0; /* chars up to and including most recent \n or \r */
9691 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9693 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 if (tabsize > 0) {
9696 incr = tabsize - (j % tabsize); /* cannot overflow */
9697 if (j > PY_SSIZE_T_MAX - incr)
9698 goto overflow1;
9699 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 if (j > PY_SSIZE_T_MAX - 1)
9704 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 j++;
9706 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 if (i > PY_SSIZE_T_MAX - j)
9708 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009710 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711 }
9712 }
9713
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009714 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009716
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 /* Second pass: create output string and fill it */
9718 u = _PyUnicode_New(i + j);
9719 if (!u)
9720 return NULL;
9721
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009722 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 q = _PyUnicode_WSTR(u); /* next output char */
9724 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 if (tabsize > 0) {
9729 i = tabsize - (j % tabsize);
9730 j += i;
9731 while (i--) {
9732 if (q >= qe)
9733 goto overflow2;
9734 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 else {
9739 if (q >= qe)
9740 goto overflow2;
9741 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009742 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743 if (*p == '\n' || *p == '\r')
9744 j = 0;
9745 }
9746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (PyUnicode_READY(u) == -1) {
9748 Py_DECREF(u);
9749 return NULL;
9750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009752
9753 overflow2:
9754 Py_DECREF(u);
9755 overflow1:
9756 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758}
9759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009760PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762\n\
9763Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009764such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765arguments start and end are interpreted as in slice notation.\n\
9766\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009767Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768
9769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Jesus Ceaac451502011-04-20 17:09:23 +02009772 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009773 Py_ssize_t start;
9774 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009775 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
Jesus Ceaac451502011-04-20 17:09:23 +02009777 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9778 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 if (PyUnicode_READY(self) == -1)
9782 return NULL;
9783 if (PyUnicode_READY(substring) == -1)
9784 return NULL;
9785
9786 result = any_find_slice(
9787 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9788 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009789 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790
9791 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (result == -2)
9794 return NULL;
9795
Christian Heimes217cfd12007-12-02 14:31:20 +00009796 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
9799static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009800unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 Py_UCS4 ch;
9803
9804 if (PyUnicode_READY(self) == -1)
9805 return NULL;
9806 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 PyErr_SetString(PyExc_IndexError, "string index out of range");
9808 return NULL;
9809 }
9810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9812 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
9814
Guido van Rossumc2504932007-09-18 19:42:40 +00009815/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009816 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009817static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009818unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819{
Guido van Rossumc2504932007-09-18 19:42:40 +00009820 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009821 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 if (_PyUnicode_HASH(self) != -1)
9824 return _PyUnicode_HASH(self);
9825 if (PyUnicode_READY(self) == -1)
9826 return -1;
9827 len = PyUnicode_GET_LENGTH(self);
9828
9829 /* The hash function as a macro, gets expanded three times below. */
9830#define HASH(P) \
9831 x = (Py_uhash_t)*P << 7; \
9832 while (--len >= 0) \
9833 x = (1000003*x) ^ (Py_uhash_t)*P++;
9834
9835 switch (PyUnicode_KIND(self)) {
9836 case PyUnicode_1BYTE_KIND: {
9837 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9838 HASH(c);
9839 break;
9840 }
9841 case PyUnicode_2BYTE_KIND: {
9842 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9843 HASH(s);
9844 break;
9845 }
9846 default: {
9847 Py_UCS4 *l;
9848 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9849 "Impossible switch case in unicode_hash");
9850 l = PyUnicode_4BYTE_DATA(self);
9851 HASH(l);
9852 break;
9853 }
9854 }
9855 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9856
Guido van Rossumc2504932007-09-18 19:42:40 +00009857 if (x == -1)
9858 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009860 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009864PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009867Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
9869static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009872 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009873 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009874 Py_ssize_t start;
9875 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
Jesus Ceaac451502011-04-20 17:09:23 +02009877 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9878 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 if (PyUnicode_READY(self) == -1)
9882 return NULL;
9883 if (PyUnicode_READY(substring) == -1)
9884 return NULL;
9885
9886 result = any_find_slice(
9887 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9888 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009889 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890
9891 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 if (result == -2)
9894 return NULL;
9895
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 if (result < 0) {
9897 PyErr_SetString(PyExc_ValueError, "substring not found");
9898 return NULL;
9899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900
Christian Heimes217cfd12007-12-02 14:31:20 +00009901 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902}
9903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009907Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009908at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
9910static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009911unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 Py_ssize_t i, length;
9914 int kind;
9915 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 int cased;
9917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (PyUnicode_READY(self) == -1)
9919 return NULL;
9920 length = PyUnicode_GET_LENGTH(self);
9921 kind = PyUnicode_KIND(self);
9922 data = PyUnicode_DATA(self);
9923
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 if (length == 1)
9926 return PyBool_FromLong(
9927 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009932
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 for (i = 0; i < length; i++) {
9935 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009936
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9938 return PyBool_FromLong(0);
9939 else if (!cased && Py_UNICODE_ISLOWER(ch))
9940 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009942 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943}
9944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009945PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009948Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009949at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
9951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009952unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 Py_ssize_t i, length;
9955 int kind;
9956 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 int cased;
9958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (PyUnicode_READY(self) == -1)
9960 return NULL;
9961 length = PyUnicode_GET_LENGTH(self);
9962 kind = PyUnicode_KIND(self);
9963 data = PyUnicode_DATA(self);
9964
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (length == 1)
9967 return PyBool_FromLong(
9968 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009970 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009973
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 for (i = 0; i < length; i++) {
9976 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009977
Benjamin Peterson29060642009-01-31 22:14:21 +00009978 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9979 return PyBool_FromLong(0);
9980 else if (!cased && Py_UNICODE_ISUPPER(ch))
9981 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009983 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984}
9985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009986PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009989Return True if S is a titlecased string and there is at least one\n\
9990character in S, i.e. upper- and titlecase characters may only\n\
9991follow uncased characters and lowercase characters only cased ones.\n\
9992Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
9994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009995unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 Py_ssize_t i, length;
9998 int kind;
9999 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000 int cased, previous_is_cased;
10001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (PyUnicode_READY(self) == -1)
10003 return NULL;
10004 length = PyUnicode_GET_LENGTH(self);
10005 kind = PyUnicode_KIND(self);
10006 data = PyUnicode_DATA(self);
10007
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (length == 1) {
10010 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10011 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10012 (Py_UNICODE_ISUPPER(ch) != 0));
10013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010015 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010017 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010018
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 cased = 0;
10020 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 for (i = 0; i < length; i++) {
10022 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010023
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10025 if (previous_is_cased)
10026 return PyBool_FromLong(0);
10027 previous_is_cased = 1;
10028 cased = 1;
10029 }
10030 else if (Py_UNICODE_ISLOWER(ch)) {
10031 if (!previous_is_cased)
10032 return PyBool_FromLong(0);
10033 previous_is_cased = 1;
10034 cased = 1;
10035 }
10036 else
10037 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010039 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040}
10041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010042PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010043 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010045Return True if all characters in S are whitespace\n\
10046and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047
10048static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010049unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 Py_ssize_t i, length;
10052 int kind;
10053 void *data;
10054
10055 if (PyUnicode_READY(self) == -1)
10056 return NULL;
10057 length = PyUnicode_GET_LENGTH(self);
10058 kind = PyUnicode_KIND(self);
10059 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 if (length == 1)
10063 return PyBool_FromLong(
10064 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010066 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 for (i = 0; i < length; i++) {
10071 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010072 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010075 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076}
10077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010078PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010081Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010082and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010083
10084static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010085unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 Py_ssize_t i, length;
10088 int kind;
10089 void *data;
10090
10091 if (PyUnicode_READY(self) == -1)
10092 return NULL;
10093 length = PyUnicode_GET_LENGTH(self);
10094 kind = PyUnicode_KIND(self);
10095 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010096
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 if (length == 1)
10099 return PyBool_FromLong(
10100 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010101
10102 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 for (i = 0; i < length; i++) {
10107 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010110 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010111}
10112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010113PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010114 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010115\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010116Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010117and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010118
10119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010120unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 int kind;
10123 void *data;
10124 Py_ssize_t len, i;
10125
10126 if (PyUnicode_READY(self) == -1)
10127 return NULL;
10128
10129 kind = PyUnicode_KIND(self);
10130 data = PyUnicode_DATA(self);
10131 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010132
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010133 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 if (len == 1) {
10135 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10136 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10137 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010138
10139 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 for (i = 0; i < len; i++) {
10144 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010145 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010148 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010149}
10150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010151PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010154Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010155False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
10157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010158unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 Py_ssize_t i, length;
10161 int kind;
10162 void *data;
10163
10164 if (PyUnicode_READY(self) == -1)
10165 return NULL;
10166 length = PyUnicode_GET_LENGTH(self);
10167 kind = PyUnicode_KIND(self);
10168 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (length == 1)
10172 return PyBool_FromLong(
10173 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010175 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010177 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 for (i = 0; i < length; i++) {
10180 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010181 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010183 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184}
10185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010186PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010189Return True if all characters in S are digits\n\
10190and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191
10192static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010193unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 Py_ssize_t i, length;
10196 int kind;
10197 void *data;
10198
10199 if (PyUnicode_READY(self) == -1)
10200 return NULL;
10201 length = PyUnicode_GET_LENGTH(self);
10202 kind = PyUnicode_KIND(self);
10203 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (length == 1) {
10207 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10208 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010211 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010213 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 for (i = 0; i < length; i++) {
10216 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010219 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220}
10221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010222PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010225Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010226False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
10228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010229unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 Py_ssize_t i, length;
10232 int kind;
10233 void *data;
10234
10235 if (PyUnicode_READY(self) == -1)
10236 return NULL;
10237 length = PyUnicode_GET_LENGTH(self);
10238 kind = PyUnicode_KIND(self);
10239 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (length == 1)
10243 return PyBool_FromLong(
10244 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 for (i = 0; i < length; i++) {
10251 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010254 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255}
10256
Martin v. Löwis47383402007-08-15 07:32:56 +000010257int
10258PyUnicode_IsIdentifier(PyObject *self)
10259{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 int kind;
10261 void *data;
10262 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010263 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (PyUnicode_READY(self) == -1) {
10266 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010267 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 }
10269
10270 /* Special case for empty strings */
10271 if (PyUnicode_GET_LENGTH(self) == 0)
10272 return 0;
10273 kind = PyUnicode_KIND(self);
10274 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010275
10276 /* PEP 3131 says that the first character must be in
10277 XID_Start and subsequent characters in XID_Continue,
10278 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010280 letters, digits, underscore). However, given the current
10281 definition of XID_Start and XID_Continue, it is sufficient
10282 to check just for these, except that _ must be allowed
10283 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010285 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010286 return 0;
10287
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010288 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010291 return 1;
10292}
10293
10294PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010296\n\
10297Return True if S is a valid identifier according\n\
10298to the language definition.");
10299
10300static PyObject*
10301unicode_isidentifier(PyObject *self)
10302{
10303 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10304}
10305
Georg Brandl559e5d72008-06-11 18:37:52 +000010306PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010308\n\
10309Return True if all characters in S are considered\n\
10310printable in repr() or S is empty, False otherwise.");
10311
10312static PyObject*
10313unicode_isprintable(PyObject *self)
10314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 Py_ssize_t i, length;
10316 int kind;
10317 void *data;
10318
10319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
10321 length = PyUnicode_GET_LENGTH(self);
10322 kind = PyUnicode_KIND(self);
10323 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010324
10325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (length == 1)
10327 return PyBool_FromLong(
10328 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 for (i = 0; i < length; i++) {
10331 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010332 Py_RETURN_FALSE;
10333 }
10334 }
10335 Py_RETURN_TRUE;
10336}
10337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010338PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010339 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340\n\
10341Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010342iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343
10344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010345unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010347 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Martin v. Löwis18e16552006-02-15 17:27:45 +000010350static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351unicode_length(PyUnicodeObject *self)
10352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (PyUnicode_READY(self) == -1)
10354 return -1;
10355 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010358PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010361Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010362done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363
10364static PyObject *
10365unicode_ljust(PyUnicodeObject *self, PyObject *args)
10366{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010367 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 Py_UCS4 fillchar = ' ';
10369
10370 if (PyUnicode_READY(self) == -1)
10371 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010372
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010373 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 return NULL;
10375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 Py_INCREF(self);
10378 return (PyObject*) self;
10379 }
10380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382}
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010387Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
10389static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010390unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return fixup(self, fixlower);
10393}
10394
/* Selects which end(s) of the string a strip operation trims.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name: skips the 3-character "|O:" prefix of the format */
#define STRIPNAME(i) (stripformat[i]+3)
10403
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010404/* externally visible for str.strip(unicode) */
10405PyObject *
10406_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 void *data;
10409 int kind;
10410 Py_ssize_t i, j, len;
10411 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10414 return NULL;
10415
10416 kind = PyUnicode_KIND(self);
10417 data = PyUnicode_DATA(self);
10418 len = PyUnicode_GET_LENGTH(self);
10419 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10420 PyUnicode_DATA(sepobj),
10421 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010422
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 i = 0;
10424 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 while (i < len &&
10426 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010427 i++;
10428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010429 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010430
Benjamin Peterson14339b62009-01-31 16:36:08 +000010431 j = len;
10432 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 do {
10434 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 } while (j >= i &&
10436 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010437 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010438 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010439
Victor Stinner12bab6d2011-10-01 01:53:49 +020010440 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441}
10442
10443PyObject*
10444PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10445{
10446 unsigned char *data;
10447 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010448 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449
Victor Stinnerde636f32011-10-01 03:55:54 +020010450 if (PyUnicode_READY(self) == -1)
10451 return NULL;
10452
10453 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10454
Victor Stinner12bab6d2011-10-01 01:53:49 +020010455 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010457 if (PyUnicode_CheckExact(self)) {
10458 Py_INCREF(self);
10459 return self;
10460 }
10461 else
10462 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 }
10464
Victor Stinner12bab6d2011-10-01 01:53:49 +020010465 length = end - start;
10466 if (length == 1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 return unicode_getitem((PyUnicodeObject*)self, start);
10468
Victor Stinnerde636f32011-10-01 03:55:54 +020010469 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010470 PyErr_SetString(PyExc_IndexError, "string index out of range");
10471 return NULL;
10472 }
10473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 kind = PyUnicode_KIND(self);
10475 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010476 return PyUnicode_FromKindAndData(kind,
10477 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010478 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010482do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 int kind;
10485 void *data;
10486 Py_ssize_t len, i, j;
10487
10488 if (PyUnicode_READY(self) == -1)
10489 return NULL;
10490
10491 kind = PyUnicode_KIND(self);
10492 data = PyUnicode_DATA(self);
10493 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010494
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495 i = 0;
10496 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498 i++;
10499 }
10500 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010501
Benjamin Peterson14339b62009-01-31 16:36:08 +000010502 j = len;
10503 if (striptype != LEFTSTRIP) {
10504 do {
10505 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010507 j++;
10508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010509
Victor Stinner12bab6d2011-10-01 01:53:49 +020010510 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511}
10512
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010513
10514static PyObject *
10515do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010517 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010518
Benjamin Peterson14339b62009-01-31 16:36:08 +000010519 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10520 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010521
Benjamin Peterson14339b62009-01-31 16:36:08 +000010522 if (sep != NULL && sep != Py_None) {
10523 if (PyUnicode_Check(sep))
10524 return _PyUnicode_XStrip(self, striptype, sep);
10525 else {
10526 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 "%s arg must be None or str",
10528 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010529 return NULL;
10530 }
10531 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532
Benjamin Peterson14339b62009-01-31 16:36:08 +000010533 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010534}
10535
10536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010537PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010539\n\
10540Return a copy of the string S with leading and trailing\n\
10541whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010542If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010543
10544static PyObject *
10545unicode_strip(PyUnicodeObject *self, PyObject *args)
10546{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010547 if (PyTuple_GET_SIZE(args) == 0)
10548 return do_strip(self, BOTHSTRIP); /* Common case */
10549 else
10550 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010551}
10552
10553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010554PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010556\n\
10557Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010558If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010559
10560static PyObject *
10561unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10562{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 if (PyTuple_GET_SIZE(args) == 0)
10564 return do_strip(self, LEFTSTRIP); /* Common case */
10565 else
10566 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010567}
10568
10569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010570PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010572\n\
10573Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010574If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010575
10576static PyObject *
10577unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10578{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 if (PyTuple_GET_SIZE(args) == 0)
10580 return do_strip(self, RIGHTSTRIP); /* Common case */
10581 else
10582 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583}
10584
10585
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010587unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588{
10589 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
Georg Brandl222de0f2009-04-12 12:01:50 +000010592 if (len < 1) {
10593 Py_INCREF(unicode_empty);
10594 return (PyObject *)unicode_empty;
10595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Tim Peters7a29bd52001-09-12 03:03:31 +000010597 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 /* no repeat, return original string */
10599 Py_INCREF(str);
10600 return (PyObject*) str;
10601 }
Tim Peters8f422462000-09-09 06:13:41 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (PyUnicode_READY(str) == -1)
10604 return NULL;
10605
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010606 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010607 PyErr_SetString(PyExc_OverflowError,
10608 "repeated string is too long");
10609 return NULL;
10610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (!u)
10615 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010616 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (PyUnicode_GET_LENGTH(str) == 1) {
10619 const int kind = PyUnicode_KIND(str);
10620 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10621 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010622 if (kind == PyUnicode_1BYTE_KIND)
10623 memset(to, (unsigned char)fill_char, len);
10624 else {
10625 for (n = 0; n < len; ++n)
10626 PyUnicode_WRITE(kind, to, n, fill_char);
10627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 }
10629 else {
10630 /* number of characters copied this far */
10631 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10632 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10633 char *to = (char *) PyUnicode_DATA(u);
10634 Py_MEMCPY(to, PyUnicode_DATA(str),
10635 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 n = (done <= nchars-done) ? done : nchars-done;
10638 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 }
10642
10643 return (PyObject*) u;
10644}
10645
Alexander Belopolsky40018472011-02-26 01:02:56 +000010646PyObject *
10647PyUnicode_Replace(PyObject *obj,
10648 PyObject *subobj,
10649 PyObject *replobj,
10650 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651{
10652 PyObject *self;
10653 PyObject *str1;
10654 PyObject *str2;
10655 PyObject *result;
10656
10657 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010658 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010661 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 Py_DECREF(self);
10663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 }
10665 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020010666 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 Py_DECREF(self);
10668 Py_DECREF(str1);
10669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672 Py_DECREF(self);
10673 Py_DECREF(str1);
10674 Py_DECREF(str2);
10675 return result;
10676}
10677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010678PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010679 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680\n\
10681Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010682old replaced by new. If the optional argument count is\n\
10683given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684
10685static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 PyObject *str1;
10689 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010690 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 PyObject *result;
10692
Martin v. Löwis18e16552006-02-15 17:27:45 +000010693 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010696 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 str1 = PyUnicode_FromObject(str1);
10698 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10699 return NULL;
10700 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020010701 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 Py_DECREF(str1);
10703 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705
10706 result = replace(self, str1, str2, maxcount);
10707
10708 Py_DECREF(str1);
10709 Py_DECREF(str2);
10710 return result;
10711}
10712
Alexander Belopolsky40018472011-02-26 01:02:56 +000010713static PyObject *
10714unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010716 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 Py_ssize_t isize;
10718 Py_ssize_t osize, squote, dquote, i, o;
10719 Py_UCS4 max, quote;
10720 int ikind, okind;
10721 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010724 return NULL;
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 isize = PyUnicode_GET_LENGTH(unicode);
10727 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 /* Compute length of output, quote characters, and
10730 maximum character */
10731 osize = 2; /* quotes */
10732 max = 127;
10733 squote = dquote = 0;
10734 ikind = PyUnicode_KIND(unicode);
10735 for (i = 0; i < isize; i++) {
10736 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10737 switch (ch) {
10738 case '\'': squote++; osize++; break;
10739 case '"': dquote++; osize++; break;
10740 case '\\': case '\t': case '\r': case '\n':
10741 osize += 2; break;
10742 default:
10743 /* Fast-path ASCII */
10744 if (ch < ' ' || ch == 0x7f)
10745 osize += 4; /* \xHH */
10746 else if (ch < 0x7f)
10747 osize++;
10748 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10749 osize++;
10750 max = ch > max ? ch : max;
10751 }
10752 else if (ch < 0x100)
10753 osize += 4; /* \xHH */
10754 else if (ch < 0x10000)
10755 osize += 6; /* \uHHHH */
10756 else
10757 osize += 10; /* \uHHHHHHHH */
10758 }
10759 }
10760
10761 quote = '\'';
10762 if (squote) {
10763 if (dquote)
10764 /* Both squote and dquote present. Use squote,
10765 and escape them */
10766 osize += squote;
10767 else
10768 quote = '"';
10769 }
10770
10771 repr = PyUnicode_New(osize, max);
10772 if (repr == NULL)
10773 return NULL;
10774 okind = PyUnicode_KIND(repr);
10775 odata = PyUnicode_DATA(repr);
10776
10777 PyUnicode_WRITE(okind, odata, 0, quote);
10778 PyUnicode_WRITE(okind, odata, osize-1, quote);
10779
10780 for (i = 0, o = 1; i < isize; i++) {
10781 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010782
10783 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if ((ch == quote) || (ch == '\\')) {
10785 PyUnicode_WRITE(okind, odata, o++, '\\');
10786 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010787 continue;
10788 }
10789
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010791 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 PyUnicode_WRITE(okind, odata, o++, '\\');
10793 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010794 }
10795 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 PyUnicode_WRITE(okind, odata, o++, '\\');
10797 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010798 }
10799 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 PyUnicode_WRITE(okind, odata, o++, '\\');
10801 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010802 }
10803
10804 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010805 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 PyUnicode_WRITE(okind, odata, o++, '\\');
10807 PyUnicode_WRITE(okind, odata, o++, 'x');
10808 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10809 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010810 }
10811
Georg Brandl559e5d72008-06-11 18:37:52 +000010812 /* Copy ASCII characters as-is */
10813 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 }
10816
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010818 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010820 (categories Z* and C* except ASCII space)
10821 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010823 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 if (ch <= 0xff) {
10825 PyUnicode_WRITE(okind, odata, o++, '\\');
10826 PyUnicode_WRITE(okind, odata, o++, 'x');
10827 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10828 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010829 }
10830 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 else if (ch >= 0x10000) {
10832 PyUnicode_WRITE(okind, odata, o++, '\\');
10833 PyUnicode_WRITE(okind, odata, o++, 'U');
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10836 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10837 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10839 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10840 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10841 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010842 }
10843 /* Map 16-bit characters to '\uxxxx' */
10844 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 PyUnicode_WRITE(okind, odata, o++, '\\');
10846 PyUnicode_WRITE(okind, odata, o++, 'u');
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010851 }
10852 }
10853 /* Copy characters as-is */
10854 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010856 }
10857 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010860 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861}
10862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010863PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010864 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865\n\
10866Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010867such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868arguments start and end are interpreted as in slice notation.\n\
10869\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
10872static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874{
Jesus Ceaac451502011-04-20 17:09:23 +020010875 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010876 Py_ssize_t start;
10877 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010878 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
Jesus Ceaac451502011-04-20 17:09:23 +020010880 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10881 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 if (PyUnicode_READY(self) == -1)
10885 return NULL;
10886 if (PyUnicode_READY(substring) == -1)
10887 return NULL;
10888
10889 result = any_find_slice(
10890 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10891 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010892 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
10894 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 if (result == -2)
10897 return NULL;
10898
Christian Heimes217cfd12007-12-02 14:31:20 +000010899 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900}
10901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010902PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906
10907static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909{
Jesus Ceaac451502011-04-20 17:09:23 +020010910 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010911 Py_ssize_t start;
10912 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010913 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914
Jesus Ceaac451502011-04-20 17:09:23 +020010915 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10916 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (PyUnicode_READY(self) == -1)
10920 return NULL;
10921 if (PyUnicode_READY(substring) == -1)
10922 return NULL;
10923
10924 result = any_find_slice(
10925 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10926 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 if (result == -2)
10932 return NULL;
10933
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 if (result < 0) {
10935 PyErr_SetString(PyExc_ValueError, "substring not found");
10936 return NULL;
10937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938
Christian Heimes217cfd12007-12-02 14:31:20 +000010939 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940}
10941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010945Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010946done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948static PyObject *
10949unicode_rjust(PyUnicodeObject *self, PyObject *args)
10950{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010951 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 Py_UCS4 fillchar = ' ';
10953
Victor Stinnere9a29352011-10-01 02:14:59 +020010954 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010956
Victor Stinnere9a29352011-10-01 02:14:59 +020010957 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 return NULL;
10959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961 Py_INCREF(self);
10962 return (PyObject*) self;
10963 }
10964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966}
10967
Alexander Belopolsky40018472011-02-26 01:02:56 +000010968PyObject *
10969PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970{
10971 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010972
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 s = PyUnicode_FromObject(s);
10974 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010975 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 if (sep != NULL) {
10977 sep = PyUnicode_FromObject(sep);
10978 if (sep == NULL) {
10979 Py_DECREF(s);
10980 return NULL;
10981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982 }
10983
10984 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10985
10986 Py_DECREF(s);
10987 Py_XDECREF(sep);
10988 return result;
10989}
10990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010991PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993\n\
10994Return a list of the words in S, using sep as the\n\
10995delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000010996splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000010997whitespace string is a separator and empty strings are\n\
10998removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
11000static PyObject*
11001unicode_split(PyUnicodeObject *self, PyObject *args)
11002{
11003 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011004 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
Martin v. Löwis18e16552006-02-15 17:27:45 +000011006 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 return NULL;
11008
11009 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015}
11016
Thomas Wouters477c8d52006-05-27 19:21:47 +000011017PyObject *
11018PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11019{
11020 PyObject* str_obj;
11021 PyObject* sep_obj;
11022 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 int kind1, kind2, kind;
11024 void *buf1 = NULL, *buf2 = NULL;
11025 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011026
11027 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011028 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011030 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011032 Py_DECREF(str_obj);
11033 return NULL;
11034 }
11035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 kind1 = PyUnicode_KIND(str_in);
11037 kind2 = PyUnicode_KIND(sep_obj);
11038 kind = kind1 > kind2 ? kind1 : kind2;
11039 buf1 = PyUnicode_DATA(str_in);
11040 if (kind1 != kind)
11041 buf1 = _PyUnicode_AsKind(str_in, kind);
11042 if (!buf1)
11043 goto onError;
11044 buf2 = PyUnicode_DATA(sep_obj);
11045 if (kind2 != kind)
11046 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11047 if (!buf2)
11048 goto onError;
11049 len1 = PyUnicode_GET_LENGTH(str_obj);
11050 len2 = PyUnicode_GET_LENGTH(sep_obj);
11051
11052 switch(PyUnicode_KIND(str_in)) {
11053 case PyUnicode_1BYTE_KIND:
11054 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11055 break;
11056 case PyUnicode_2BYTE_KIND:
11057 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11058 break;
11059 case PyUnicode_4BYTE_KIND:
11060 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11061 break;
11062 default:
11063 assert(0);
11064 out = 0;
11065 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066
11067 Py_DECREF(sep_obj);
11068 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (kind1 != kind)
11070 PyMem_Free(buf1);
11071 if (kind2 != kind)
11072 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011073
11074 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 onError:
11076 Py_DECREF(sep_obj);
11077 Py_DECREF(str_obj);
11078 if (kind1 != kind && buf1)
11079 PyMem_Free(buf1);
11080 if (kind2 != kind && buf2)
11081 PyMem_Free(buf2);
11082 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011083}
11084
11085
11086PyObject *
11087PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11088{
11089 PyObject* str_obj;
11090 PyObject* sep_obj;
11091 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 int kind1, kind2, kind;
11093 void *buf1 = NULL, *buf2 = NULL;
11094 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095
11096 str_obj = PyUnicode_FromObject(str_in);
11097 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099 sep_obj = PyUnicode_FromObject(sep_in);
11100 if (!sep_obj) {
11101 Py_DECREF(str_obj);
11102 return NULL;
11103 }
11104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 kind1 = PyUnicode_KIND(str_in);
11106 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011107 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 buf1 = PyUnicode_DATA(str_in);
11109 if (kind1 != kind)
11110 buf1 = _PyUnicode_AsKind(str_in, kind);
11111 if (!buf1)
11112 goto onError;
11113 buf2 = PyUnicode_DATA(sep_obj);
11114 if (kind2 != kind)
11115 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11116 if (!buf2)
11117 goto onError;
11118 len1 = PyUnicode_GET_LENGTH(str_obj);
11119 len2 = PyUnicode_GET_LENGTH(sep_obj);
11120
11121 switch(PyUnicode_KIND(str_in)) {
11122 case PyUnicode_1BYTE_KIND:
11123 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11124 break;
11125 case PyUnicode_2BYTE_KIND:
11126 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11127 break;
11128 case PyUnicode_4BYTE_KIND:
11129 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11130 break;
11131 default:
11132 assert(0);
11133 out = 0;
11134 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135
11136 Py_DECREF(sep_obj);
11137 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (kind1 != kind)
11139 PyMem_Free(buf1);
11140 if (kind2 != kind)
11141 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011142
11143 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 onError:
11145 Py_DECREF(sep_obj);
11146 Py_DECREF(str_obj);
11147 if (kind1 != kind && buf1)
11148 PyMem_Free(buf1);
11149 if (kind2 != kind && buf2)
11150 PyMem_Free(buf2);
11151 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011152}
11153
11154PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011155 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011156\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011157Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011159found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011160
11161static PyObject*
11162unicode_partition(PyUnicodeObject *self, PyObject *separator)
11163{
11164 return PyUnicode_Partition((PyObject *)self, separator);
11165}
11166
11167PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011168 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011170Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011172separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011173
11174static PyObject*
11175unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11176{
11177 return PyUnicode_RPartition((PyObject *)self, separator);
11178}
11179
Alexander Belopolsky40018472011-02-26 01:02:56 +000011180PyObject *
11181PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011182{
11183 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011184
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011185 s = PyUnicode_FromObject(s);
11186 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011187 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 if (sep != NULL) {
11189 sep = PyUnicode_FromObject(sep);
11190 if (sep == NULL) {
11191 Py_DECREF(s);
11192 return NULL;
11193 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011194 }
11195
11196 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11197
11198 Py_DECREF(s);
11199 Py_XDECREF(sep);
11200 return result;
11201}
11202
11203PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011205\n\
11206Return a list of the words in S, using sep as the\n\
11207delimiter string, starting at the end of the string and\n\
11208working to the front. If maxsplit is given, at most maxsplit\n\
11209splits are done. If sep is not specified, any whitespace string\n\
11210is a separator.");
11211
11212static PyObject*
11213unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11214{
11215 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011216 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011217
Martin v. Löwis18e16552006-02-15 17:27:45 +000011218 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011219 return NULL;
11220
11221 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011223 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011225 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011227}
11228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231\n\
11232Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011233Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011234is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011237unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011239 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011240 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011242 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11243 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 return NULL;
11245
Guido van Rossum86662912000-04-11 15:38:46 +000011246 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247}
11248
11249static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011250PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Walter Dörwald346737f2007-05-31 10:44:43 +000011252 if (PyUnicode_CheckExact(self)) {
11253 Py_INCREF(self);
11254 return self;
11255 } else
11256 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011257 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
11263Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011267unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 return fixup(self, fixswapcase);
11270}
11271
Georg Brandlceee0772007-11-27 23:48:05 +000011272PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011274\n\
11275Return a translation table usable for str.translate().\n\
11276If there is only one argument, it must be a dictionary mapping Unicode\n\
11277ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011278Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011279If there are two arguments, they must be strings of equal length, and\n\
11280in the resulting dictionary, each character in x will be mapped to the\n\
11281character at the same position in y. If there is a third argument, it\n\
11282must be a string, whose characters will be mapped to None in the result.");
11283
11284static PyObject*
11285unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11286{
11287 PyObject *x, *y = NULL, *z = NULL;
11288 PyObject *new = NULL, *key, *value;
11289 Py_ssize_t i = 0;
11290 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291
Georg Brandlceee0772007-11-27 23:48:05 +000011292 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11293 return NULL;
11294 new = PyDict_New();
11295 if (!new)
11296 return NULL;
11297 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 int x_kind, y_kind, z_kind;
11299 void *x_data, *y_data, *z_data;
11300
Georg Brandlceee0772007-11-27 23:48:05 +000011301 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011302 if (!PyUnicode_Check(x)) {
11303 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11304 "be a string if there is a second argument");
11305 goto err;
11306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011308 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11309 "arguments must have equal length");
11310 goto err;
11311 }
11312 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 x_kind = PyUnicode_KIND(x);
11314 y_kind = PyUnicode_KIND(y);
11315 x_data = PyUnicode_DATA(x);
11316 y_data = PyUnicode_DATA(y);
11317 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11318 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11319 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011320 if (!key || !value)
11321 goto err;
11322 res = PyDict_SetItem(new, key, value);
11323 Py_DECREF(key);
11324 Py_DECREF(value);
11325 if (res < 0)
11326 goto err;
11327 }
11328 /* create entries for deleting chars in z */
11329 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 z_kind = PyUnicode_KIND(z);
11331 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011332 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011334 if (!key)
11335 goto err;
11336 res = PyDict_SetItem(new, key, Py_None);
11337 Py_DECREF(key);
11338 if (res < 0)
11339 goto err;
11340 }
11341 }
11342 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 int kind;
11344 void *data;
11345
Georg Brandlceee0772007-11-27 23:48:05 +000011346 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011347 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011348 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11349 "to maketrans it must be a dict");
11350 goto err;
11351 }
11352 /* copy entries into the new dict, converting string keys to int keys */
11353 while (PyDict_Next(x, &i, &key, &value)) {
11354 if (PyUnicode_Check(key)) {
11355 /* convert string keys to integer keys */
11356 PyObject *newkey;
11357 if (PyUnicode_GET_SIZE(key) != 1) {
11358 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11359 "table must be of length 1");
11360 goto err;
11361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 kind = PyUnicode_KIND(key);
11363 data = PyUnicode_DATA(key);
11364 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011365 if (!newkey)
11366 goto err;
11367 res = PyDict_SetItem(new, newkey, value);
11368 Py_DECREF(newkey);
11369 if (res < 0)
11370 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011371 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011372 /* just keep integer keys */
11373 if (PyDict_SetItem(new, key, value) < 0)
11374 goto err;
11375 } else {
11376 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11377 "be strings or integers");
11378 goto err;
11379 }
11380 }
11381 }
11382 return new;
11383 err:
11384 Py_DECREF(new);
11385 return NULL;
11386}
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390\n\
11391Return a copy of the string S, where all characters have been mapped\n\
11392through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011393Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011394Unmapped characters are left untouched. Characters mapped to None\n\
11395are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
11397static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401}
11402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011409unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 return fixup(self, fixupper);
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011417Pad a numeric string S with zeros on the left, to fill a field\n\
11418of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject *
11421unicode_zfill(PyUnicodeObject *self, PyObject *args)
11422{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011423 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011425 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 int kind;
11427 void *data;
11428 Py_UCS4 chr;
11429
11430 if (PyUnicode_READY(self) == -1)
11431 return NULL;
11432
Martin v. Löwis18e16552006-02-15 17:27:45 +000011433 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 return NULL;
11435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011437 if (PyUnicode_CheckExact(self)) {
11438 Py_INCREF(self);
11439 return (PyObject*) self;
11440 }
11441 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011442 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443 }
11444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447 u = pad(self, fill, 0, '0');
11448
Walter Dörwald068325e2002-04-15 13:36:47 +000011449 if (u == NULL)
11450 return NULL;
11451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 kind = PyUnicode_KIND(u);
11453 data = PyUnicode_DATA(u);
11454 chr = PyUnicode_READ(kind, data, fill);
11455
11456 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 PyUnicode_WRITE(kind, data, 0, chr);
11459 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 }
11461
11462 return (PyObject*) u;
11463}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011476Return True if S starts with the specified prefix, False otherwise.\n\
11477With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011478With optional end, stop comparing S at that position.\n\
11479prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
11481static PyObject *
11482unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011485 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011487 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011488 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011489 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Jesus Ceaac451502011-04-20 17:09:23 +020011491 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011493 if (PyTuple_Check(subobj)) {
11494 Py_ssize_t i;
11495 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11496 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011498 if (substring == NULL)
11499 return NULL;
11500 result = tailmatch(self, substring, start, end, -1);
11501 Py_DECREF(substring);
11502 if (result) {
11503 Py_RETURN_TRUE;
11504 }
11505 }
11506 /* nothing matched */
11507 Py_RETURN_FALSE;
11508 }
11509 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011510 if (substring == NULL) {
11511 if (PyErr_ExceptionMatches(PyExc_TypeError))
11512 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11513 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011515 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011516 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011518 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519}
11520
11521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011522PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011525Return True if S ends with the specified suffix, False otherwise.\n\
11526With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011527With optional end, stop comparing S at that position.\n\
11528suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
11530static PyObject *
11531unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011534 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011536 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011537 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011538 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Jesus Ceaac451502011-04-20 17:09:23 +020011540 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 if (PyTuple_Check(subobj)) {
11543 Py_ssize_t i;
11544 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11545 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011547 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549 result = tailmatch(self, substring, start, end, +1);
11550 Py_DECREF(substring);
11551 if (result) {
11552 Py_RETURN_TRUE;
11553 }
11554 }
11555 Py_RETURN_FALSE;
11556 }
11557 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011558 if (substring == NULL) {
11559 if (PyErr_ExceptionMatches(PyExc_TypeError))
11560 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11561 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011563 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011564 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011566 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567}
11568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011570
11571PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011574Return a formatted version of S, using substitutions from args and kwargs.\n\
11575The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011576
Eric Smith27bbca62010-11-04 17:06:58 +000011577PyDoc_STRVAR(format_map__doc__,
11578 "S.format_map(mapping) -> str\n\
11579\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011580Return a formatted version of S, using substitutions from mapping.\n\
11581The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011582
Eric Smith4a7d76d2008-05-30 18:10:19 +000011583static PyObject *
11584unicode__format__(PyObject* self, PyObject* args)
11585{
11586 PyObject *format_spec;
11587
11588 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11589 return NULL;
11590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11592 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011593}
11594
Eric Smith8c663262007-08-25 02:26:07 +000011595PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011597\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011598Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011599
11600static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011601unicode__sizeof__(PyUnicodeObject *v)
11602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 Py_ssize_t size;
11604
11605 /* If it's a compact object, account for base structure +
11606 character data. */
11607 if (PyUnicode_IS_COMPACT_ASCII(v))
11608 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11609 else if (PyUnicode_IS_COMPACT(v))
11610 size = sizeof(PyCompactUnicodeObject) +
11611 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11612 else {
11613 /* If it is a two-block object, account for base object, and
11614 for character block if present. */
11615 size = sizeof(PyUnicodeObject);
11616 if (v->data.any)
11617 size += (PyUnicode_GET_LENGTH(v) + 1) *
11618 PyUnicode_CHARACTER_SIZE(v);
11619 }
11620 /* If the wstr pointer is present, account for it unless it is shared
11621 with the data pointer. Since PyUnicode_DATA will crash if the object
11622 is not ready, check whether it's either not ready (in which case the
11623 data is entirely in wstr) or if the data is not shared. */
11624 if (_PyUnicode_WSTR(v) &&
11625 (!PyUnicode_IS_READY(v) ||
11626 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11627 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11628 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11629 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11630
11631 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011632}
11633
/* Docstring for str.__sizeof__(); attached to the "__sizeof__" entry of
   unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011636
11637static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011638unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011639{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011640 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (!copy)
11642 return NULL;
11643 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011644}
11645
/* Method table for the str type.  Each entry is
   {name, C function, calling-convention flags, docstring}; the table is
   terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only METH_STATIC entry: it is called without a
       str instance. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* Compiled out: capwords is not exposed as a method. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__ (the doc field is left
       to its default initializer). */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
11709
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011710static PyObject *
11711unicode_mod(PyObject *v, PyObject *w)
11712{
Brian Curtindfc80e32011-08-10 20:28:54 -050011713 if (!PyUnicode_Check(v))
11714 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011716}
11717
/* Number protocol slots for str.  Only nb_remainder (the % operator) is
   provided; the slots listed before it are explicitly 0 and every slot
   after it is left to default zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
11724
/* Sequence protocol slots for str.  Slots set to 0 are not provided,
   including the two assignment slots (sq_ass_item, sq_ass_slice). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
11735
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011736static PyObject*
11737unicode_subscript(PyUnicodeObject* self, PyObject* item)
11738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (PyUnicode_READY(self) == -1)
11740 return NULL;
11741
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011742 if (PyIndex_Check(item)) {
11743 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011744 if (i == -1 && PyErr_Occurred())
11745 return NULL;
11746 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011748 return unicode_getitem(self, i);
11749 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011750 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011752 Py_UNICODE* result_buf;
11753 PyObject* result;
11754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011757 return NULL;
11758 }
11759
11760 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 return PyUnicode_New(0, 0);
11762 } else if (start == 0 && step == 1 &&
11763 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011764 PyUnicode_CheckExact(self)) {
11765 Py_INCREF(self);
11766 return (PyObject *)self;
11767 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011768 return PyUnicode_Substring((PyObject*)self,
11769 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011770 } else {
11771 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011772 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11773 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 if (result_buf == NULL)
11776 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011777
11778 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11779 result_buf[i] = source_buf[cur];
11780 }
Tim Petersced69f82003-09-16 20:30:58 +000011781
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011782 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011783 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011784 return result;
11785 }
11786 } else {
11787 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11788 return NULL;
11789 }
11790}
11791
/* Mapping protocol slots for str: length and subscripting are provided;
   mp_ass_subscript is 0, so no subscript assignment is offered here. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
11797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799/* Helpers for PyUnicode_Format() */
11800
11801static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011802getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011804 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 (*p_argidx)++;
11807 if (arglen < 0)
11808 return args;
11809 else
11810 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
11812 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 return NULL;
11815}
11816
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011817/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011819static PyObject *
11820formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011822 char *p;
11823 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011825
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 x = PyFloat_AsDouble(v);
11827 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011828 return NULL;
11829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011832
Eric Smith0923d1d2009-04-16 20:16:10 +000011833 p = PyOS_double_to_string(x, type, prec,
11834 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011835 if (p == NULL)
11836 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011838 PyMem_Free(p);
11839 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840}
11841
Tim Peters38fd5b62000-09-21 05:43:11 +000011842static PyObject*
11843formatlong(PyObject *val, int flags, int prec, int type)
11844{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011845 char *buf;
11846 int len;
11847 PyObject *str; /* temporary string object. */
11848 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011849
Benjamin Peterson14339b62009-01-31 16:36:08 +000011850 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11851 if (!str)
11852 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011854 Py_DECREF(str);
11855 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011856}
11857
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011860 size_t buflen,
11861 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011863 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011864 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (PyUnicode_GET_LENGTH(v) == 1) {
11866 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 buf[1] = '\0';
11868 return 1;
11869 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 goto onError;
11871 }
11872 else {
11873 /* Integer input truncated to a character */
11874 long x;
11875 x = PyLong_AsLong(v);
11876 if (x == -1 && PyErr_Occurred())
11877 goto onError;
11878
11879 if (x < 0 || x > 0x10ffff) {
11880 PyErr_SetString(PyExc_OverflowError,
11881 "%c arg not in range(0x110000)");
11882 return -1;
11883 }
11884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011886 buf[1] = '\0';
11887 return 1;
11888 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011889
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011891 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011893 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UCS4 elements in the scratch buffer
   (formatbuf) that PyUnicode_Format() declares per conversion; that
   buffer is used for the '%c' and '%%' conversions.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011900
Alexander Belopolsky40018472011-02-26 01:02:56 +000011901PyObject *
11902PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 void *fmt;
11905 int fmtkind;
11906 PyObject *result;
11907 Py_UCS4 *res, *res0;
11908 Py_UCS4 max;
11909 int kind;
11910 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011914
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 PyErr_BadInternalCall();
11917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11920 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 fmt = PyUnicode_DATA(uformat);
11923 fmtkind = PyUnicode_KIND(uformat);
11924 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11925 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
11927 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11929 if (res0 == NULL) {
11930 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 arglen = PyTuple_Size(args);
11936 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 }
11938 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 arglen = -1;
11940 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011942 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011943 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945
11946 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 if (--rescnt < 0) {
11949 rescnt = fmtcnt + 100;
11950 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11952 if (res0 == NULL){
11953 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 }
11956 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011960 }
11961 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 /* Got a format specifier */
11963 int flags = 0;
11964 Py_ssize_t width = -1;
11965 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 Py_UCS4 c = '\0';
11967 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 int isnumok;
11969 PyObject *v = NULL;
11970 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 void *pbuf;
11972 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 Py_ssize_t len, len1;
11975 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 fmtpos++;
11978 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
11979 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 Py_ssize_t keylen;
11981 PyObject *key;
11982 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000011983
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 if (dict == NULL) {
11985 PyErr_SetString(PyExc_TypeError,
11986 "format requires a mapping");
11987 goto onError;
11988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 /* Skip over balanced parentheses */
11993 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 if (fmtcnt < 0 || pcount > 0) {
12002 PyErr_SetString(PyExc_ValueError,
12003 "incomplete format key");
12004 goto onError;
12005 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012006 key = PyUnicode_Substring((PyObject*)uformat,
12007 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (key == NULL)
12009 goto onError;
12010 if (args_owned) {
12011 Py_DECREF(args);
12012 args_owned = 0;
12013 }
12014 args = PyObject_GetItem(dict, key);
12015 Py_DECREF(key);
12016 if (args == NULL) {
12017 goto onError;
12018 }
12019 args_owned = 1;
12020 arglen = -1;
12021 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012022 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 case '-': flags |= F_LJUST; continue;
12026 case '+': flags |= F_SIGN; continue;
12027 case ' ': flags |= F_BLANK; continue;
12028 case '#': flags |= F_ALT; continue;
12029 case '0': flags |= F_ZERO; continue;
12030 }
12031 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 if (c == '*') {
12034 v = getnextarg(args, arglen, &argidx);
12035 if (v == NULL)
12036 goto onError;
12037 if (!PyLong_Check(v)) {
12038 PyErr_SetString(PyExc_TypeError,
12039 "* wants int");
12040 goto onError;
12041 }
12042 width = PyLong_AsLong(v);
12043 if (width == -1 && PyErr_Occurred())
12044 goto onError;
12045 if (width < 0) {
12046 flags |= F_LJUST;
12047 width = -width;
12048 }
12049 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 }
12052 else if (c >= '0' && c <= '9') {
12053 width = c - '0';
12054 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 if (c < '0' || c > '9')
12057 break;
12058 if ((width*10) / 10 != width) {
12059 PyErr_SetString(PyExc_ValueError,
12060 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012061 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 }
12063 width = width*10 + (c - '0');
12064 }
12065 }
12066 if (c == '.') {
12067 prec = 0;
12068 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 if (c == '*') {
12071 v = getnextarg(args, arglen, &argidx);
12072 if (v == NULL)
12073 goto onError;
12074 if (!PyLong_Check(v)) {
12075 PyErr_SetString(PyExc_TypeError,
12076 "* wants int");
12077 goto onError;
12078 }
12079 prec = PyLong_AsLong(v);
12080 if (prec == -1 && PyErr_Occurred())
12081 goto onError;
12082 if (prec < 0)
12083 prec = 0;
12084 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 }
12087 else if (c >= '0' && c <= '9') {
12088 prec = c - '0';
12089 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 if (c < '0' || c > '9')
12092 break;
12093 if ((prec*10) / 10 != prec) {
12094 PyErr_SetString(PyExc_ValueError,
12095 "prec too big");
12096 goto onError;
12097 }
12098 prec = prec*10 + (c - '0');
12099 }
12100 }
12101 } /* prec */
12102 if (fmtcnt >= 0) {
12103 if (c == 'h' || c == 'l' || c == 'L') {
12104 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 }
12107 }
12108 if (fmtcnt < 0) {
12109 PyErr_SetString(PyExc_ValueError,
12110 "incomplete format");
12111 goto onError;
12112 }
12113 if (c != '%') {
12114 v = getnextarg(args, arglen, &argidx);
12115 if (v == NULL)
12116 goto onError;
12117 }
12118 sign = 0;
12119 fill = ' ';
12120 switch (c) {
12121
12122 case '%':
12123 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 len = 1;
12128 break;
12129
12130 case 's':
12131 case 'r':
12132 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012133 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 temp = v;
12135 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 }
12137 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 if (c == 's')
12139 temp = PyObject_Str(v);
12140 else if (c == 'r')
12141 temp = PyObject_Repr(v);
12142 else
12143 temp = PyObject_ASCII(v);
12144 if (temp == NULL)
12145 goto onError;
12146 if (PyUnicode_Check(temp))
12147 /* nothing to do */;
12148 else {
12149 Py_DECREF(temp);
12150 PyErr_SetString(PyExc_TypeError,
12151 "%s argument has non-string str()");
12152 goto onError;
12153 }
12154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (PyUnicode_READY(temp) == -1) {
12156 Py_CLEAR(temp);
12157 goto onError;
12158 }
12159 pbuf = PyUnicode_DATA(temp);
12160 kind = PyUnicode_KIND(temp);
12161 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 if (prec >= 0 && len > prec)
12163 len = prec;
12164 break;
12165
12166 case 'i':
12167 case 'd':
12168 case 'u':
12169 case 'o':
12170 case 'x':
12171 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 isnumok = 0;
12173 if (PyNumber_Check(v)) {
12174 PyObject *iobj=NULL;
12175
12176 if (PyLong_Check(v)) {
12177 iobj = v;
12178 Py_INCREF(iobj);
12179 }
12180 else {
12181 iobj = PyNumber_Long(v);
12182 }
12183 if (iobj!=NULL) {
12184 if (PyLong_Check(iobj)) {
12185 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012186 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 Py_DECREF(iobj);
12188 if (!temp)
12189 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (PyUnicode_READY(temp) == -1) {
12191 Py_CLEAR(temp);
12192 goto onError;
12193 }
12194 pbuf = PyUnicode_DATA(temp);
12195 kind = PyUnicode_KIND(temp);
12196 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 sign = 1;
12198 }
12199 else {
12200 Py_DECREF(iobj);
12201 }
12202 }
12203 }
12204 if (!isnumok) {
12205 PyErr_Format(PyExc_TypeError,
12206 "%%%c format: a number is required, "
12207 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12208 goto onError;
12209 }
12210 if (flags & F_ZERO)
12211 fill = '0';
12212 break;
12213
12214 case 'e':
12215 case 'E':
12216 case 'f':
12217 case 'F':
12218 case 'g':
12219 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012220 temp = formatfloat(v, flags, prec, c);
12221 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (PyUnicode_READY(temp) == -1) {
12224 Py_CLEAR(temp);
12225 goto onError;
12226 }
12227 pbuf = PyUnicode_DATA(temp);
12228 kind = PyUnicode_KIND(temp);
12229 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 sign = 1;
12231 if (flags & F_ZERO)
12232 fill = '0';
12233 break;
12234
12235 case 'c':
12236 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012238 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 if (len < 0)
12240 goto onError;
12241 break;
12242
12243 default:
12244 PyErr_Format(PyExc_ValueError,
12245 "unsupported format character '%c' (0x%x) "
12246 "at index %zd",
12247 (31<=c && c<=126) ? (char)c : '?',
12248 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 goto onError;
12251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 /* pbuf is initialized here. */
12253 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12256 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12257 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 len--;
12259 }
12260 else if (flags & F_SIGN)
12261 sign = '+';
12262 else if (flags & F_BLANK)
12263 sign = ' ';
12264 else
12265 sign = 0;
12266 }
12267 if (width < len)
12268 width = len;
12269 if (rescnt - (sign != 0) < width) {
12270 reslen -= rescnt;
12271 rescnt = width + fmtcnt + 100;
12272 reslen += rescnt;
12273 if (reslen < 0) {
12274 Py_XDECREF(temp);
12275 PyErr_NoMemory();
12276 goto onError;
12277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12279 if (res0 == 0) {
12280 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 Py_XDECREF(temp);
12282 goto onError;
12283 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 }
12286 if (sign) {
12287 if (fill != ' ')
12288 *res++ = sign;
12289 rescnt--;
12290 if (width > len)
12291 width--;
12292 }
12293 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12295 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12298 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 }
12300 rescnt -= 2;
12301 width -= 2;
12302 if (width < 0)
12303 width = 0;
12304 len -= 2;
12305 }
12306 if (width > len && !(flags & F_LJUST)) {
12307 do {
12308 --rescnt;
12309 *res++ = fill;
12310 } while (--width > len);
12311 }
12312 if (fill == ' ') {
12313 if (sign)
12314 *res++ = sign;
12315 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12317 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12318 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12319 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 }
12321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 /* Copy all characters, preserving len */
12323 len1 = len;
12324 while (len1--) {
12325 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12326 rescnt--;
12327 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 while (--width >= len) {
12329 --rescnt;
12330 *res++ = ' ';
12331 }
12332 if (dict && (argidx < arglen) && c != '%') {
12333 PyErr_SetString(PyExc_TypeError,
12334 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012335 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 goto onError;
12337 }
12338 Py_XDECREF(temp);
12339 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340 } /* until end */
12341 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 PyErr_SetString(PyExc_TypeError,
12343 "not all arguments converted during string formatting");
12344 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345 }
12346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347
12348 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12349 if (*res > max)
12350 max = *res;
12351 result = PyUnicode_New(reslen - rescnt, max);
12352 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 kind = PyUnicode_KIND(result);
12355 for (res = res0; res < res0+reslen-rescnt; res++)
12356 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12357 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360 }
12361 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 return (PyObject *)result;
12363
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 Py_DECREF(uformat);
12367 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 }
12370 return NULL;
12371}
12372
Jeremy Hylton938ace62002-07-17 16:30:39 +000012373static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012374unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12375
Tim Peters6d6c1a32001-08-02 04:15:00 +000012376static PyObject *
12377unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12378{
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380 static char *kwlist[] = {"object", "encoding", "errors", 0};
12381 char *encoding = NULL;
12382 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012383
Benjamin Peterson14339b62009-01-31 16:36:08 +000012384 if (type != &PyUnicode_Type)
12385 return unicode_subtype_new(type, args, kwds);
12386 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 return NULL;
12389 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 if (encoding == NULL && errors == NULL)
12392 return PyObject_Str(x);
12393 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012395}
12396
Guido van Rossume023fe02001-08-30 03:12:59 +000012397static PyObject *
12398unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012400 PyUnicodeObject *tmp, *pnew;
12401 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012403
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12405 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12406 if (tmp == NULL)
12407 return NULL;
12408 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12410 // it seems kind of strange that tp_alloc gets passed the size
12411 // of the unicode string because there will follow another
12412 // malloc.
12413 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12414 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 if (pnew == NULL) {
12416 Py_DECREF(tmp);
12417 return NULL;
12418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12420 if (_PyUnicode_WSTR(pnew) == NULL) {
12421 err = PyErr_NoMemory();
12422 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12425 _PyUnicode_WSTR_LENGTH(pnew) = n;
12426 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12427 _PyUnicode_STATE(pnew).interned = 0;
12428 _PyUnicode_STATE(pnew).kind = 0;
12429 _PyUnicode_STATE(pnew).compact = 0;
12430 _PyUnicode_STATE(pnew).ready = 0;
12431 _PyUnicode_STATE(pnew).ascii = 0;
12432 pnew->data.any = NULL;
12433 _PyUnicode_LENGTH(pnew) = 0;
12434 pnew->_base.utf8 = NULL;
12435 pnew->_base.utf8_length = 0;
12436
12437 if (PyUnicode_READY(pnew) == -1) {
12438 PyObject_FREE(_PyUnicode_WSTR(pnew));
12439 goto onError;
12440 }
12441
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 Py_DECREF(tmp);
12443 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444
12445 onError:
12446 _Py_ForgetReference((PyObject *)pnew);
12447 PyObject_Del(pnew);
12448 Py_DECREF(tmp);
12449 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012450}
12451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012452PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012454\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012455Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012456encoding defaults to the current default string encoding.\n\
12457errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012458
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012459static PyObject *unicode_iter(PyObject *seq);
12460
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012462 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 "str", /* tp_name */
12464 sizeof(PyUnicodeObject), /* tp_size */
12465 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 (destructor)unicode_dealloc, /* tp_dealloc */
12468 0, /* tp_print */
12469 0, /* tp_getattr */
12470 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012471 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012472 unicode_repr, /* tp_repr */
12473 &unicode_as_number, /* tp_as_number */
12474 &unicode_as_sequence, /* tp_as_sequence */
12475 &unicode_as_mapping, /* tp_as_mapping */
12476 (hashfunc) unicode_hash, /* tp_hash*/
12477 0, /* tp_call*/
12478 (reprfunc) unicode_str, /* tp_str */
12479 PyObject_GenericGetAttr, /* tp_getattro */
12480 0, /* tp_setattro */
12481 0, /* tp_as_buffer */
12482 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012484 unicode_doc, /* tp_doc */
12485 0, /* tp_traverse */
12486 0, /* tp_clear */
12487 PyUnicode_RichCompare, /* tp_richcompare */
12488 0, /* tp_weaklistoffset */
12489 unicode_iter, /* tp_iter */
12490 0, /* tp_iternext */
12491 unicode_methods, /* tp_methods */
12492 0, /* tp_members */
12493 0, /* tp_getset */
12494 &PyBaseObject_Type, /* tp_base */
12495 0, /* tp_dict */
12496 0, /* tp_descr_get */
12497 0, /* tp_descr_set */
12498 0, /* tp_dictoffset */
12499 0, /* tp_init */
12500 0, /* tp_alloc */
12501 unicode_new, /* tp_new */
12502 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503};
12504
12505/* Initialize the Unicode implementation */
12506
Thomas Wouters78890102000-07-22 19:25:51 +000012507void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012509 int i;
12510
Thomas Wouters477c8d52006-05-27 19:21:47 +000012511 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012513 0x000A, /* LINE FEED */
12514 0x000D, /* CARRIAGE RETURN */
12515 0x001C, /* FILE SEPARATOR */
12516 0x001D, /* GROUP SEPARATOR */
12517 0x001E, /* RECORD SEPARATOR */
12518 0x0085, /* NEXT LINE */
12519 0x2028, /* LINE SEPARATOR */
12520 0x2029, /* PARAGRAPH SEPARATOR */
12521 };
12522
Fred Drakee4315f52000-05-09 19:53:39 +000012523 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012525 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012527
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012528 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012530 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012532
12533 /* initialize the linebreak bloom filter */
12534 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012536 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012537
12538 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539}
12540
12541/* Finalize the Unicode implementation */
12542
/* This implementation does no work here and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12548
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549void
Thomas Wouters78890102000-07-22 19:25:51 +000012550_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012552 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012554 Py_XDECREF(unicode_empty);
12555 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012556
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012557 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 if (unicode_latin1[i]) {
12559 Py_DECREF(unicode_latin1[i]);
12560 unicode_latin1[i] = NULL;
12561 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012562 }
Christian Heimesa156e092008-02-16 07:38:31 +000012563 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012565
Walter Dörwald16807132007-05-25 13:52:07 +000012566void
12567PyUnicode_InternInPlace(PyObject **p)
12568{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012569 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12570 PyObject *t;
12571 if (s == NULL || !PyUnicode_Check(s))
12572 Py_FatalError(
12573 "PyUnicode_InternInPlace: unicode strings only please!");
12574 /* If it's a subclass, we don't really know what putting
12575 it in the interned dict might do. */
12576 if (!PyUnicode_CheckExact(s))
12577 return;
12578 if (PyUnicode_CHECK_INTERNED(s))
12579 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 if (PyUnicode_READY(s) == -1) {
12581 assert(0 && "ready fail in intern...");
12582 return;
12583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 if (interned == NULL) {
12585 interned = PyDict_New();
12586 if (interned == NULL) {
12587 PyErr_Clear(); /* Don't leave an exception */
12588 return;
12589 }
12590 }
12591 /* It might be that the GetItem call fails even
12592 though the key is present in the dictionary,
12593 namely when this happens during a stack overflow. */
12594 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012597
Benjamin Peterson29060642009-01-31 22:14:21 +000012598 if (t) {
12599 Py_INCREF(t);
12600 Py_DECREF(*p);
12601 *p = t;
12602 return;
12603 }
Walter Dörwald16807132007-05-25 13:52:07 +000012604
Benjamin Peterson14339b62009-01-31 16:36:08 +000012605 PyThreadState_GET()->recursion_critical = 1;
12606 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12607 PyErr_Clear();
12608 PyThreadState_GET()->recursion_critical = 0;
12609 return;
12610 }
12611 PyThreadState_GET()->recursion_critical = 0;
12612 /* The two references in interned are not counted by refcnt.
12613 The deallocator will take care of this */
12614 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012616}
12617
12618void
12619PyUnicode_InternImmortal(PyObject **p)
12620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12622
Benjamin Peterson14339b62009-01-31 16:36:08 +000012623 PyUnicode_InternInPlace(p);
12624 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012626 Py_INCREF(*p);
12627 }
Walter Dörwald16807132007-05-25 13:52:07 +000012628}
12629
12630PyObject *
12631PyUnicode_InternFromString(const char *cp)
12632{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012633 PyObject *s = PyUnicode_FromString(cp);
12634 if (s == NULL)
12635 return NULL;
12636 PyUnicode_InternInPlace(&s);
12637 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012638}
12639
Alexander Belopolsky40018472011-02-26 01:02:56 +000012640void
12641_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012642{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012643 PyObject *keys;
12644 PyUnicodeObject *s;
12645 Py_ssize_t i, n;
12646 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012647
Benjamin Peterson14339b62009-01-31 16:36:08 +000012648 if (interned == NULL || !PyDict_Check(interned))
12649 return;
12650 keys = PyDict_Keys(interned);
12651 if (keys == NULL || !PyList_Check(keys)) {
12652 PyErr_Clear();
12653 return;
12654 }
Walter Dörwald16807132007-05-25 13:52:07 +000012655
Benjamin Peterson14339b62009-01-31 16:36:08 +000012656 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12657 detector, interned unicode strings are not forcibly deallocated;
12658 rather, we give them their stolen references back, and then clear
12659 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012660
Benjamin Peterson14339b62009-01-31 16:36:08 +000012661 n = PyList_GET_SIZE(keys);
12662 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012664 for (i = 0; i < n; i++) {
12665 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 if (PyUnicode_READY(s) == -1)
12667 fprintf(stderr, "could not ready string\n");
12668 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012669 case SSTATE_NOT_INTERNED:
12670 /* XXX Shouldn't happen */
12671 break;
12672 case SSTATE_INTERNED_IMMORTAL:
12673 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 break;
12676 case SSTATE_INTERNED_MORTAL:
12677 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012679 break;
12680 default:
12681 Py_FatalError("Inconsistent interned string state.");
12682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 }
12685 fprintf(stderr, "total size of all interned strings: "
12686 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12687 "mortal/immortal\n", mortal_size, immortal_size);
12688 Py_DECREF(keys);
12689 PyDict_Clear(interned);
12690 Py_DECREF(interned);
12691 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012692}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012693
12694
12695/********************* Unicode Iterator **************************/
12696
12697typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012698 PyObject_HEAD
12699 Py_ssize_t it_index;
12700 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012701} unicodeiterobject;
12702
12703static void
12704unicodeiter_dealloc(unicodeiterobject *it)
12705{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 _PyObject_GC_UNTRACK(it);
12707 Py_XDECREF(it->it_seq);
12708 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012709}
12710
12711static int
12712unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12713{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012714 Py_VISIT(it->it_seq);
12715 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012716}
12717
12718static PyObject *
12719unicodeiter_next(unicodeiterobject *it)
12720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012721 PyUnicodeObject *seq;
12722 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723
Benjamin Peterson14339b62009-01-31 16:36:08 +000012724 assert(it != NULL);
12725 seq = it->it_seq;
12726 if (seq == NULL)
12727 return NULL;
12728 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12731 int kind = PyUnicode_KIND(seq);
12732 void *data = PyUnicode_DATA(seq);
12733 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12734 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012735 if (item != NULL)
12736 ++it->it_index;
12737 return item;
12738 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012739
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 Py_DECREF(seq);
12741 it->it_seq = NULL;
12742 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012743}
12744
12745static PyObject *
12746unicodeiter_len(unicodeiterobject *it)
12747{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 Py_ssize_t len = 0;
12749 if (it->it_seq)
12750 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12751 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012752}
12753
12754PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12755
12756static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012760};
12761
12762PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12764 "str_iterator", /* tp_name */
12765 sizeof(unicodeiterobject), /* tp_basicsize */
12766 0, /* tp_itemsize */
12767 /* methods */
12768 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12769 0, /* tp_print */
12770 0, /* tp_getattr */
12771 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012772 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012773 0, /* tp_repr */
12774 0, /* tp_as_number */
12775 0, /* tp_as_sequence */
12776 0, /* tp_as_mapping */
12777 0, /* tp_hash */
12778 0, /* tp_call */
12779 0, /* tp_str */
12780 PyObject_GenericGetAttr, /* tp_getattro */
12781 0, /* tp_setattro */
12782 0, /* tp_as_buffer */
12783 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12784 0, /* tp_doc */
12785 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12786 0, /* tp_clear */
12787 0, /* tp_richcompare */
12788 0, /* tp_weaklistoffset */
12789 PyObject_SelfIter, /* tp_iter */
12790 (iternextfunc)unicodeiter_next, /* tp_iternext */
12791 unicodeiter_methods, /* tp_methods */
12792 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012793};
12794
12795static PyObject *
12796unicode_iter(PyObject *seq)
12797{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012798 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012799
Benjamin Peterson14339b62009-01-31 16:36:08 +000012800 if (!PyUnicode_Check(seq)) {
12801 PyErr_BadInternalCall();
12802 return NULL;
12803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 if (PyUnicode_READY(seq) == -1)
12805 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12807 if (it == NULL)
12808 return NULL;
12809 it->it_index = 0;
12810 Py_INCREF(seq);
12811 it->it_seq = (PyUnicodeObject *)seq;
12812 _PyObject_GC_TRACK(it);
12813 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012814}
12815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816#define UNIOP(x) Py_UNICODE_##x
12817#define UNIOP_t Py_UNICODE
12818#include "uniops.h"
12819#undef UNIOP
12820#undef UNIOP_t
12821#define UNIOP(x) Py_UCS4_##x
12822#define UNIOP_t Py_UCS4
12823#include "uniops.h"
12824#undef UNIOP
12825#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012826
Victor Stinner71133ff2010-09-01 23:43:53 +000012827Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012828PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012829{
12830 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12831 Py_UNICODE *copy;
12832 Py_ssize_t size;
12833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 if (!PyUnicode_Check(unicode)) {
12835 PyErr_BadArgument();
12836 return NULL;
12837 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012838 /* Ensure we won't overflow the size. */
12839 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12840 PyErr_NoMemory();
12841 return NULL;
12842 }
12843 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12844 size *= sizeof(Py_UNICODE);
12845 copy = PyMem_Malloc(size);
12846 if (copy == NULL) {
12847 PyErr_NoMemory();
12848 return NULL;
12849 }
12850 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12851 return copy;
12852}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012853
Georg Brandl66c221e2010-10-14 07:04:07 +000012854/* A _string module, to export formatter_parser and formatter_field_name_split
12855 to the string.Formatter class implemented in Python. */
12856
12857static PyMethodDef _string_methods[] = {
12858 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12859 METH_O, PyDoc_STR("split the argument as a field name")},
12860 {"formatter_parser", (PyCFunction) formatter_parser,
12861 METH_O, PyDoc_STR("parse the argument as a format string")},
12862 {NULL, NULL}
12863};
12864
12865static struct PyModuleDef _string_module = {
12866 PyModuleDef_HEAD_INIT,
12867 "_string",
12868 PyDoc_STR("string helper module"),
12869 0,
12870 _string_methods,
12871 NULL,
12872 NULL,
12873 NULL,
12874 NULL
12875};
12876
12877PyMODINIT_FUNC
12878PyInit__string(void)
12879{
12880 return PyModule_Create(&_string_module);
12881}
12882
12883
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012884#ifdef __cplusplus
12885}
12886#endif