blob: 065b5e76d7e62e21064d1826a45e62541e9dc715 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Upper bound on the number of objects kept on the Unicode free list */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   Unicode memory owned by free-list objects whose size is below this
   limit is left allocated, which reduces malloc() overhead for small
   Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters, converting each
   one from from_type to to_type.  from_type and to_type must be valid
   type names.  begin and end are "from_type *" pointers delimiting the
   source characters (end itself is not read).  to is a "to_type *"
   pointer to the buffer that receives the converted characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)
107
/* UTF-8 buffer of op: compact ASCII strings use the memory that directly
   follows the PyASCIIObject header, every other string uses the utf8
   member of PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)

/* Length of the UTF-8 buffer: the string length for compact ASCII
   strings, the utf8_length member otherwise. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Direct (lvalue) access to individual fields of the string header. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Read accessors that first assert that op is a Unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the cached hash to -1 */
#define _PyUnicode_DIRTY(op) \
    do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
137 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000138 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when ASCII code point c is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Same for linebreaks: entry c is 1 when ASCII code point c is a line
   break and 0 otherwise.  The table is only ever read, so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
237/* stuff to implement simple "bloom filters" for Unicode characters.
238 to keep things simple, we use a single bitmask, using the least 5
239 bits from each unicode characters as the bit index. */
240
241/* the linebreak mask is set up by Unicode_Init below */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243#if LONG_BIT >= 128
244#define BLOOM_WIDTH 128
245#elif LONG_BIT >= 64
246#define BLOOM_WIDTH 64
247#elif LONG_BIT >= 32
248#define BLOOM_WIDTH 32
249#else
250#error "LONG_BIT is smaller than 32"
251#endif
252
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253#define BLOOM_MASK unsigned long
254
255static BLOOM_MASK bloom_linebreak;
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
258#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Benjamin Peterson29060642009-01-31 22:14:21 +0000260#define BLOOM_LINEBREAK(ch) \
261 ((ch) < 128U ? ascii_linebreak[(ch)] : \
262 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* Non-zero when chr occurs in str.  The bloom mask is consulted first, so
   a clear bit rejects chr without calling PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                      \
    (BLOOM(mask, chr) &&                                                  \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
286substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
287
288static PyObject *
289fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
375/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000376 Ux0000 terminated; some code (e.g. new_identifier)
377 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
382*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Plain functions wrapping the accessor macros, so that they can be
   called from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print layout details of a unicode object to stdout, then return its
   PyUnicode_DATA() pointer. */
void *
_PyUnicode_data(void *unicode)
{
    /* addresses directly following the two possible object headers */
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
#if SIZEOF_WCHAR_T == 2
/* Helper function converting a 16-bit wchar_t representation to UCS4.
   A high surrogate directly followed by a low surrogate is decoded into
   one code point; every other character, lone surrogates included, is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.  It always
   returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: merge both halves into one code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
629 assert(PyUnicode_Check(from));
630 assert(PyUnicode_Check(to));
631
632 if (PyUnicode_READY(from))
633 return -1;
634 if (PyUnicode_READY(to))
635 return -1;
636
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200637 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
639 PyErr_Format(PyExc_ValueError,
640 "Cannot write %zi characters at %zi "
641 "in a string of %zi characters",
642 how_many, to_start, PyUnicode_GET_LENGTH(to));
643 return -1;
644 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200645 if (how_many == 0)
646 return 0;
647
648 if (Py_REFCNT(to) != 1) {
649 PyErr_SetString(PyExc_ValueError,
650 "Cannot modify a string having more than 1 reference");
651 return -1;
652 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200653 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200656 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659
660 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200661 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200662 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(from_kind, from_start),
666 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200667 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200668 else if (from_kind == PyUnicode_1BYTE_KIND
669 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200670 {
671 _PyUnicode_CONVERT_BYTES(
672 Py_UCS1, Py_UCS2,
673 PyUnicode_1BYTE_DATA(from) + from_start,
674 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
675 PyUnicode_2BYTE_DATA(to) + to_start
676 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200677 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200678 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 && to_kind == PyUnicode_4BYTE_KIND)
680 {
681 _PyUnicode_CONVERT_BYTES(
682 Py_UCS1, Py_UCS4,
683 PyUnicode_1BYTE_DATA(from) + from_start,
684 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
685 PyUnicode_4BYTE_DATA(to) + to_start
686 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200687 }
688 else if (from_kind == PyUnicode_2BYTE_KIND
689 && to_kind == PyUnicode_4BYTE_KIND)
690 {
691 _PyUnicode_CONVERT_BYTES(
692 Py_UCS2, Py_UCS4,
693 PyUnicode_2BYTE_DATA(from) + from_start,
694 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
695 PyUnicode_4BYTE_DATA(to) + to_start
696 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200697 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200698 else {
699 int invalid_kinds;
700 if (from_kind > to_kind) {
701 /* slow path to check for character overflow */
702 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
703 Py_UCS4 ch, maxchar;
704 Py_ssize_t i;
705
706 maxchar = 0;
707 invalid_kinds = 0;
708 for (i=0; i < how_many; i++) {
709 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
710 if (ch > maxchar) {
711 maxchar = ch;
712 if (maxchar > to_maxchar) {
713 invalid_kinds = 1;
714 break;
715 }
716 }
717 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
718 }
719 }
720 else
721 invalid_kinds = 1;
722 if (invalid_kinds) {
723 PyErr_Format(PyExc_ValueError,
724 "Cannot copy UCS%u characters "
725 "into a string of UCS%u characters",
726 1 << (from_kind - 1),
727 1 << (to_kind -1));
728 return -1;
729 }
730 }
731 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732}
733
Victor Stinner17222162011-09-28 22:15:37 +0200734/* Find the maximum code point and count the number of surrogate pairs so a
735 correct string length can be computed before converting a string to UCS4.
736 This function counts single surrogates as a character and not as a pair.
737
738 Return 0 on success, or -1 on error. */
739static int
740find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
741 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742{
743 const wchar_t *iter;
744
745 if (num_surrogates == NULL || maxchar == NULL) {
746 PyErr_SetString(PyExc_SystemError,
747 "unexpected NULL arguments to "
748 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
749 return -1;
750 }
751
752 *num_surrogates = 0;
753 *maxchar = 0;
754
755 for (iter = begin; iter < end; ) {
756 if (*iter > *maxchar)
757 *maxchar = *iter;
758#if SIZEOF_WCHAR_T == 2
759 if (*iter >= 0xD800 && *iter <= 0xDBFF
760 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
761 {
762 Py_UCS4 surrogate_val;
763 surrogate_val = (((iter[0] & 0x3FF)<<10)
764 | (iter[1] & 0x3FF)) + 0x10000;
765 ++(*num_surrogates);
766 if (surrogate_val > *maxchar)
767 *maxchar = surrogate_val;
768 iter += 2;
769 }
770 else
771 iter++;
772#else
773 iter++;
774#endif
775 }
776 return 0;
777}
778
779#ifdef Py_DEBUG
780int unicode_ready_calls = 0;
781#endif
782
783int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200784_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200785{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200786 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 wchar_t *end;
788 Py_UCS4 maxchar = 0;
789 Py_ssize_t num_surrogates;
790#if SIZEOF_WCHAR_T == 2
791 Py_ssize_t length_wo_surrogates;
792#endif
793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200794 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200795 strings were created using _PyObject_New() and where no canonical
796 representation (the str field) has been set yet aka strings
797 which are not yet ready. */
798 assert(PyUnicode_Check(obj));
799 assert(!PyUnicode_IS_READY(obj));
800 assert(!PyUnicode_IS_COMPACT(obj));
801 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200803 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200804 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200805 /* Actually, it should neither be interned nor be anything else: */
806 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807
808#ifdef Py_DEBUG
809 ++unicode_ready_calls;
810#endif
811
812 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200813 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200814 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200816
817 if (maxchar < 256) {
818 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
819 if (!unicode->data.any) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200823 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200824 _PyUnicode_WSTR(unicode), end,
825 PyUnicode_1BYTE_DATA(unicode));
826 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
827 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
828 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
829 if (maxchar < 128) {
830 unicode->_base.utf8 = unicode->data.any;
831 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
832 }
833 else {
834 unicode->_base.utf8 = NULL;
835 unicode->_base.utf8_length = 0;
836 }
837 PyObject_FREE(_PyUnicode_WSTR(unicode));
838 _PyUnicode_WSTR(unicode) = NULL;
839 _PyUnicode_WSTR_LENGTH(unicode) = 0;
840 }
841 /* In this case we might have to convert down from 4-byte native
842 wchar_t to 2-byte unicode. */
843 else if (maxchar < 65536) {
844 assert(num_surrogates == 0 &&
845 "FindMaxCharAndNumSurrogatePairs() messed up");
846
Victor Stinner506f5922011-09-28 22:34:18 +0200847#if SIZEOF_WCHAR_T == 2
848 /* We can share representations and are done. */
849 unicode->data.any = _PyUnicode_WSTR(unicode);
850 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
851 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
852 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
853 unicode->_base.utf8 = NULL;
854 unicode->_base.utf8_length = 0;
855#else
856 /* sizeof(wchar_t) == 4 */
857 unicode->data.any = PyObject_MALLOC(
858 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
859 if (!unicode->data.any) {
860 PyErr_NoMemory();
861 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200862 }
Victor Stinner506f5922011-09-28 22:34:18 +0200863 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
864 _PyUnicode_WSTR(unicode), end,
865 PyUnicode_2BYTE_DATA(unicode));
866 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
867 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
868 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
869 unicode->_base.utf8 = NULL;
870 unicode->_base.utf8_length = 0;
871 PyObject_FREE(_PyUnicode_WSTR(unicode));
872 _PyUnicode_WSTR(unicode) = NULL;
873 _PyUnicode_WSTR_LENGTH(unicode) = 0;
874#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 }
876 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
877 else {
878#if SIZEOF_WCHAR_T == 2
879 /* in case the native representation is 2-bytes, we need to allocate a
880 new normalized 4-byte version. */
881 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
882 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
883 if (!unicode->data.any) {
884 PyErr_NoMemory();
885 return -1;
886 }
887 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
889 unicode->_base.utf8 = NULL;
890 unicode->_base.utf8_length = 0;
891 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
892 unicode) < 0) {
893 assert(0 && "ConvertWideCharToUCS4 failed");
894 return -1;
895 }
896 PyObject_FREE(_PyUnicode_WSTR(unicode));
897 _PyUnicode_WSTR(unicode) = NULL;
898 _PyUnicode_WSTR_LENGTH(unicode) = 0;
899#else
900 assert(num_surrogates == 0);
901
902 unicode->data.any = _PyUnicode_WSTR(unicode);
903 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
904 unicode->_base.utf8 = NULL;
905 unicode->_base.utf8_length = 0;
906 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
907#endif
908 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
909 }
910 _PyUnicode_STATE(unicode).ready = 1;
911 return 0;
912}
913
Alexander Belopolsky40018472011-02-26 01:02:56 +0000914static void
915unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916{
Walter Dörwald16807132007-05-25 13:52:07 +0000917 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 case SSTATE_NOT_INTERNED:
919 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000920
Benjamin Peterson29060642009-01-31 22:14:21 +0000921 case SSTATE_INTERNED_MORTAL:
922 /* revive dead object temporarily for DelItem */
923 Py_REFCNT(unicode) = 3;
924 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
925 Py_FatalError(
926 "deletion of interned string failed");
927 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000928
Benjamin Peterson29060642009-01-31 22:14:21 +0000929 case SSTATE_INTERNED_IMMORTAL:
930 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000931
Benjamin Peterson29060642009-01-31 22:14:21 +0000932 default:
933 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000934 }
935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 if (_PyUnicode_WSTR(unicode) &&
937 (!PyUnicode_IS_READY(unicode) ||
938 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
939 PyObject_DEL(_PyUnicode_WSTR(unicode));
940 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
941 PyObject_DEL(unicode->_base.utf8);
942
943 if (PyUnicode_IS_COMPACT(unicode)) {
944 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000945 }
946 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947 if (unicode->data.any)
948 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000949 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 }
951}
952
Alexander Belopolsky40018472011-02-26 01:02:56 +0000953static int
954_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000955{
956 register PyUnicodeObject *v;
957
958 /* Argument checks */
959 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000960 PyErr_BadInternalCall();
961 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000962 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000963 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
965 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000966 PyErr_BadInternalCall();
967 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000968 }
969
970 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 possible since these are being shared.
972 The same goes for new-representation unicode objects or objects which
973 have already been readied.
974 For these, we simply return a fresh copy with the same Unicode content.
975 */
976 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
977 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
978 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000979 PyUnicodeObject *w = _PyUnicode_New(length);
980 if (w == NULL)
981 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
983 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000984 Py_DECREF(*unicode);
985 *unicode = w;
986 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000987 }
988
989 /* Note that we don't have to modify *unicode for unshared Unicode
990 objects, since we can modify them in-place. */
991 return unicode_resize(v, length);
992}
993
Alexander Belopolsky40018472011-02-26 01:02:56 +0000994int
995PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000996{
997 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
998}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000static PyObject*
1001get_latin1_char(unsigned char ch)
1002{
1003 PyUnicodeObject *unicode = unicode_latin1[ch];
1004 if (!unicode) {
1005 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1006 if (!unicode)
1007 return NULL;
1008 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1009 unicode_latin1[ch] = unicode;
1010 }
1011 Py_INCREF(unicode);
1012 return (PyObject *)unicode;
1013}
1014
Alexander Belopolsky40018472011-02-26 01:02:56 +00001015PyObject *
1016PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017{
1018 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 Py_UCS4 maxchar = 0;
1020 Py_ssize_t num_surrogates;
1021
1022 if (u == NULL)
1023 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001025 /* If the Unicode data is known at construction time, we can apply
1026 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028 /* Optimization for empty strings */
1029 if (size == 0 && unicode_empty != NULL) {
1030 Py_INCREF(unicode_empty);
1031 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001032 }
Tim Petersced69f82003-09-16 20:30:58 +00001033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 /* Single character Unicode objects in the Latin-1 range are
1035 shared when using this constructor */
1036 if (size == 1 && *u < 256)
1037 return get_latin1_char((unsigned char)*u);
1038
1039 /* If not empty and not single character, copy the Unicode data
1040 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001041 if (find_maxchar_surrogates(u, u + size,
1042 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 return NULL;
1044
1045 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1046 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (!unicode)
1048 return NULL;
1049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 switch (PyUnicode_KIND(unicode)) {
1051 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001052 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1054 break;
1055 case PyUnicode_2BYTE_KIND:
1056#if Py_UNICODE_SIZE == 2
1057 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1058#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001059 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1061#endif
1062 break;
1063 case PyUnicode_4BYTE_KIND:
1064#if SIZEOF_WCHAR_T == 2
1065 /* This is the only case which has to process surrogates, thus
1066 a simple copy loop is not enough and we need a function. */
1067 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1068 Py_DECREF(unicode);
1069 return NULL;
1070 }
1071#else
1072 assert(num_surrogates == 0);
1073 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1074#endif
1075 break;
1076 default:
1077 assert(0 && "Impossible state");
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079
1080 return (PyObject *)unicode;
1081}
1082
Alexander Belopolsky40018472011-02-26 01:02:56 +00001083PyObject *
1084PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001085{
1086 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001087
Benjamin Peterson14339b62009-01-31 16:36:08 +00001088 if (size < 0) {
1089 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001090 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001091 return NULL;
1092 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001093
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001094 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001095 some optimizations which share commonly used objects.
1096 Also, this means the input must be UTF-8, so fall back to the
1097 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001098 if (u != NULL) {
1099
Benjamin Peterson29060642009-01-31 22:14:21 +00001100 /* Optimization for empty strings */
1101 if (size == 0 && unicode_empty != NULL) {
1102 Py_INCREF(unicode_empty);
1103 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001104 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001105
1106 /* Single characters are shared when using this constructor.
1107 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 if (size == 1 && Py_CHARMASK(*u) < 128)
1109 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001110
1111 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001112 }
1113
Walter Dörwald55507312007-05-18 13:12:10 +00001114 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001115 if (!unicode)
1116 return NULL;
1117
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001118 return (PyObject *)unicode;
1119}
1120
Alexander Belopolsky40018472011-02-26 01:02:56 +00001121PyObject *
1122PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001123{
1124 size_t size = strlen(u);
1125 if (size > PY_SSIZE_T_MAX) {
1126 PyErr_SetString(PyExc_OverflowError, "input too long");
1127 return NULL;
1128 }
1129
1130 return PyUnicode_FromStringAndSize(u, size);
1131}
1132
Victor Stinnere57b1c02011-09-28 22:20:48 +02001133static PyObject*
1134_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 PyObject *res;
1137 unsigned char max = 127;
1138 Py_ssize_t i;
1139 for (i = 0; i < size; i++) {
1140 if (u[i] & 0x80) {
1141 max = 255;
1142 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001143 }
1144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 res = PyUnicode_New(size, max);
1146 if (!res)
1147 return NULL;
1148 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1149 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001150}
1151
Victor Stinnere57b1c02011-09-28 22:20:48 +02001152static PyObject*
1153_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 PyObject *res;
1156 Py_UCS2 max = 0;
1157 Py_ssize_t i;
1158 for (i = 0; i < size; i++)
1159 if (u[i] > max)
1160 max = u[i];
1161 res = PyUnicode_New(size, max);
1162 if (!res)
1163 return NULL;
1164 if (max >= 256)
1165 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1166 else
1167 for (i = 0; i < size; i++)
1168 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1169 return res;
1170}
1171
Victor Stinnere57b1c02011-09-28 22:20:48 +02001172static PyObject*
1173_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174{
1175 PyObject *res;
1176 Py_UCS4 max = 0;
1177 Py_ssize_t i;
1178 for (i = 0; i < size; i++)
1179 if (u[i] > max)
1180 max = u[i];
1181 res = PyUnicode_New(size, max);
1182 if (!res)
1183 return NULL;
1184 if (max >= 0x10000)
1185 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1186 else {
1187 int kind = PyUnicode_KIND(res);
1188 void *data = PyUnicode_DATA(res);
1189 for (i = 0; i < size; i++)
1190 PyUnicode_WRITE(kind, data, i, u[i]);
1191 }
1192 return res;
1193}
1194
1195PyObject*
1196PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1197{
1198 switch(kind) {
1199 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001200 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001204 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 }
1206 assert(0);
1207 return NULL;
1208}
1209
1210
1211/* Widen Unicode objects to larger buffers.
1212 Return NULL if the string is too wide already. */
1213
1214void*
1215_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1216{
1217 Py_ssize_t i;
1218 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1219 void *d = PyUnicode_DATA(s);
1220 unsigned int skind = PyUnicode_KIND(s);
1221 if (PyUnicode_KIND(s) >= kind) {
1222 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1223 return NULL;
1224 }
1225 switch(kind) {
1226 case PyUnicode_2BYTE_KIND: {
1227 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1228 if (!result) {
1229 PyErr_NoMemory();
1230 return 0;
1231 }
1232 for (i = 0; i < len; i++)
1233 result[i] = ((Py_UCS1*)d)[i];
1234 return result;
1235 }
1236 case PyUnicode_4BYTE_KIND: {
1237 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1238 if (!result) {
1239 PyErr_NoMemory();
1240 return 0;
1241 }
1242 for (i = 0; i < len; i++)
1243 result[i] = PyUnicode_READ(skind, d, i);
1244 return result;
1245 }
1246 }
1247 Py_FatalError("invalid kind");
1248 return NULL;
1249}
1250
1251static Py_UCS4*
1252as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1253 int copy_null)
1254{
1255 int kind;
1256 void *data;
1257 Py_ssize_t len, targetlen;
1258 if (PyUnicode_READY(string) == -1)
1259 return NULL;
1260 kind = PyUnicode_KIND(string);
1261 data = PyUnicode_DATA(string);
1262 len = PyUnicode_GET_LENGTH(string);
1263 targetlen = len;
1264 if (copy_null)
1265 targetlen++;
1266 if (!target) {
1267 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1268 PyErr_NoMemory();
1269 return NULL;
1270 }
1271 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1272 if (!target) {
1273 PyErr_NoMemory();
1274 return NULL;
1275 }
1276 }
1277 else {
1278 if (targetsize < targetlen) {
1279 PyErr_Format(PyExc_SystemError,
1280 "string is longer than the buffer");
1281 if (copy_null && 0 < targetsize)
1282 target[0] = 0;
1283 return NULL;
1284 }
1285 }
1286 if (kind != PyUnicode_4BYTE_KIND) {
1287 Py_ssize_t i;
1288 for (i = 0; i < len; i++)
1289 target[i] = PyUnicode_READ(kind, data, i);
1290 }
1291 else
1292 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1293 if (copy_null)
1294 target[len] = 0;
1295 return target;
1296}
1297
1298Py_UCS4*
1299PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1300 int copy_null)
1301{
1302 if (target == NULL || targetsize < 1) {
1303 PyErr_BadInternalCall();
1304 return NULL;
1305 }
1306 return as_ucs4(string, target, targetsize, copy_null);
1307}
1308
1309Py_UCS4*
1310PyUnicode_AsUCS4Copy(PyObject *string)
1311{
1312 return as_ucs4(string, NULL, 0, 1);
1313}
1314
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w` of `size`
   characters.  A `size` of -1 means that the length is taken with
   wcslen().  A NULL `w` yields a new empty string when `size` is 0 and
   PyErr_BadInternalCall() otherwise. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001335
Walter Dörwald346737f2007-05-31 10:44:43 +00001336static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001337makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1338 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001339{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001340 *fmt++ = '%';
1341 if (width) {
1342 if (zeropad)
1343 *fmt++ = '0';
1344 fmt += sprintf(fmt, "%d", width);
1345 }
1346 if (precision)
1347 fmt += sprintf(fmt, ".%d", precision);
1348 if (longflag)
1349 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001350 else if (longlongflag) {
1351 /* longlongflag should only ever be nonzero on machines with
1352 HAVE_LONG_LONG defined */
1353#ifdef HAVE_LONG_LONG
1354 char *f = PY_FORMAT_LONG_LONG;
1355 while (*f)
1356 *fmt++ = *f++;
1357#else
1358 /* we shouldn't ever get here */
1359 assert(0);
1360 *fmt++ = 'l';
1361#endif
1362 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001363 else if (size_tflag) {
1364 char *f = PY_FORMAT_SIZE_T;
1365 while (*f)
1366 *fmt++ = *f++;
1367 }
1368 *fmt++ = c;
1369 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001370}
1371
/* helper for PyUnicode_FromFormatV()

   Parse the flags of the conversion specification that starts at `f`
   (which points at the '%'): an optional width, an optional ".precision"
   and an optional length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z") that is directly followed by 'd', 'u' or 'i'.

   Each result is stored through its output pointer unless that pointer
   is NULL.  Return a pointer to the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1436
/* Maximum number of characters needed to format a value with %ld:
   21 characters are enough for a 64-bit integer in decimal plus an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters needed to format a value with %lld:
   at most ceil(log10(256)*SIZEOF_LONG_LONG) digits plus 1 for the sign,
   using 53/22 as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG*53-1) / 22) + 2)
1444
Walter Dörwaldd2034312007-05-18 16:29:38 +00001445PyObject *
1446PyUnicode_FromFormatV(const char *format, va_list vargs)
1447{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001448 va_list count;
1449 Py_ssize_t callcount = 0;
1450 PyObject **callresults = NULL;
1451 PyObject **callresult = NULL;
1452 Py_ssize_t n = 0;
1453 int width = 0;
1454 int precision = 0;
1455 int zeropad;
1456 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001458 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001459 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1461 Py_UCS4 argmaxchar;
1462 Py_ssize_t numbersize = 0;
1463 char *numberresults = NULL;
1464 char *numberresult = NULL;
1465 Py_ssize_t i;
1466 int kind;
1467 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001468
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001469 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001470 /* step 1: count the number of %S/%R/%A/%s format specifications
1471 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1472 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 * result in an array)
1474 * also esimate a upper bound for all the number formats in the string,
1475 * numbers will be formated in step 3 and be keept in a '\0'-separated
1476 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001477 for (f = format; *f; f++) {
1478 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001479 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1481 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1482 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1483 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001486#ifdef HAVE_LONG_LONG
1487 if (longlongflag) {
1488 if (width < MAX_LONG_LONG_CHARS)
1489 width = MAX_LONG_LONG_CHARS;
1490 }
1491 else
1492#endif
1493 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1494 including sign. Decimal takes the most space. This
1495 isn't enough for octal. If a width is specified we
1496 need more (which we allocate later). */
1497 if (width < MAX_LONG_CHARS)
1498 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499
1500 /* account for the size + '\0' to separate numbers
1501 inside of the numberresults buffer */
1502 numbersize += (width + 1);
1503 }
1504 }
1505 else if ((unsigned char)*f > 127) {
1506 PyErr_Format(PyExc_ValueError,
1507 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1508 "string, got a non-ASCII byte: 0x%02x",
1509 (unsigned char)*f);
1510 return NULL;
1511 }
1512 }
1513 /* step 2: allocate memory for the results of
1514 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1515 if (callcount) {
1516 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1517 if (!callresults) {
1518 PyErr_NoMemory();
1519 return NULL;
1520 }
1521 callresult = callresults;
1522 }
1523 /* step 2.5: allocate memory for the results of formating numbers */
1524 if (numbersize) {
1525 numberresults = PyObject_Malloc(numbersize);
1526 if (!numberresults) {
1527 PyErr_NoMemory();
1528 goto fail;
1529 }
1530 numberresult = numberresults;
1531 }
1532
1533 /* step 3: format numbers and figure out how large a buffer we need */
1534 for (f = format; *f; f++) {
1535 if (*f == '%') {
1536 const char* p;
1537 int longflag;
1538 int longlongflag;
1539 int size_tflag;
1540 int numprinted;
1541
1542 p = f;
1543 zeropad = (f[1] == '0');
1544 f = parse_format_flags(f, &width, &precision,
1545 &longflag, &longlongflag, &size_tflag);
1546 switch (*f) {
1547 case 'c':
1548 {
1549 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001550 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 n++;
1552 break;
1553 }
1554 case '%':
1555 n++;
1556 break;
1557 case 'i':
1558 case 'd':
1559 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1560 width, precision, *f);
1561 if (longflag)
1562 numprinted = sprintf(numberresult, fmt,
1563 va_arg(count, long));
1564#ifdef HAVE_LONG_LONG
1565 else if (longlongflag)
1566 numprinted = sprintf(numberresult, fmt,
1567 va_arg(count, PY_LONG_LONG));
1568#endif
1569 else if (size_tflag)
1570 numprinted = sprintf(numberresult, fmt,
1571 va_arg(count, Py_ssize_t));
1572 else
1573 numprinted = sprintf(numberresult, fmt,
1574 va_arg(count, int));
1575 n += numprinted;
1576 /* advance by +1 to skip over the '\0' */
1577 numberresult += (numprinted + 1);
1578 assert(*(numberresult - 1) == '\0');
1579 assert(*(numberresult - 2) != '\0');
1580 assert(numprinted >= 0);
1581 assert(numberresult <= numberresults + numbersize);
1582 break;
1583 case 'u':
1584 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1585 width, precision, 'u');
1586 if (longflag)
1587 numprinted = sprintf(numberresult, fmt,
1588 va_arg(count, unsigned long));
1589#ifdef HAVE_LONG_LONG
1590 else if (longlongflag)
1591 numprinted = sprintf(numberresult, fmt,
1592 va_arg(count, unsigned PY_LONG_LONG));
1593#endif
1594 else if (size_tflag)
1595 numprinted = sprintf(numberresult, fmt,
1596 va_arg(count, size_t));
1597 else
1598 numprinted = sprintf(numberresult, fmt,
1599 va_arg(count, unsigned int));
1600 n += numprinted;
1601 numberresult += (numprinted + 1);
1602 assert(*(numberresult - 1) == '\0');
1603 assert(*(numberresult - 2) != '\0');
1604 assert(numprinted >= 0);
1605 assert(numberresult <= numberresults + numbersize);
1606 break;
1607 case 'x':
1608 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1609 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1610 n += numprinted;
1611 numberresult += (numprinted + 1);
1612 assert(*(numberresult - 1) == '\0');
1613 assert(*(numberresult - 2) != '\0');
1614 assert(numprinted >= 0);
1615 assert(numberresult <= numberresults + numbersize);
1616 break;
1617 case 'p':
1618 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1619 /* %p is ill-defined: ensure leading 0x. */
1620 if (numberresult[1] == 'X')
1621 numberresult[1] = 'x';
1622 else if (numberresult[1] != 'x') {
1623 memmove(numberresult + 2, numberresult,
1624 strlen(numberresult) + 1);
1625 numberresult[0] = '0';
1626 numberresult[1] = 'x';
1627 numprinted += 2;
1628 }
1629 n += numprinted;
1630 numberresult += (numprinted + 1);
1631 assert(*(numberresult - 1) == '\0');
1632 assert(*(numberresult - 2) != '\0');
1633 assert(numprinted >= 0);
1634 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001635 break;
1636 case 's':
1637 {
1638 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001639 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001640 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1641 if (!str)
1642 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 /* since PyUnicode_DecodeUTF8 returns already flexible
1644 unicode objects, there is no need to call ready on them */
1645 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001646 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001648 /* Remember the str and switch to the next slot */
1649 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 break;
1651 }
1652 case 'U':
1653 {
1654 PyObject *obj = va_arg(count, PyObject *);
1655 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 if (PyUnicode_READY(obj) == -1)
1657 goto fail;
1658 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001659 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001661 break;
1662 }
1663 case 'V':
1664 {
1665 PyObject *obj = va_arg(count, PyObject *);
1666 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001667 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001668 assert(obj || str);
1669 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001670 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 if (PyUnicode_READY(obj) == -1)
1672 goto fail;
1673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001674 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001676 *callresult++ = NULL;
1677 }
1678 else {
1679 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1680 if (!str_obj)
1681 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001683 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001685 *callresult++ = str_obj;
1686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001687 break;
1688 }
1689 case 'S':
1690 {
1691 PyObject *obj = va_arg(count, PyObject *);
1692 PyObject *str;
1693 assert(obj);
1694 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001696 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001698 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001700 /* Remember the str and switch to the next slot */
1701 *callresult++ = str;
1702 break;
1703 }
1704 case 'R':
1705 {
1706 PyObject *obj = va_arg(count, PyObject *);
1707 PyObject *repr;
1708 assert(obj);
1709 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001711 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001713 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 /* Remember the repr and switch to the next slot */
1716 *callresult++ = repr;
1717 break;
1718 }
1719 case 'A':
1720 {
1721 PyObject *obj = va_arg(count, PyObject *);
1722 PyObject *ascii;
1723 assert(obj);
1724 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001726 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001728 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001730 /* Remember the repr and switch to the next slot */
1731 *callresult++ = ascii;
1732 break;
1733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 default:
1735 /* if we stumble upon an unknown
1736 formatting code, copy the rest of
1737 the format string to the output
1738 string. (we cannot just skip the
1739 code, since there's no way to know
1740 what's in the argument list) */
1741 n += strlen(p);
1742 goto expand;
1743 }
1744 } else
1745 n++;
1746 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001747 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001748 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 we don't have to resize the string.
1751 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001753 if (!string)
1754 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 kind = PyUnicode_KIND(string);
1756 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001757 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001762 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001763
1764 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1766 /* i == length is allowed here: the last argument could be an empty
1767 string, in which case i already points to the end; see the assert
1768 at the end of the loop */
1769 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001770
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 switch (*f) {
1772 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001773 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 const int ordinal = va_arg(vargs, int);
1775 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001777 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001778 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001780 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 case 'p':
1783 /* unused, since we already have the result */
1784 if (*f == 'p')
1785 (void) va_arg(vargs, void *);
1786 else
1787 (void) va_arg(vargs, int);
1788 /* extract the result from numberresults and append. */
1789 for (; *numberresult; ++i, ++numberresult)
1790 PyUnicode_WRITE(kind, data, i, *numberresult);
1791 /* skip over the separating '\0' */
1792 assert(*numberresult == '\0');
1793 numberresult++;
1794 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 break;
1796 case 's':
1797 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001798 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001800 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 size = PyUnicode_GET_LENGTH(*callresult);
1802 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001803 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1804 *callresult, 0,
1805 size) < 0)
1806 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001808 /* We're done with the unicode()/repr() => forget it */
1809 Py_DECREF(*callresult);
1810 /* switch to next unicode()/repr() result */
1811 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001812 break;
1813 }
1814 case 'U':
1815 {
1816 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 Py_ssize_t size;
1818 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1819 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001820 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1821 obj, 0,
1822 size) < 0)
1823 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001825 break;
1826 }
1827 case 'V':
1828 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001830 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001831 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 size = PyUnicode_GET_LENGTH(obj);
1834 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001835 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1836 obj, 0,
1837 size) < 0)
1838 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001840 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 size = PyUnicode_GET_LENGTH(*callresult);
1842 assert(PyUnicode_KIND(*callresult) <=
1843 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001844 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1845 *callresult,
1846 0, size) < 0)
1847 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001849 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001850 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001851 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001852 break;
1853 }
1854 case 'S':
1855 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001856 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001857 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 /* unused, since we already have the result */
1859 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001861 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1862 *callresult, 0,
1863 PyUnicode_GET_LENGTH(*callresult)) < 0)
1864 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001865 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 /* We're done with the unicode()/repr() => forget it */
1867 Py_DECREF(*callresult);
1868 /* switch to next unicode()/repr() result */
1869 ++callresult;
1870 break;
1871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 break;
1875 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 for (; *p; ++p, ++i)
1877 PyUnicode_WRITE(kind, data, i, *p);
1878 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001879 goto end;
1880 }
Victor Stinner1205f272010-09-11 00:54:47 +00001881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 else {
1883 assert(i < PyUnicode_GET_LENGTH(string));
1884 PyUnicode_WRITE(kind, data, i++, *f);
1885 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001886 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001888
Benjamin Peterson29060642009-01-31 22:14:21 +00001889 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 if (callresults)
1891 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (numberresults)
1893 PyObject_Free(numberresults);
1894 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001895 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001896 if (callresults) {
1897 PyObject **callresult2 = callresults;
1898 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001899 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001900 ++callresult2;
1901 }
1902 PyObject_Free(callresults);
1903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 if (numberresults)
1905 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001907}
1908
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909PyObject *
1910PyUnicode_FromFormat(const char *format, ...)
1911{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 PyObject* ret;
1913 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001914
1915#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001916 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001917#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001919#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 ret = PyUnicode_FromFormatV(format, vargs);
1921 va_end(vargs);
1922 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001923}
1924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925#ifdef HAVE_WCHAR_H
1926
Victor Stinner5593d8a2010-10-02 11:11:27 +00001927/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1928 convert a Unicode object to a wide character string.
1929
Victor Stinnerd88d9832011-09-06 02:00:05 +02001930 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001931 character) required to convert the unicode object. Ignore size argument.
1932
Victor Stinnerd88d9832011-09-06 02:00:05 +02001933 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001934 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001935 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001936static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001937unicode_aswidechar(PyUnicodeObject *unicode,
1938 wchar_t *w,
1939 Py_ssize_t size)
1940{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001941 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 const wchar_t *wstr;
1943
1944 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1945 if (wstr == NULL)
1946 return -1;
1947
Victor Stinner5593d8a2010-10-02 11:11:27 +00001948 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001949 if (size > res)
1950 size = res + 1;
1951 else
1952 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001954 return res;
1955 }
1956 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001958}
1959
1960Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001961PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001962 wchar_t *w,
1963 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964{
1965 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001966 PyErr_BadInternalCall();
1967 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001969 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970}
1971
Victor Stinner137c34c2010-09-29 10:25:54 +00001972wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001973PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001974 Py_ssize_t *size)
1975{
1976 wchar_t* buffer;
1977 Py_ssize_t buflen;
1978
1979 if (unicode == NULL) {
1980 PyErr_BadInternalCall();
1981 return NULL;
1982 }
1983
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001984 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (buflen == -1)
1986 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001987 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001988 PyErr_NoMemory();
1989 return NULL;
1990 }
1991
Victor Stinner137c34c2010-09-29 10:25:54 +00001992 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1993 if (buffer == NULL) {
1994 PyErr_NoMemory();
1995 return NULL;
1996 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001997 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (buflen == -1)
1999 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002000 if (size != NULL)
2001 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002002 return buffer;
2003}
2004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006
Alexander Belopolsky40018472011-02-26 01:02:56 +00002007PyObject *
2008PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002011 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002012 PyErr_SetString(PyExc_ValueError,
2013 "chr() arg not in range(0x110000)");
2014 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002015 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 if (ordinal < 256)
2018 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 v = PyUnicode_New(1, ordinal);
2021 if (v == NULL)
2022 return NULL;
2023 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2024 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002025}
2026
Alexander Belopolsky40018472011-02-26 01:02:56 +00002027PyObject *
2028PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002030 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002032 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 Py_INCREF(obj);
2034 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002035 }
2036 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 /* For a Unicode subtype that's not a Unicode object,
2038 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 if (PyUnicode_READY(obj) == -1)
2040 return NULL;
2041 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002042 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002043 PyErr_Format(PyExc_TypeError,
2044 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002045 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002046 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002051 const char *encoding,
2052 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002053{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002054 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002055 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002056
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002058 PyErr_BadInternalCall();
2059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002062 /* Decoding bytes objects is the most common case and should be fast */
2063 if (PyBytes_Check(obj)) {
2064 if (PyBytes_GET_SIZE(obj) == 0) {
2065 Py_INCREF(unicode_empty);
2066 v = (PyObject *) unicode_empty;
2067 }
2068 else {
2069 v = PyUnicode_Decode(
2070 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2071 encoding, errors);
2072 }
2073 return v;
2074 }
2075
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002076 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002077 PyErr_SetString(PyExc_TypeError,
2078 "decoding str is not supported");
2079 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002081
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002082 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2083 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2084 PyErr_Format(PyExc_TypeError,
2085 "coercing to str: need bytes, bytearray "
2086 "or buffer-like object, %.80s found",
2087 Py_TYPE(obj)->tp_name);
2088 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002089 }
Tim Petersced69f82003-09-16 20:30:58 +00002090
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002091 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002092 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002093 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
Tim Petersced69f82003-09-16 20:30:58 +00002095 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002096 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002097
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002098 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002099 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100}
2101
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   `lower_len` is the capacity of `lower`, terminator included.

   Return 1 on success (`lower` is null-terminated), 0 if the encoding name
   does not fit in lower_len-1 characters (`lower` is then left without a
   terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2134
Alexander Belopolsky40018472011-02-26 01:02:56 +00002135PyObject *
2136PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002137 Py_ssize_t size,
2138 const char *encoding,
2139 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002140{
2141 PyObject *buffer = NULL, *unicode;
2142 Py_buffer info;
2143 char lower[11]; /* Enough for any encoding shortcut */
2144
2145 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002146 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002147
2148 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002149 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002150 if ((strcmp(lower, "utf-8") == 0) ||
2151 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002152 return PyUnicode_DecodeUTF8(s, size, errors);
2153 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002154 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002155 (strcmp(lower, "iso-8859-1") == 0))
2156 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002157#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002158 else if (strcmp(lower, "mbcs") == 0)
2159 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002160#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002161 else if (strcmp(lower, "ascii") == 0)
2162 return PyUnicode_DecodeASCII(s, size, errors);
2163 else if (strcmp(lower, "utf-16") == 0)
2164 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2165 else if (strcmp(lower, "utf-32") == 0)
2166 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168
2169 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002170 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002171 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002172 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002173 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 if (buffer == NULL)
2175 goto onError;
2176 unicode = PyCodec_Decode(buffer, encoding, errors);
2177 if (unicode == NULL)
2178 goto onError;
2179 if (!PyUnicode_Check(unicode)) {
2180 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002181 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002182 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 Py_DECREF(unicode);
2184 goto onError;
2185 }
2186 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 if (PyUnicode_READY(unicode)) {
2188 Py_DECREF(unicode);
2189 return NULL;
2190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Benjamin Peterson29060642009-01-31 22:14:21 +00002193 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 Py_XDECREF(buffer);
2195 return NULL;
2196}
2197
Alexander Belopolsky40018472011-02-26 01:02:56 +00002198PyObject *
2199PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002200 const char *encoding,
2201 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002202{
2203 PyObject *v;
2204
2205 if (!PyUnicode_Check(unicode)) {
2206 PyErr_BadArgument();
2207 goto onError;
2208 }
2209
2210 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002211 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002212
2213 /* Decode via the codec registry */
2214 v = PyCodec_Decode(unicode, encoding, errors);
2215 if (v == NULL)
2216 goto onError;
2217 return v;
2218
Benjamin Peterson29060642009-01-31 22:14:21 +00002219 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002220 return NULL;
2221}
2222
Alexander Belopolsky40018472011-02-26 01:02:56 +00002223PyObject *
2224PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002225 const char *encoding,
2226 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002227{
2228 PyObject *v;
2229
2230 if (!PyUnicode_Check(unicode)) {
2231 PyErr_BadArgument();
2232 goto onError;
2233 }
2234
2235 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002237
2238 /* Decode via the codec registry */
2239 v = PyCodec_Decode(unicode, encoding, errors);
2240 if (v == NULL)
2241 goto onError;
2242 if (!PyUnicode_Check(v)) {
2243 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002244 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002245 Py_TYPE(v)->tp_name);
2246 Py_DECREF(v);
2247 goto onError;
2248 }
2249 return v;
2250
Benjamin Peterson29060642009-01-31 22:14:21 +00002251 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002252 return NULL;
2253}
2254
Alexander Belopolsky40018472011-02-26 01:02:56 +00002255PyObject *
2256PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002257 Py_ssize_t size,
2258 const char *encoding,
2259 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260{
2261 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002262
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 unicode = PyUnicode_FromUnicode(s, size);
2264 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2267 Py_DECREF(unicode);
2268 return v;
2269}
2270
Alexander Belopolsky40018472011-02-26 01:02:56 +00002271PyObject *
2272PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002273 const char *encoding,
2274 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002275{
2276 PyObject *v;
2277
2278 if (!PyUnicode_Check(unicode)) {
2279 PyErr_BadArgument();
2280 goto onError;
2281 }
2282
2283 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002284 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002285
2286 /* Encode via the codec registry */
2287 v = PyCodec_Encode(unicode, encoding, errors);
2288 if (v == NULL)
2289 goto onError;
2290 return v;
2291
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002293 return NULL;
2294}
2295
Victor Stinnerad158722010-10-27 00:25:46 +00002296PyObject *
2297PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002298{
Victor Stinner99b95382011-07-04 14:23:54 +02002299#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002300 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2301 PyUnicode_GET_SIZE(unicode),
2302 NULL);
2303#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002305#else
Victor Stinner793b5312011-04-27 00:24:21 +02002306 PyInterpreterState *interp = PyThreadState_GET()->interp;
2307 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2308 cannot use it to encode and decode filenames before it is loaded. Load
2309 the Python codec requires to encode at least its own filename. Use the C
2310 version of the locale codec until the codec registry is initialized and
2311 the Python codec is loaded.
2312
2313 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2314 cannot only rely on it: check also interp->fscodec_initialized for
2315 subinterpreters. */
2316 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002317 return PyUnicode_AsEncodedString(unicode,
2318 Py_FileSystemDefaultEncoding,
2319 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002320 }
2321 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002322 /* locale encoding with surrogateescape */
2323 wchar_t *wchar;
2324 char *bytes;
2325 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002326 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002327
2328 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2329 if (wchar == NULL)
2330 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002331 bytes = _Py_wchar2char(wchar, &error_pos);
2332 if (bytes == NULL) {
2333 if (error_pos != (size_t)-1) {
2334 char *errmsg = strerror(errno);
2335 PyObject *exc = NULL;
2336 if (errmsg == NULL)
2337 errmsg = "Py_wchar2char() failed";
2338 raise_encode_exception(&exc,
2339 "filesystemencoding",
2340 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2341 error_pos, error_pos+1,
2342 errmsg);
2343 Py_XDECREF(exc);
2344 }
2345 else
2346 PyErr_NoMemory();
2347 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002348 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002349 }
2350 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002351
2352 bytes_obj = PyBytes_FromString(bytes);
2353 PyMem_Free(bytes);
2354 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002355 }
Victor Stinnerad158722010-10-27 00:25:46 +00002356#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002357}
2358
Alexander Belopolsky40018472011-02-26 01:02:56 +00002359PyObject *
2360PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002361 const char *encoding,
2362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363{
2364 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002365 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002366
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 if (!PyUnicode_Check(unicode)) {
2368 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 }
Fred Drakee4315f52000-05-09 19:53:39 +00002371
Victor Stinner2f283c22011-03-02 01:21:46 +00002372 if (encoding == NULL) {
2373 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002377 }
Fred Drakee4315f52000-05-09 19:53:39 +00002378
2379 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002380 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002381 if ((strcmp(lower, "utf-8") == 0) ||
2382 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002383 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002384 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002386 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002388 }
Victor Stinner37296e82010-06-10 13:36:23 +00002389 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002390 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002391 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002393#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002394 else if (strcmp(lower, "mbcs") == 0)
2395 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2396 PyUnicode_GET_SIZE(unicode),
2397 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002398#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002399 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402
2403 /* Encode via the codec registry */
2404 v = PyCodec_Encode(unicode, encoding, errors);
2405 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002406 return NULL;
2407
2408 /* The normal path */
2409 if (PyBytes_Check(v))
2410 return v;
2411
2412 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002413 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002414 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002415 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002416
2417 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2418 "encoder %s returned bytearray instead of bytes",
2419 encoding);
2420 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002421 Py_DECREF(v);
2422 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002423 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002424
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002425 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2426 Py_DECREF(v);
2427 return b;
2428 }
2429
2430 PyErr_Format(PyExc_TypeError,
2431 "encoder did not return a bytes object (type=%.400s)",
2432 Py_TYPE(v)->tp_name);
2433 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002434 return NULL;
2435}
2436
Alexander Belopolsky40018472011-02-26 01:02:56 +00002437PyObject *
2438PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002439 const char *encoding,
2440 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002441{
2442 PyObject *v;
2443
2444 if (!PyUnicode_Check(unicode)) {
2445 PyErr_BadArgument();
2446 goto onError;
2447 }
2448
2449 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002450 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002451
2452 /* Encode via the codec registry */
2453 v = PyCodec_Encode(unicode, encoding, errors);
2454 if (v == NULL)
2455 goto onError;
2456 if (!PyUnicode_Check(v)) {
2457 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002458 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002459 Py_TYPE(v)->tp_name);
2460 Py_DECREF(v);
2461 goto onError;
2462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002464
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 return NULL;
2467}
2468
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002469PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002470PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002471 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002472 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2473}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002474
Christian Heimes5894ba72007-11-04 11:43:14 +00002475PyObject*
2476PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2477{
Victor Stinner99b95382011-07-04 14:23:54 +02002478#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002479 return PyUnicode_DecodeMBCS(s, size, NULL);
2480#elif defined(__APPLE__)
2481 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2482#else
Victor Stinner793b5312011-04-27 00:24:21 +02002483 PyInterpreterState *interp = PyThreadState_GET()->interp;
2484 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2485 cannot use it to encode and decode filenames before it is loaded. Load
2486 the Python codec requires to encode at least its own filename. Use the C
2487 version of the locale codec until the codec registry is initialized and
2488 the Python codec is loaded.
2489
2490 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2491 cannot only rely on it: check also interp->fscodec_initialized for
2492 subinterpreters. */
2493 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002494 return PyUnicode_Decode(s, size,
2495 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002496 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002497 }
2498 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002499 /* locale encoding with surrogateescape */
2500 wchar_t *wchar;
2501 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002502 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002503
2504 if (s[size] != '\0' || size != strlen(s)) {
2505 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2506 return NULL;
2507 }
2508
Victor Stinner168e1172010-10-16 23:16:16 +00002509 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002510 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002511 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002512
Victor Stinner168e1172010-10-16 23:16:16 +00002513 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002514 PyMem_Free(wchar);
2515 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002516 }
Victor Stinnerad158722010-10-27 00:25:46 +00002517#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002518}
2519
Martin v. Löwis011e8422009-05-05 04:43:17 +00002520
2521int
2522PyUnicode_FSConverter(PyObject* arg, void* addr)
2523{
2524 PyObject *output = NULL;
2525 Py_ssize_t size;
2526 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002527 if (arg == NULL) {
2528 Py_DECREF(*(PyObject**)addr);
2529 return 1;
2530 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002531 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002532 output = arg;
2533 Py_INCREF(output);
2534 }
2535 else {
2536 arg = PyUnicode_FromObject(arg);
2537 if (!arg)
2538 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002539 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002540 Py_DECREF(arg);
2541 if (!output)
2542 return 0;
2543 if (!PyBytes_Check(output)) {
2544 Py_DECREF(output);
2545 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2546 return 0;
2547 }
2548 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002549 size = PyBytes_GET_SIZE(output);
2550 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002551 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002552 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002553 Py_DECREF(output);
2554 return 0;
2555 }
2556 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002557 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002558}
2559
2560
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002561int
2562PyUnicode_FSDecoder(PyObject* arg, void* addr)
2563{
2564 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002565 if (arg == NULL) {
2566 Py_DECREF(*(PyObject**)addr);
2567 return 1;
2568 }
2569 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 if (PyUnicode_READY(arg))
2571 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002572 output = arg;
2573 Py_INCREF(output);
2574 }
2575 else {
2576 arg = PyBytes_FromObject(arg);
2577 if (!arg)
2578 return 0;
2579 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2580 PyBytes_GET_SIZE(arg));
2581 Py_DECREF(arg);
2582 if (!output)
2583 return 0;
2584 if (!PyUnicode_Check(output)) {
2585 Py_DECREF(output);
2586 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2587 return 0;
2588 }
2589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2591 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002592 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2593 Py_DECREF(output);
2594 return 0;
2595 }
2596 *(PyObject**)addr = output;
2597 return Py_CLEANUP_SUPPORTED;
2598}
2599
2600
Martin v. Löwis5b222132007-06-10 09:51:05 +00002601char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002603{
Christian Heimesf3863112007-11-22 07:46:41 +00002604 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2606
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_BadArgument();
2609 return NULL;
2610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002612 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613
2614 if (_PyUnicode_UTF8(unicode) == NULL) {
2615 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2616 if (bytes == NULL)
2617 return NULL;
2618 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2619 if (u->_base.utf8 == NULL) {
2620 Py_DECREF(bytes);
2621 return NULL;
2622 }
2623 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2624 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2625 Py_DECREF(bytes);
2626 }
2627
2628 if (psize)
2629 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2630 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002631}
2632
2633char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2637}
2638
2639#ifdef Py_DEBUG
2640int unicode_as_unicode_calls = 0;
2641#endif
2642
2643
2644Py_UNICODE *
2645PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2646{
2647 PyUnicodeObject *u;
2648 const unsigned char *one_byte;
2649#if SIZEOF_WCHAR_T == 4
2650 const Py_UCS2 *two_bytes;
2651#else
2652 const Py_UCS4 *four_bytes;
2653 const Py_UCS4 *ucs4_end;
2654 Py_ssize_t num_surrogates;
2655#endif
2656 wchar_t *w;
2657 wchar_t *wchar_end;
2658
2659 if (!PyUnicode_Check(unicode)) {
2660 PyErr_BadArgument();
2661 return NULL;
2662 }
2663 u = (PyUnicodeObject*)unicode;
2664 if (_PyUnicode_WSTR(u) == NULL) {
2665 /* Non-ASCII compact unicode object */
2666 assert(_PyUnicode_KIND(u) != 0);
2667 assert(PyUnicode_IS_READY(u));
2668
2669#ifdef Py_DEBUG
2670 ++unicode_as_unicode_calls;
2671#endif
2672
2673 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2674#if SIZEOF_WCHAR_T == 2
2675 four_bytes = PyUnicode_4BYTE_DATA(u);
2676 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2677 num_surrogates = 0;
2678
2679 for (; four_bytes < ucs4_end; ++four_bytes) {
2680 if (*four_bytes > 0xFFFF)
2681 ++num_surrogates;
2682 }
2683
2684 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2685 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2686 if (!_PyUnicode_WSTR(u)) {
2687 PyErr_NoMemory();
2688 return NULL;
2689 }
2690 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2691
2692 w = _PyUnicode_WSTR(u);
2693 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2694 four_bytes = PyUnicode_4BYTE_DATA(u);
2695 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2696 if (*four_bytes > 0xFFFF) {
2697 /* encode surrogate pair in this case */
2698 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2699 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2700 }
2701 else
2702 *w = *four_bytes;
2703
2704 if (w > wchar_end) {
2705 assert(0 && "Miscalculated string end");
2706 }
2707 }
2708 *w = 0;
2709#else
2710 /* sizeof(wchar_t) == 4 */
2711 Py_FatalError("Impossible unicode object state, wstr and str "
2712 "should share memory already.");
2713 return NULL;
2714#endif
2715 }
2716 else {
2717 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2718 (_PyUnicode_LENGTH(u) + 1));
2719 if (!_PyUnicode_WSTR(u)) {
2720 PyErr_NoMemory();
2721 return NULL;
2722 }
2723 if (!PyUnicode_IS_COMPACT_ASCII(u))
2724 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2725 w = _PyUnicode_WSTR(u);
2726 wchar_end = w + _PyUnicode_LENGTH(u);
2727
2728 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2729 one_byte = PyUnicode_1BYTE_DATA(u);
2730 for (; w < wchar_end; ++one_byte, ++w)
2731 *w = *one_byte;
2732 /* null-terminate the wstr */
2733 *w = 0;
2734 }
2735 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2736#if SIZEOF_WCHAR_T == 4
2737 two_bytes = PyUnicode_2BYTE_DATA(u);
2738 for (; w < wchar_end; ++two_bytes, ++w)
2739 *w = *two_bytes;
2740 /* null-terminate the wstr */
2741 *w = 0;
2742#else
2743 /* sizeof(wchar_t) == 2 */
2744 PyObject_FREE(_PyUnicode_WSTR(u));
2745 _PyUnicode_WSTR(u) = NULL;
2746 Py_FatalError("Impossible unicode object state, wstr "
2747 "and str should share memory already.");
2748 return NULL;
2749#endif
2750 }
2751 else {
2752 assert(0 && "This should never happen.");
2753 }
2754 }
2755 }
2756 if (size != NULL)
2757 *size = PyUnicode_WSTR_LENGTH(u);
2758 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002759}
2760
Alexander Belopolsky40018472011-02-26 01:02:56 +00002761Py_UNICODE *
2762PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765}
2766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767
Alexander Belopolsky40018472011-02-26 01:02:56 +00002768Py_ssize_t
2769PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770{
2771 if (!PyUnicode_Check(unicode)) {
2772 PyErr_BadArgument();
2773 goto onError;
2774 }
2775 return PyUnicode_GET_SIZE(unicode);
2776
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 return -1;
2779}
2780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781Py_ssize_t
2782PyUnicode_GetLength(PyObject *unicode)
2783{
2784 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2785 PyErr_BadArgument();
2786 return -1;
2787 }
2788
2789 return PyUnicode_GET_LENGTH(unicode);
2790}
2791
2792Py_UCS4
2793PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2794{
2795 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2796 return PyErr_BadArgument();
2797 return (Py_UCS4)-1;
2798 }
2799 return PyUnicode_READ_CHAR(unicode, index);
2800}
2801
2802int
2803PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2804{
2805 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2806 return PyErr_BadArgument();
2807 return -1;
2808 }
2809
2810 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2811 index, ch);
2812 return 0;
2813}
2814
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2820
Victor Stinner554f3f02010-06-16 23:33:54 +00002821/* create or adjust a UnicodeDecodeError */
2822static void
2823make_decode_exception(PyObject **exceptionObject,
2824 const char *encoding,
2825 const char *input, Py_ssize_t length,
2826 Py_ssize_t startpos, Py_ssize_t endpos,
2827 const char *reason)
2828{
2829 if (*exceptionObject == NULL) {
2830 *exceptionObject = PyUnicodeDecodeError_Create(
2831 encoding, input, length, startpos, endpos, reason);
2832 }
2833 else {
2834 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2835 goto onError;
2836 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2837 goto onError;
2838 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2839 goto onError;
2840 }
2841 return;
2842
2843onError:
2844 Py_DECREF(*exceptionObject);
2845 *exceptionObject = NULL;
2846}
2847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002848/* error handling callback helper:
2849 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002850 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 and adjust various state variables.
2852 return 0 on success, -1 on error
2853*/
2854
Alexander Belopolsky40018472011-02-26 01:02:56 +00002855static int
2856unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002857 const char *encoding, const char *reason,
2858 const char **input, const char **inend, Py_ssize_t *startinpos,
2859 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2860 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002861{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002862 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863
2864 PyObject *restuple = NULL;
2865 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002866 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002867 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t requiredsize;
2869 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002870 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002871 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002872 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 int res = -1;
2874
2875 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 *errorHandler = PyCodec_LookupError(errors);
2877 if (*errorHandler == NULL)
2878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 }
2880
Victor Stinner554f3f02010-06-16 23:33:54 +00002881 make_decode_exception(exceptionObject,
2882 encoding,
2883 *input, *inend - *input,
2884 *startinpos, *endinpos,
2885 reason);
2886 if (*exceptionObject == NULL)
2887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002888
2889 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2890 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002892 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002893 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 }
2896 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002898
2899 /* Copy back the bytes variables, which might have been modified by the
2900 callback */
2901 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2902 if (!inputobj)
2903 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002904 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002906 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002907 *input = PyBytes_AS_STRING(inputobj);
2908 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002909 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002910 /* we can DECREF safely, as the exception has another reference,
2911 so the object won't go away. */
2912 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002916 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2918 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002920
2921 /* need more space? (at least enough for what we
2922 have+the replacement+the rest of the string (starting
2923 at the new input position), so we won't have to check space
2924 when there are no errors in the rest of the string) */
2925 repptr = PyUnicode_AS_UNICODE(repunicode);
2926 repsize = PyUnicode_GET_SIZE(repunicode);
2927 requiredsize = *outpos + repsize + insize-newpos;
2928 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 if (requiredsize<2*outsize)
2930 requiredsize = 2*outsize;
2931 if (_PyUnicode_Resize(output, requiredsize) < 0)
2932 goto onError;
2933 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934 }
2935 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002936 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 Py_UNICODE_COPY(*outptr, repptr, repsize);
2938 *outptr += repsize;
2939 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002941 /* we made it! */
2942 res = 0;
2943
Benjamin Peterson29060642009-01-31 22:14:21 +00002944 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 Py_XDECREF(restuple);
2946 return res;
2947}
2948
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002949/* --- UTF-7 Codec -------------------------------------------------------- */
2950
Antoine Pitrou244651a2009-05-04 18:56:13 +00002951/* See RFC2152 for details. We encode conservatively and decode liberally. */
2952
/* Three simple macros defining base-64.  Each one may evaluate its
   argument several times, so pass expressions without side effects. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* Base-64 value (0..63) of c, given that c is a base-64 character.
   Anything that is not a letter, a digit or '+' yields 63, the value
   of '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2975
/* DECODE_DIRECT: does this byte, met in a UTF-7 string outside a base-64
 * run, decode as itself?  Decoding is permissive: every value up to 127
 * qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2983
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it indexes utf7_category.
 * directO and directWS are parenthesized so that compound expressions
 * can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003029
Alexander Belopolsky40018472011-02-26 01:02:56 +00003030PyObject *
3031PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003032 Py_ssize_t size,
3033 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003034{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003035 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3036}
3037
Antoine Pitrou244651a2009-05-04 18:56:13 +00003038/* The decoder. The only state we preserve is our read position,
3039 * i.e. how many characters we have consumed. So if we end in the
3040 * middle of a shift sequence we have to back off the read position
3041 * and the output to the beginning of the sequence, otherwise we lose
3042 * all the shift state (seen bits, number of bits seen, high
3043 * surrogate). */
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 Py_ssize_t size,
3048 const char *errors,
3049 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003051 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003052 Py_ssize_t startinpos;
3053 Py_ssize_t endinpos;
3054 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003055 const char *e;
3056 PyUnicodeObject *unicode;
3057 Py_UNICODE *p;
3058 const char *errmsg = "";
3059 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003060 Py_UNICODE *shiftOutStart;
3061 unsigned int base64bits = 0;
3062 unsigned long base64buffer = 0;
3063 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003066
3067 unicode = _PyUnicode_New(size);
3068 if (!unicode)
3069 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003070 if (size == 0) {
3071 if (consumed)
3072 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003073 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003074 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003077 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003078 e = s + size;
3079
3080 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003083 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003084
Antoine Pitrou244651a2009-05-04 18:56:13 +00003085 if (inShift) { /* in a base-64 section */
3086 if (IS_BASE64(ch)) { /* consume a base-64 character */
3087 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3088 base64bits += 6;
3089 s++;
3090 if (base64bits >= 16) {
3091 /* we have enough bits for a UTF-16 value */
3092 Py_UNICODE outCh = (Py_UNICODE)
3093 (base64buffer >> (base64bits-16));
3094 base64bits -= 16;
3095 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3096 if (surrogate) {
3097 /* expecting a second surrogate */
3098 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3099#ifdef Py_UNICODE_WIDE
3100 *p++ = (((surrogate & 0x3FF)<<10)
3101 | (outCh & 0x3FF)) + 0x10000;
3102#else
3103 *p++ = surrogate;
3104 *p++ = outCh;
3105#endif
3106 surrogate = 0;
3107 }
3108 else {
3109 surrogate = 0;
3110 errmsg = "second surrogate missing";
3111 goto utf7Error;
3112 }
3113 }
3114 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3115 /* first surrogate */
3116 surrogate = outCh;
3117 }
3118 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3119 errmsg = "unexpected second surrogate";
3120 goto utf7Error;
3121 }
3122 else {
3123 *p++ = outCh;
3124 }
3125 }
3126 }
3127 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003128 inShift = 0;
3129 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003130 if (surrogate) {
3131 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003132 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003133 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003134 if (base64bits > 0) { /* left-over bits */
3135 if (base64bits >= 6) {
3136 /* We've seen at least one base-64 character */
3137 errmsg = "partial character in shift sequence";
3138 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003139 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003140 else {
3141 /* Some bits remain; they should be zero */
3142 if (base64buffer != 0) {
3143 errmsg = "non-zero padding bits in shift sequence";
3144 goto utf7Error;
3145 }
3146 }
3147 }
3148 if (ch != '-') {
3149 /* '-' is absorbed; other terminating
3150 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003151 *p++ = ch;
3152 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 }
3154 }
3155 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003157 s++; /* consume '+' */
3158 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003159 s++;
3160 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003161 }
3162 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003163 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003164 shiftOutStart = p;
3165 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003166 }
3167 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003168 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003169 *p++ = ch;
3170 s++;
3171 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003172 else {
3173 startinpos = s-starts;
3174 s++;
3175 errmsg = "unexpected special character";
3176 goto utf7Error;
3177 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003178 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003179utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003180 outpos = p-PyUnicode_AS_UNICODE(unicode);
3181 endinpos = s-starts;
3182 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 errors, &errorHandler,
3184 "utf7", errmsg,
3185 &starts, &e, &startinpos, &endinpos, &exc, &s,
3186 &unicode, &outpos, &p))
3187 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003188 }
3189
Antoine Pitrou244651a2009-05-04 18:56:13 +00003190 /* end of string */
3191
3192 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3193 /* if we're in an inconsistent state, that's an error */
3194 if (surrogate ||
3195 (base64bits >= 6) ||
3196 (base64bits > 0 && base64buffer != 0)) {
3197 outpos = p-PyUnicode_AS_UNICODE(unicode);
3198 endinpos = size;
3199 if (unicode_decode_call_errorhandler(
3200 errors, &errorHandler,
3201 "utf7", "unterminated shift sequence",
3202 &starts, &e, &startinpos, &endinpos, &exc, &s,
3203 &unicode, &outpos, &p))
3204 goto onError;
3205 if (s < e)
3206 goto restart;
3207 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003208 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003209
3210 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003211 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003212 if (inShift) {
3213 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003214 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003215 }
3216 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003217 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003218 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003219 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003220
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003221 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003222 goto onError;
3223
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 Py_XDECREF(errorHandler);
3225 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 if (PyUnicode_READY(unicode) == -1) {
3227 Py_DECREF(unicode);
3228 return NULL;
3229 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230 return (PyObject *)unicode;
3231
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 Py_XDECREF(errorHandler);
3234 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003235 Py_DECREF(unicode);
3236 return NULL;
3237}
3238
3239
Alexander Belopolsky40018472011-02-26 01:02:56 +00003240PyObject *
3241PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003242 Py_ssize_t size,
3243 int base64SetO,
3244 int base64WhiteSpace,
3245 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003246{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003247 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003248 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003249 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003250 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003251 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003252 unsigned int base64bits = 0;
3253 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003254 char * out;
3255 char * start;
3256
3257 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003259
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003260 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003261 return PyErr_NoMemory();
3262
Antoine Pitrou244651a2009-05-04 18:56:13 +00003263 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003264 if (v == NULL)
3265 return NULL;
3266
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003267 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268 for (;i < size; ++i) {
3269 Py_UNICODE ch = s[i];
3270
Antoine Pitrou244651a2009-05-04 18:56:13 +00003271 if (inShift) {
3272 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3273 /* shifting out */
3274 if (base64bits) { /* output remaining bits */
3275 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3276 base64buffer = 0;
3277 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 }
3279 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003280 /* Characters not in the BASE64 set implicitly unshift the sequence
3281 so no '-' is required, except if the character is itself a '-' */
3282 if (IS_BASE64(ch) || ch == '-') {
3283 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003284 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003285 *out++ = (char) ch;
3286 }
3287 else {
3288 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003289 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003290 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003291 else { /* not in a shift sequence */
3292 if (ch == '+') {
3293 *out++ = '+';
3294 *out++ = '-';
3295 }
3296 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3297 *out++ = (char) ch;
3298 }
3299 else {
3300 *out++ = '+';
3301 inShift = 1;
3302 goto encode_char;
3303 }
3304 }
3305 continue;
3306encode_char:
3307#ifdef Py_UNICODE_WIDE
3308 if (ch >= 0x10000) {
3309 /* code first surrogate */
3310 base64bits += 16;
3311 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3312 while (base64bits >= 6) {
3313 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3314 base64bits -= 6;
3315 }
3316 /* prepare second surrogate */
3317 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3318 }
3319#endif
3320 base64bits += 16;
3321 base64buffer = (base64buffer << 16) | ch;
3322 while (base64bits >= 6) {
3323 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3324 base64bits -= 6;
3325 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003327 if (base64bits)
3328 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3329 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003330 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003331 if (_PyBytes_Resize(&v, out - start) < 0)
3332 return NULL;
3333 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003334}
3335
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003341
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Read-only lookup table indexed by the first byte of a UTF-8 sequence. */
static const
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3365
Alexander Belopolsky40018472011-02-26 01:02:56 +00003366PyObject *
3367PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003368 Py_ssize_t size,
3369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370{
Walter Dörwald69652032004-09-07 20:24:22 +00003371 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3372}
3373
Antoine Pitrouab868312009-01-10 15:40:25 +00003374/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3375#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3376
3377/* Mask to quickly check whether a C 'long' contains a
3378 non-ASCII, UTF8-encoded char. */
3379#if (SIZEOF_LONG == 8)
3380# define ASCII_CHAR_MASK 0x8080808080808080L
3381#elif (SIZEOF_LONG == 4)
3382# define ASCII_CHAR_MASK 0x80808080L
3383#else
3384# error C 'long' size should be either 4 or 8!
3385#endif
3386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387/* Scans a UTF-8 string and returns the maximum character to be expected,
3388 the size of the decoded unicode string and if any major errors were
3389 encountered.
3390
3391 This function does check basic UTF-8 sanity, it does however NOT CHECK
3392 if the string contains surrogates, and if all continuation bytes are
3393 within the correct ranges, these checks are performed in
3394 PyUnicode_DecodeUTF8Stateful.
3395
3396 If it sets has_errors to 1, it means the value of unicode_size and max_char
3397 will be bogus and you should not rely on useful information in them.
3398 */
3399static Py_UCS4
3400utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3401 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3402 int *has_errors)
3403{
3404 Py_ssize_t n;
3405 Py_ssize_t char_count = 0;
3406 Py_UCS4 max_char = 127, new_max;
3407 Py_UCS4 upper_bound;
3408 const unsigned char *p = (const unsigned char *)s;
3409 const unsigned char *end = p + string_size;
3410 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3411 int err = 0;
3412
3413 for (; p < end && !err; ++p, ++char_count) {
3414 /* Only check value if it's not a ASCII char... */
3415 if (*p < 0x80) {
3416 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3417 an explanation. */
3418 if (!((size_t) p & LONG_PTR_MASK)) {
3419 /* Help register allocation */
3420 register const unsigned char *_p = p;
3421 while (_p < aligned_end) {
3422 unsigned long value = *(unsigned long *) _p;
3423 if (value & ASCII_CHAR_MASK)
3424 break;
3425 _p += SIZEOF_LONG;
3426 char_count += SIZEOF_LONG;
3427 }
3428 p = _p;
3429 if (p == end)
3430 break;
3431 }
3432 }
3433 if (*p >= 0x80) {
3434 n = utf8_code_length[*p];
3435 new_max = max_char;
3436 switch (n) {
3437 /* invalid start byte */
3438 case 0:
3439 err = 1;
3440 break;
3441 case 2:
3442 /* Code points between 0x00FF and 0x07FF inclusive.
3443 Approximate the upper bound of the code point,
3444 if this flips over 255 we can be sure it will be more
3445 than 255 and the string will need 2 bytes per code coint,
3446 if it stays under or equal to 255, we can be sure 1 byte
3447 is enough.
3448 ((*p & 0b00011111) << 6) | 0b00111111 */
3449 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3450 if (max_char < upper_bound)
3451 new_max = upper_bound;
3452 /* Ensure we track at least that we left ASCII space. */
3453 if (new_max < 128)
3454 new_max = 128;
3455 break;
3456 case 3:
3457 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3458 always > 255 and <= 65535 and will always need 2 bytes. */
3459 if (max_char < 65535)
3460 new_max = 65535;
3461 break;
3462 case 4:
3463 /* Code point will be above 0xFFFF for sure in this case. */
3464 new_max = 65537;
3465 break;
3466 /* Internal error, this should be caught by the first if */
3467 case 1:
3468 default:
3469 assert(0 && "Impossible case in utf8_max_char_and_size");
3470 err = 1;
3471 }
3472 /* Instead of number of overall bytes for this code point,
3473 n containts the number of following bytes: */
3474 --n;
3475 /* Check if the follow up chars are all valid continuation bytes */
3476 if (n >= 1) {
3477 const unsigned char *cont;
3478 if ((p + n) >= end) {
3479 if (consumed == 0)
3480 /* incomplete data, non-incremental decoding */
3481 err = 1;
3482 break;
3483 }
3484 for (cont = p + 1; cont < (p + n); ++cont) {
3485 if ((*cont & 0xc0) != 0x80) {
3486 err = 1;
3487 break;
3488 }
3489 }
3490 p += n;
3491 }
3492 else
3493 err = 1;
3494 max_char = new_max;
3495 }
3496 }
3497
3498 if (unicode_size)
3499 *unicode_size = char_count;
3500 if (has_errors)
3501 *has_errors = err;
3502 return max_char;
3503}
3504
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, with the element type
   selected by `kind`: Py_UNICODE for PyUnicode_WCHAR_KIND, unsigned char
   for PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4
   for any other kind.  `kind` is evaluated once; `buf`, `index` and
   `value` are each evaluated once, in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
3519
Alexander Belopolsky40018472011-02-26 01:02:56 +00003520PyObject *
3521PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003522 Py_ssize_t size,
3523 const char *errors,
3524 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003528 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003529 Py_ssize_t startinpos;
3530 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003531 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003533 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 PyObject *errorHandler = NULL;
3535 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 Py_UCS4 maxchar = 0;
3537 Py_ssize_t unicode_size;
3538 Py_ssize_t i;
3539 int kind;
3540 void *data;
3541 int has_errors;
3542 Py_UNICODE *error_outptr;
3543#if SIZEOF_WCHAR_T == 2
3544 Py_ssize_t wchar_offset = 0;
3545#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546
Walter Dörwald69652032004-09-07 20:24:22 +00003547 if (size == 0) {
3548 if (consumed)
3549 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003550 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3553 consumed, &has_errors);
3554 if (has_errors) {
3555 unicode = _PyUnicode_New(size);
3556 if (!unicode)
3557 return NULL;
3558 kind = PyUnicode_WCHAR_KIND;
3559 data = PyUnicode_AS_UNICODE(unicode);
3560 assert(data != NULL);
3561 }
3562 else {
3563 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3564 if (!unicode)
3565 return NULL;
3566 /* When the string is ASCII only, just use memcpy and return.
3567 unicode_size may be != size if there is an incomplete UTF-8
3568 sequence at the end of the ASCII block. */
3569 if (maxchar < 128 && size == unicode_size) {
3570 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3571 return (PyObject *)unicode;
3572 }
3573 kind = PyUnicode_KIND(unicode);
3574 data = PyUnicode_DATA(unicode);
3575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003577 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003579 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
3581 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003582 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583
3584 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003585 /* Fast path for runs of ASCII characters. Given that common UTF-8
3586 input will consist of an overwhelming majority of ASCII
3587 characters, we try to optimize for this case by checking
3588 as many characters as a C 'long' can contain.
3589 First, check if we can do an aligned read, as most CPUs have
3590 a penalty for unaligned reads.
3591 */
3592 if (!((size_t) s & LONG_PTR_MASK)) {
3593 /* Help register allocation */
3594 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003596 while (_s < aligned_end) {
3597 /* Read a whole long at a time (either 4 or 8 bytes),
3598 and do a fast unrolled copy if it only contains ASCII
3599 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003600 unsigned long value = *(unsigned long *) _s;
3601 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003602 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3604 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3605 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3606 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003607#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003608 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3609 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3610 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3611 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003612#endif
3613 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003614 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003615 }
3616 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003617 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003618 if (s == e)
3619 break;
3620 ch = (unsigned char)*s;
3621 }
3622 }
3623
3624 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003625 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 s++;
3627 continue;
3628 }
3629
3630 n = utf8_code_length[ch];
3631
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003632 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 if (consumed)
3634 break;
3635 else {
3636 errmsg = "unexpected end of data";
3637 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003638 endinpos = startinpos+1;
3639 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3640 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 goto utf8Error;
3642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 switch (n) {
3646
3647 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003648 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 startinpos = s-starts;
3650 endinpos = startinpos+1;
3651 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652
3653 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003654 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 startinpos = s-starts;
3656 endinpos = startinpos+1;
3657 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658
3659 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003660 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003661 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003663 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 goto utf8Error;
3665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003667 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003668 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 break;
3670
3671 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003672 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3673 will result in surrogates in range d800-dfff. Surrogates are
3674 not valid UTF-8 so they are rejected.
3675 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3676 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003677 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003678 (s[2] & 0xc0) != 0x80 ||
3679 ((unsigned char)s[0] == 0xE0 &&
3680 (unsigned char)s[1] < 0xA0) ||
3681 ((unsigned char)s[0] == 0xED &&
3682 (unsigned char)s[1] > 0x9F)) {
3683 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003685 endinpos = startinpos + 1;
3686
3687 /* if s[1] first two bits are 1 and 0, then the invalid
3688 continuation byte is s[2], so increment endinpos by 1,
3689 if not, s[1] is invalid and endinpos doesn't need to
3690 be incremented. */
3691 if ((s[1] & 0xC0) == 0x80)
3692 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 goto utf8Error;
3694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003696 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003697 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003698 break;
3699
3700 case 4:
3701 if ((s[1] & 0xc0) != 0x80 ||
3702 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003703 (s[3] & 0xc0) != 0x80 ||
3704 ((unsigned char)s[0] == 0xF0 &&
3705 (unsigned char)s[1] < 0x90) ||
3706 ((unsigned char)s[0] == 0xF4 &&
3707 (unsigned char)s[1] > 0x8F)) {
3708 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003710 endinpos = startinpos + 1;
3711 if ((s[1] & 0xC0) == 0x80) {
3712 endinpos++;
3713 if ((s[2] & 0xC0) == 0x80)
3714 endinpos++;
3715 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 goto utf8Error;
3717 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003718 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003719 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3720 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 /* If the string is flexible or we have native UCS-4, write
3723 directly.. */
3724 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3725 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003727 else {
3728 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003730 /* translate from 10000..10FFFF to 0..FFFF */
3731 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 /* high surrogate = top 10 bits added to D800 */
3734 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3735 (Py_UNICODE)(0xD800 + (ch >> 10)));
3736
3737 /* low surrogate = bottom 10 bits added to DC00 */
3738 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3739 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3740 }
3741#if SIZEOF_WCHAR_T == 2
3742 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003743#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 }
3746 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003747 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003748
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 /* If this is not yet a resizable string, make it one.. */
3751 if (kind != PyUnicode_WCHAR_KIND) {
3752 const Py_UNICODE *u;
3753 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3754 if (!new_unicode)
3755 goto onError;
3756 u = PyUnicode_AsUnicode((PyObject *)unicode);
3757 if (!u)
3758 goto onError;
3759#if SIZEOF_WCHAR_T == 2
3760 i += wchar_offset;
3761#endif
3762 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3763 Py_DECREF(unicode);
3764 unicode = new_unicode;
3765 kind = 0;
3766 data = PyUnicode_AS_UNICODE(new_unicode);
3767 assert(data != NULL);
3768 }
3769 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003770 if (unicode_decode_call_errorhandler(
3771 errors, &errorHandler,
3772 "utf8", errmsg,
3773 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 /* Update data because unicode_decode_call_errorhandler might have
3777 re-created or resized the unicode object. */
3778 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 /* Ensure the unicode_size calculation above was correct: */
3782 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3783
Walter Dörwald69652032004-09-07 20:24:22 +00003784 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 /* Adjust length and ready string when it contained errors and
3788 is of the old resizable kind. */
3789 if (kind == PyUnicode_WCHAR_KIND) {
3790 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3791 PyUnicode_READY(unicode) == -1)
3792 goto onError;
3793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 Py_XDECREF(errorHandler);
3796 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 if (PyUnicode_READY(unicode) == -1) {
3798 Py_DECREF(unicode);
3799 return NULL;
3800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 return (PyObject *)unicode;
3802
Benjamin Peterson29060642009-01-31 22:14:21 +00003803 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 Py_XDECREF(errorHandler);
3805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 Py_DECREF(unicode);
3807 return NULL;
3808}
3809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003811
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003812#ifdef __APPLE__
3813
3814/* Simplified UTF-8 decoder using surrogateescape error handler,
3815 used to decode the command line arguments on Mac OS X. */
3816
3817wchar_t*
3818_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3819{
3820 int n;
3821 const char *e;
3822 wchar_t *unicode, *p;
3823
3824 /* Note: size will always be longer than the resulting Unicode
3825 character count */
3826 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3827 PyErr_NoMemory();
3828 return NULL;
3829 }
3830 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3831 if (!unicode)
3832 return NULL;
3833
3834 /* Unpack UTF-8 encoded data */
3835 p = unicode;
3836 e = s + size;
3837 while (s < e) {
3838 Py_UCS4 ch = (unsigned char)*s;
3839
3840 if (ch < 0x80) {
3841 *p++ = (wchar_t)ch;
3842 s++;
3843 continue;
3844 }
3845
3846 n = utf8_code_length[ch];
3847 if (s + n > e) {
3848 goto surrogateescape;
3849 }
3850
3851 switch (n) {
3852 case 0:
3853 case 1:
3854 goto surrogateescape;
3855
3856 case 2:
3857 if ((s[1] & 0xc0) != 0x80)
3858 goto surrogateescape;
3859 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3860 assert ((ch > 0x007F) && (ch <= 0x07FF));
3861 *p++ = (wchar_t)ch;
3862 break;
3863
3864 case 3:
3865 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3866 will result in surrogates in range d800-dfff. Surrogates are
3867 not valid UTF-8 so they are rejected.
3868 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3869 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3870 if ((s[1] & 0xc0) != 0x80 ||
3871 (s[2] & 0xc0) != 0x80 ||
3872 ((unsigned char)s[0] == 0xE0 &&
3873 (unsigned char)s[1] < 0xA0) ||
3874 ((unsigned char)s[0] == 0xED &&
3875 (unsigned char)s[1] > 0x9F)) {
3876
3877 goto surrogateescape;
3878 }
3879 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3880 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003882 break;
3883
3884 case 4:
3885 if ((s[1] & 0xc0) != 0x80 ||
3886 (s[2] & 0xc0) != 0x80 ||
3887 (s[3] & 0xc0) != 0x80 ||
3888 ((unsigned char)s[0] == 0xF0 &&
3889 (unsigned char)s[1] < 0x90) ||
3890 ((unsigned char)s[0] == 0xF4 &&
3891 (unsigned char)s[1] > 0x8F)) {
3892 goto surrogateescape;
3893 }
3894 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3895 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3896 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3897
3898#if SIZEOF_WCHAR_T == 4
3899 *p++ = (wchar_t)ch;
3900#else
3901 /* compute and append the two surrogates: */
3902
3903 /* translate from 10000..10FFFF to 0..FFFF */
3904 ch -= 0x10000;
3905
3906 /* high surrogate = top 10 bits added to D800 */
3907 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3908
3909 /* low surrogate = bottom 10 bits added to DC00 */
3910 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3911#endif
3912 break;
3913 }
3914 s += n;
3915 continue;
3916
3917 surrogateescape:
3918 *p++ = 0xDC00 + ch;
3919 s++;
3920 }
3921 *p = L'\0';
3922 return unicode;
3923}
3924
3925#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927/* Primary internal function which creates utf8 encoded bytes objects.
3928
3929 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003930 and allocate exactly as much space needed at the end. Else allocate the
3931 maximum possible needed (4 result bytes per Unicode character), and return
3932 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003933*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003934PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936{
Tim Peters602f7402002-04-27 18:03:26 +00003937#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003938
Guido van Rossum98297ee2007-11-06 21:34:58 +00003939 Py_ssize_t i; /* index into s of next input byte */
3940 PyObject *result; /* result string object */
3941 char *p; /* next free byte in output buffer */
3942 Py_ssize_t nallocated; /* number of result bytes allocated */
3943 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003944 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003945 PyObject *errorHandler = NULL;
3946 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 int kind;
3948 void *data;
3949 Py_ssize_t size;
3950 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3951#if SIZEOF_WCHAR_T == 2
3952 Py_ssize_t wchar_offset = 0;
3953#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 if (!PyUnicode_Check(unicode)) {
3956 PyErr_BadArgument();
3957 return NULL;
3958 }
3959
3960 if (PyUnicode_READY(unicode) == -1)
3961 return NULL;
3962
3963 if (_PyUnicode_UTF8(unicode))
3964 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3965 _PyUnicode_UTF8_LENGTH(unicode));
3966
3967 kind = PyUnicode_KIND(unicode);
3968 data = PyUnicode_DATA(unicode);
3969 size = PyUnicode_GET_LENGTH(unicode);
3970
Tim Peters602f7402002-04-27 18:03:26 +00003971 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972
Tim Peters602f7402002-04-27 18:03:26 +00003973 if (size <= MAX_SHORT_UNICHARS) {
3974 /* Write into the stack buffer; nallocated can't overflow.
3975 * At the end, we'll allocate exactly as much heap space as it
3976 * turns out we need.
3977 */
3978 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003979 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003980 p = stackbuf;
3981 }
3982 else {
3983 /* Overallocate on the heap, and give the excess back at the end. */
3984 nallocated = size * 4;
3985 if (nallocated / 4 != size) /* overflow! */
3986 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003987 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003988 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003989 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003990 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003991 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003992
Tim Peters602f7402002-04-27 18:03:26 +00003993 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003995
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003996 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003997 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003999
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004001 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004002 *p++ = (char)(0xc0 | (ch >> 6));
4003 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004004 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 Py_ssize_t newpos;
4006 PyObject *rep;
4007 Py_ssize_t repsize, k, startpos;
4008 startpos = i-1;
4009#if SIZEOF_WCHAR_T == 2
4010 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004011#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 rep = unicode_encode_call_errorhandler(
4013 errors, &errorHandler, "utf-8", "surrogates not allowed",
4014 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4015 &exc, startpos, startpos+1, &newpos);
4016 if (!rep)
4017 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 if (PyBytes_Check(rep))
4020 repsize = PyBytes_GET_SIZE(rep);
4021 else
4022 repsize = PyUnicode_GET_SIZE(rep);
4023
4024 if (repsize > 4) {
4025 Py_ssize_t offset;
4026
4027 if (result == NULL)
4028 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004029 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4033 /* integer overflow */
4034 PyErr_NoMemory();
4035 goto error;
4036 }
4037 nallocated += repsize - 4;
4038 if (result != NULL) {
4039 if (_PyBytes_Resize(&result, nallocated) < 0)
4040 goto error;
4041 } else {
4042 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004043 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 goto error;
4045 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4046 }
4047 p = PyBytes_AS_STRING(result) + offset;
4048 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 if (PyBytes_Check(rep)) {
4051 char *prep = PyBytes_AS_STRING(rep);
4052 for(k = repsize; k > 0; k--)
4053 *p++ = *prep++;
4054 } else /* rep is unicode */ {
4055 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4056 Py_UNICODE c;
4057
4058 for(k=0; k<repsize; k++) {
4059 c = prep[k];
4060 if (0x80 <= c) {
4061 raise_encode_exception(&exc, "utf-8",
4062 PyUnicode_AS_UNICODE(unicode),
4063 size, i-1, i,
4064 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004065 goto error;
4066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004068 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004071 } else if (ch < 0x10000) {
4072 *p++ = (char)(0xe0 | (ch >> 12));
4073 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4074 *p++ = (char)(0x80 | (ch & 0x3f));
4075 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004076 /* Encode UCS4 Unicode ordinals */
4077 *p++ = (char)(0xf0 | (ch >> 18));
4078 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4079 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4080 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081#if SIZEOF_WCHAR_T == 2
4082 wchar_offset++;
4083#endif
Tim Peters602f7402002-04-27 18:03:26 +00004084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004086
Guido van Rossum98297ee2007-11-06 21:34:58 +00004087 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004088 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004089 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004090 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004091 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004092 }
4093 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004094 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004095 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004096 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004097 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004100 Py_XDECREF(errorHandler);
4101 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004102 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004103 error:
4104 Py_XDECREF(errorHandler);
4105 Py_XDECREF(exc);
4106 Py_XDECREF(result);
4107 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004108
Tim Peters602f7402002-04-27 18:03:26 +00004109#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110}
4111
Alexander Belopolsky40018472011-02-26 01:02:56 +00004112PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4114 Py_ssize_t size,
4115 const char *errors)
4116{
4117 PyObject *v, *unicode;
4118
4119 unicode = PyUnicode_FromUnicode(s, size);
4120 if (unicode == NULL)
4121 return NULL;
4122 v = _PyUnicode_AsUTF8String(unicode, errors);
4123 Py_DECREF(unicode);
4124 return v;
4125}
4126
4127PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004128PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131}
4132
Walter Dörwald41980ca2007-08-16 21:55:45 +00004133/* --- UTF-32 Codec ------------------------------------------------------- */
4134
4135PyObject *
4136PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 Py_ssize_t size,
4138 const char *errors,
4139 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004140{
4141 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4142}
4143
4144PyObject *
4145PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 Py_ssize_t size,
4147 const char *errors,
4148 int *byteorder,
4149 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004150{
4151 const char *starts = s;
4152 Py_ssize_t startinpos;
4153 Py_ssize_t endinpos;
4154 Py_ssize_t outpos;
4155 PyUnicodeObject *unicode;
4156 Py_UNICODE *p;
4157#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004158 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004159 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004160#else
4161 const int pairs = 0;
4162#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004163 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004164 int bo = 0; /* assume native ordering by default */
4165 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004166 /* Offsets from q for retrieving bytes in the right order. */
4167#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4168 int iorder[] = {0, 1, 2, 3};
4169#else
4170 int iorder[] = {3, 2, 1, 0};
4171#endif
4172 PyObject *errorHandler = NULL;
4173 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004174
Walter Dörwald41980ca2007-08-16 21:55:45 +00004175 q = (unsigned char *)s;
4176 e = q + size;
4177
4178 if (byteorder)
4179 bo = *byteorder;
4180
4181 /* Check for BOM marks (U+FEFF) in the input and adjust current
4182 byte order setting accordingly. In native mode, the leading BOM
4183 mark is skipped, in all other modes, it is copied to the output
4184 stream as-is (giving a ZWNBSP character). */
4185 if (bo == 0) {
4186 if (size >= 4) {
4187 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004189#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 if (bom == 0x0000FEFF) {
4191 q += 4;
4192 bo = -1;
4193 }
4194 else if (bom == 0xFFFE0000) {
4195 q += 4;
4196 bo = 1;
4197 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004198#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 if (bom == 0x0000FEFF) {
4200 q += 4;
4201 bo = 1;
4202 }
4203 else if (bom == 0xFFFE0000) {
4204 q += 4;
4205 bo = -1;
4206 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004207#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004209 }
4210
4211 if (bo == -1) {
4212 /* force LE */
4213 iorder[0] = 0;
4214 iorder[1] = 1;
4215 iorder[2] = 2;
4216 iorder[3] = 3;
4217 }
4218 else if (bo == 1) {
4219 /* force BE */
4220 iorder[0] = 3;
4221 iorder[1] = 2;
4222 iorder[2] = 1;
4223 iorder[3] = 0;
4224 }
4225
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004226 /* On narrow builds we split characters outside the BMP into two
4227 codepoints => count how much extra space we need. */
4228#ifndef Py_UNICODE_WIDE
4229 for (qq = q; qq < e; qq += 4)
4230 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4231 pairs++;
4232#endif
4233
4234 /* This might be one to much, because of a BOM */
4235 unicode = _PyUnicode_New((size+3)/4+pairs);
4236 if (!unicode)
4237 return NULL;
4238 if (size == 0)
4239 return (PyObject *)unicode;
4240
4241 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004243
Walter Dörwald41980ca2007-08-16 21:55:45 +00004244 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 Py_UCS4 ch;
4246 /* remaining bytes at the end? (size should be divisible by 4) */
4247 if (e-q<4) {
4248 if (consumed)
4249 break;
4250 errmsg = "truncated data";
4251 startinpos = ((const char *)q)-starts;
4252 endinpos = ((const char *)e)-starts;
4253 goto utf32Error;
4254 /* The remaining input chars are ignored if the callback
4255 chooses to skip the input */
4256 }
4257 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4258 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004259
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 if (ch >= 0x110000)
4261 {
4262 errmsg = "codepoint not in range(0x110000)";
4263 startinpos = ((const char *)q)-starts;
4264 endinpos = startinpos+4;
4265 goto utf32Error;
4266 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004267#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 if (ch >= 0x10000)
4269 {
4270 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4271 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4272 }
4273 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004274#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 *p++ = ch;
4276 q += 4;
4277 continue;
4278 utf32Error:
4279 outpos = p-PyUnicode_AS_UNICODE(unicode);
4280 if (unicode_decode_call_errorhandler(
4281 errors, &errorHandler,
4282 "utf32", errmsg,
4283 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4284 &unicode, &outpos, &p))
4285 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004286 }
4287
4288 if (byteorder)
4289 *byteorder = bo;
4290
4291 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004293
4294 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004296 goto onError;
4297
4298 Py_XDECREF(errorHandler);
4299 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004300 if (PyUnicode_READY(unicode) == -1) {
4301 Py_DECREF(unicode);
4302 return NULL;
4303 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304 return (PyObject *)unicode;
4305
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004307 Py_DECREF(unicode);
4308 Py_XDECREF(errorHandler);
4309 Py_XDECREF(exc);
4310 return NULL;
4311}
4312
4313PyObject *
4314PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 Py_ssize_t size,
4316 const char *errors,
4317 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004318{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004319 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004320 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004321 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004322#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004323 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004324#else
4325 const int pairs = 0;
4326#endif
4327 /* Offsets from p for storing byte pairs in the right order. */
4328#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4329 int iorder[] = {0, 1, 2, 3};
4330#else
4331 int iorder[] = {3, 2, 1, 0};
4332#endif
4333
Benjamin Peterson29060642009-01-31 22:14:21 +00004334#define STORECHAR(CH) \
4335 do { \
4336 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4337 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4338 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4339 p[iorder[0]] = (CH) & 0xff; \
4340 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004341 } while(0)
4342
4343 /* In narrow builds we can output surrogate pairs as one codepoint,
4344 so we need less space. */
4345#ifndef Py_UNICODE_WIDE
4346 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4348 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4349 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004350#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004351 nsize = (size - pairs + (byteorder == 0));
4352 bytesize = nsize * 4;
4353 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004355 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004356 if (v == NULL)
4357 return NULL;
4358
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004359 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004360 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004362 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004363 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004364
4365 if (byteorder == -1) {
4366 /* force LE */
4367 iorder[0] = 0;
4368 iorder[1] = 1;
4369 iorder[2] = 2;
4370 iorder[3] = 3;
4371 }
4372 else if (byteorder == 1) {
4373 /* force BE */
4374 iorder[0] = 3;
4375 iorder[1] = 2;
4376 iorder[2] = 1;
4377 iorder[3] = 0;
4378 }
4379
4380 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004382#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4384 Py_UCS4 ch2 = *s;
4385 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4386 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4387 s++;
4388 size--;
4389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004390 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004391#endif
4392 STORECHAR(ch);
4393 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004394
4395 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004396 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004397#undef STORECHAR
4398}
4399
Alexander Belopolsky40018472011-02-26 01:02:56 +00004400PyObject *
4401PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004402{
4403 if (!PyUnicode_Check(unicode)) {
4404 PyErr_BadArgument();
4405 return NULL;
4406 }
4407 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 PyUnicode_GET_SIZE(unicode),
4409 NULL,
4410 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004411}
4412
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413/* --- UTF-16 Codec ------------------------------------------------------- */
4414
Tim Peters772747b2001-08-09 22:21:55 +00004415PyObject *
4416PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 Py_ssize_t size,
4418 const char *errors,
4419 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420{
Walter Dörwald69652032004-09-07 20:24:22 +00004421 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4422}
4423
Antoine Pitrouab868312009-01-10 15:40:25 +00004424/* Two masks for fast checking of whether a C 'long' may contain
4425 UTF16-encoded surrogate characters. This is an efficient heuristic,
4426 assuming that non-surrogate characters with a code point >= 0x8000 are
4427 rare in most input.
4428 FAST_CHAR_MASK is used when the input is in native byte ordering,
4429 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004430*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004431#if (SIZEOF_LONG == 8)
4432# define FAST_CHAR_MASK 0x8000800080008000L
4433# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4434#elif (SIZEOF_LONG == 4)
4435# define FAST_CHAR_MASK 0x80008000L
4436# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4437#else
4438# error C 'long' size should be either 4 or 8!
4439#endif
4440
Walter Dörwald69652032004-09-07 20:24:22 +00004441PyObject *
4442PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 Py_ssize_t size,
4444 const char *errors,
4445 int *byteorder,
4446 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004447{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004449 Py_ssize_t startinpos;
4450 Py_ssize_t endinpos;
4451 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 PyUnicodeObject *unicode;
4453 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004454 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004455 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004456 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004457 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004458 /* Offsets from q for retrieving byte pairs in the right order. */
4459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4460 int ihi = 1, ilo = 0;
4461#else
4462 int ihi = 0, ilo = 1;
4463#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 PyObject *errorHandler = NULL;
4465 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466
4467 /* Note: size will always be longer than the resulting Unicode
4468 character count */
4469 unicode = _PyUnicode_New(size);
4470 if (!unicode)
4471 return NULL;
4472 if (size == 0)
4473 return (PyObject *)unicode;
4474
4475 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004476 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004477 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004478 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479
4480 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004481 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004483 /* Check for BOM marks (U+FEFF) in the input and adjust current
4484 byte order setting accordingly. In native mode, the leading BOM
4485 mark is skipped, in all other modes, it is copied to the output
4486 stream as-is (giving a ZWNBSP character). */
4487 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004488 if (size >= 2) {
4489 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004490#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 if (bom == 0xFEFF) {
4492 q += 2;
4493 bo = -1;
4494 }
4495 else if (bom == 0xFFFE) {
4496 q += 2;
4497 bo = 1;
4498 }
Tim Petersced69f82003-09-16 20:30:58 +00004499#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 if (bom == 0xFEFF) {
4501 q += 2;
4502 bo = 1;
4503 }
4504 else if (bom == 0xFFFE) {
4505 q += 2;
4506 bo = -1;
4507 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004508#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511
Tim Peters772747b2001-08-09 22:21:55 +00004512 if (bo == -1) {
4513 /* force LE */
4514 ihi = 1;
4515 ilo = 0;
4516 }
4517 else if (bo == 1) {
4518 /* force BE */
4519 ihi = 0;
4520 ilo = 1;
4521 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004522#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4523 native_ordering = ilo < ihi;
4524#else
4525 native_ordering = ilo > ihi;
4526#endif
Tim Peters772747b2001-08-09 22:21:55 +00004527
Antoine Pitrouab868312009-01-10 15:40:25 +00004528 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004529 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004531 /* First check for possible aligned read of a C 'long'. Unaligned
4532 reads are more expensive, better to defer to another iteration. */
4533 if (!((size_t) q & LONG_PTR_MASK)) {
4534 /* Fast path for runs of non-surrogate chars. */
4535 register const unsigned char *_q = q;
4536 Py_UNICODE *_p = p;
4537 if (native_ordering) {
4538 /* Native ordering is simple: as long as the input cannot
4539 possibly contain a surrogate char, do an unrolled copy
4540 of several 16-bit code points to the target object.
4541 The non-surrogate check is done on several input bytes
4542 at a time (as many as a C 'long' can contain). */
4543 while (_q < aligned_end) {
4544 unsigned long data = * (unsigned long *) _q;
4545 if (data & FAST_CHAR_MASK)
4546 break;
4547 _p[0] = ((unsigned short *) _q)[0];
4548 _p[1] = ((unsigned short *) _q)[1];
4549#if (SIZEOF_LONG == 8)
4550 _p[2] = ((unsigned short *) _q)[2];
4551 _p[3] = ((unsigned short *) _q)[3];
4552#endif
4553 _q += SIZEOF_LONG;
4554 _p += SIZEOF_LONG / 2;
4555 }
4556 }
4557 else {
4558 /* Byteswapped ordering is similar, but we must decompose
4559 the copy bytewise, and take care of zero'ing out the
4560 upper bytes if the target object is in 32-bit units
4561 (that is, in UCS-4 builds). */
4562 while (_q < aligned_end) {
4563 unsigned long data = * (unsigned long *) _q;
4564 if (data & SWAPPED_FAST_CHAR_MASK)
4565 break;
4566 /* Zero upper bytes in UCS-4 builds */
4567#if (Py_UNICODE_SIZE > 2)
4568 _p[0] = 0;
4569 _p[1] = 0;
4570#if (SIZEOF_LONG == 8)
4571 _p[2] = 0;
4572 _p[3] = 0;
4573#endif
4574#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004575 /* Issue #4916; UCS-4 builds on big endian machines must
4576 fill the two last bytes of each 4-byte unit. */
4577#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4578# define OFF 2
4579#else
4580# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004581#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004582 ((unsigned char *) _p)[OFF + 1] = _q[0];
4583 ((unsigned char *) _p)[OFF + 0] = _q[1];
4584 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4585 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4586#if (SIZEOF_LONG == 8)
4587 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4588 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4589 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4590 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4591#endif
4592#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004593 _q += SIZEOF_LONG;
4594 _p += SIZEOF_LONG / 2;
4595 }
4596 }
4597 p = _p;
4598 q = _q;
4599 if (q >= e)
4600 break;
4601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603
Benjamin Peterson14339b62009-01-31 16:36:08 +00004604 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004605
4606 if (ch < 0xD800 || ch > 0xDFFF) {
4607 *p++ = ch;
4608 continue;
4609 }
4610
4611 /* UTF-16 code pair: */
4612 if (q > e) {
4613 errmsg = "unexpected end of data";
4614 startinpos = (((const char *)q) - 2) - starts;
4615 endinpos = ((const char *)e) + 1 - starts;
4616 goto utf16Error;
4617 }
4618 if (0xD800 <= ch && ch <= 0xDBFF) {
4619 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4620 q += 2;
4621 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004622#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 *p++ = ch;
4624 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004625#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004627#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 continue;
4629 }
4630 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004631 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 startinpos = (((const char *)q)-4)-starts;
4633 endinpos = startinpos+2;
4634 goto utf16Error;
4635 }
4636
Benjamin Peterson14339b62009-01-31 16:36:08 +00004637 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 errmsg = "illegal encoding";
4639 startinpos = (((const char *)q)-2)-starts;
4640 endinpos = startinpos+2;
4641 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004642
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 utf16Error:
4644 outpos = p - PyUnicode_AS_UNICODE(unicode);
4645 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004646 errors,
4647 &errorHandler,
4648 "utf16", errmsg,
4649 &starts,
4650 (const char **)&e,
4651 &startinpos,
4652 &endinpos,
4653 &exc,
4654 (const char **)&q,
4655 &unicode,
4656 &outpos,
4657 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004660 /* remaining byte at the end? (size should be even) */
4661 if (e == q) {
4662 if (!consumed) {
4663 errmsg = "truncated data";
4664 startinpos = ((const char *)q) - starts;
4665 endinpos = ((const char *)e) + 1 - starts;
4666 outpos = p - PyUnicode_AS_UNICODE(unicode);
4667 if (unicode_decode_call_errorhandler(
4668 errors,
4669 &errorHandler,
4670 "utf16", errmsg,
4671 &starts,
4672 (const char **)&e,
4673 &startinpos,
4674 &endinpos,
4675 &exc,
4676 (const char **)&q,
4677 &unicode,
4678 &outpos,
4679 &p))
4680 goto onError;
4681 /* The remaining input chars are ignored if the callback
4682 chooses to skip the input */
4683 }
4684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686 if (byteorder)
4687 *byteorder = bo;
4688
Walter Dörwald69652032004-09-07 20:24:22 +00004689 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004691
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 goto onError;
4695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 Py_XDECREF(errorHandler);
4697 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004698 if (PyUnicode_READY(unicode) == -1) {
4699 Py_DECREF(unicode);
4700 return NULL;
4701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return (PyObject *)unicode;
4703
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004706 Py_XDECREF(errorHandler);
4707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 return NULL;
4709}
4710
Antoine Pitrouab868312009-01-10 15:40:25 +00004711#undef FAST_CHAR_MASK
4712#undef SWAPPED_FAST_CHAR_MASK
4713
Tim Peters772747b2001-08-09 22:21:55 +00004714PyObject *
4715PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 Py_ssize_t size,
4717 const char *errors,
4718 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004720 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004721 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004722 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004723#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004724 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004725#else
4726 const int pairs = 0;
4727#endif
Tim Peters772747b2001-08-09 22:21:55 +00004728 /* Offsets from p for storing byte pairs in the right order. */
4729#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4730 int ihi = 1, ilo = 0;
4731#else
4732 int ihi = 0, ilo = 1;
4733#endif
4734
Benjamin Peterson29060642009-01-31 22:14:21 +00004735#define STORECHAR(CH) \
4736 do { \
4737 p[ihi] = ((CH) >> 8) & 0xff; \
4738 p[ilo] = (CH) & 0xff; \
4739 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004740 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004742#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004743 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 if (s[i] >= 0x10000)
4745 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004746#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004747 /* 2 * (size + pairs + (byteorder == 0)) */
4748 if (size > PY_SSIZE_T_MAX ||
4749 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004751 nsize = size + pairs + (byteorder == 0);
4752 bytesize = nsize * 2;
4753 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004754 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004755 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 if (v == NULL)
4757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004759 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004762 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004763 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004764
4765 if (byteorder == -1) {
4766 /* force LE */
4767 ihi = 1;
4768 ilo = 0;
4769 }
4770 else if (byteorder == 1) {
4771 /* force BE */
4772 ihi = 0;
4773 ilo = 1;
4774 }
4775
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004776 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 Py_UNICODE ch = *s++;
4778 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004779#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (ch >= 0x10000) {
4781 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4782 ch = 0xD800 | ((ch-0x10000) >> 10);
4783 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004784#endif
Tim Peters772747b2001-08-09 22:21:55 +00004785 STORECHAR(ch);
4786 if (ch2)
4787 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004788 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004789
4790 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004791 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004792#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
Alexander Belopolsky40018472011-02-26 01:02:56 +00004795PyObject *
4796PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
4798 if (!PyUnicode_Check(unicode)) {
4799 PyErr_BadArgument();
4800 return NULL;
4801 }
4802 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 PyUnicode_GET_SIZE(unicode),
4804 NULL,
4805 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806}
4807
4808/* --- Unicode Escape Codec ----------------------------------------------- */
4809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4811 if all the escapes in the string make it still a valid ASCII string.
4812 Returns -1 if any escapes were found which cause the string to
4813 pop out of ASCII range. Otherwise returns the length of the
4814 required buffer to hold the string.
4815 */
4816Py_ssize_t
4817length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4818{
4819 const unsigned char *p = (const unsigned char *)s;
4820 const unsigned char *end = p + size;
4821 Py_ssize_t length = 0;
4822
4823 if (size < 0)
4824 return -1;
4825
4826 for (; p < end; ++p) {
4827 if (*p > 127) {
4828 /* Non-ASCII */
4829 return -1;
4830 }
4831 else if (*p != '\\') {
4832 /* Normal character */
4833 ++length;
4834 }
4835 else {
4836 /* Backslash-escape, check next char */
4837 ++p;
4838 /* Escape sequence reaches till end of string or
4839 non-ASCII follow-up. */
4840 if (p >= end || *p > 127)
4841 return -1;
4842 switch (*p) {
4843 case '\n':
4844 /* backslash + \n result in zero characters */
4845 break;
4846 case '\\': case '\'': case '\"':
4847 case 'b': case 'f': case 't':
4848 case 'n': case 'r': case 'v': case 'a':
4849 ++length;
4850 break;
4851 case '0': case '1': case '2': case '3':
4852 case '4': case '5': case '6': case '7':
4853 case 'x': case 'u': case 'U': case 'N':
4854 /* these do not guarantee ASCII characters */
4855 return -1;
4856 default:
4857 /* count the backslash + the other character */
4858 length += 2;
4859 }
4860 }
4861 }
4862 return length;
4863}
4864
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: when kind is PyUnicode_WCHAR_KIND the
   buffer is indexed as Py_UNICODE, otherwise as unsigned char.
   Each argument is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value into a Py_UNICODE buffer.  The macro asserts on a variable
   named `kind` that must be in scope at the expansion site.  It is
   wrapped in do { } while (0) so that it behaves as a single statement,
   like WRITE_ASCII_OR_WSTR, instead of a bare comma expression. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
4878
4879
Fredrik Lundh06d12682001-01-24 07:59:11 +00004880static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004881
Alexander Belopolsky40018472011-02-26 01:02:56 +00004882PyObject *
4883PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004884 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004888 Py_ssize_t startinpos;
4889 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004894 char* message;
4895 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 PyObject *errorHandler = NULL;
4897 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 Py_ssize_t ascii_length;
4899 Py_ssize_t i;
4900 int kind;
4901 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903 ascii_length = length_of_escaped_ascii_string(s, size);
4904
4905 /* After length_of_escaped_ascii_string() there are two alternatives,
4906 either the string is pure ASCII with named escapes like \n, etc.
4907 and we determined it's exact size (common case)
4908 or it contains \x, \u, ... escape sequences. then we create a
4909 legacy wchar string and resize it at the end of this function. */
4910 if (ascii_length >= 0) {
4911 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4912 if (!v)
4913 goto onError;
4914 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4915 kind = PyUnicode_1BYTE_KIND;
4916 data = PyUnicode_DATA(v);
4917 }
4918 else {
4919 /* Escaped strings will always be longer than the resulting
4920 Unicode string, so we start with size here and then reduce the
4921 length after conversion to the true value.
4922 (but if the error callback returns a long replacement string
4923 we'll have to allocate more space) */
4924 v = _PyUnicode_New(size);
4925 if (!v)
4926 goto onError;
4927 kind = PyUnicode_WCHAR_KIND;
4928 data = PyUnicode_AS_UNICODE(v);
4929 }
4930
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 if (size == 0)
4932 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004933 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004935
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 while (s < end) {
4937 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004938 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004941 if (kind == PyUnicode_WCHAR_KIND) {
4942 assert(i < _PyUnicode_WSTR_LENGTH(v));
4943 }
4944 else {
4945 /* The only case in which i == ascii_length is a backslash
4946 followed by a newline. */
4947 assert(i <= ascii_length);
4948 }
4949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 /* Non-escape characters are interpreted as Unicode ordinals */
4951 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004952 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 continue;
4954 }
4955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 /* \ - Escapes */
4958 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004959 c = *s++;
4960 if (s > end)
4961 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004962
4963 if (kind == PyUnicode_WCHAR_KIND) {
4964 assert(i < _PyUnicode_WSTR_LENGTH(v));
4965 }
4966 else {
4967 /* The only case in which i == ascii_length is a backslash
4968 followed by a newline. */
4969 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4970 }
4971
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004972 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004976 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4977 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4978 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4979 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4980 /* FF */
4981 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4982 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4983 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4984 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4985 /* VT */
4986 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4987 /* BEL, not classic C */
4988 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 case '0': case '1': case '2': case '3':
4992 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004993 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004994 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004995 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004996 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004997 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004999 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 break;
5001
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 /* hex escapes */
5003 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005005 digits = 2;
5006 message = "truncated \\xXX escape";
5007 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005011 digits = 4;
5012 message = "truncated \\uXXXX escape";
5013 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005016 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005017 digits = 8;
5018 message = "truncated \\UXXXXXXXX escape";
5019 hexescape:
5020 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005021 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 if (s+digits>end) {
5023 endinpos = size;
5024 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 errors, &errorHandler,
5026 "unicodeescape", "end of string in escape sequence",
5027 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005028 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 goto nextByte;
5032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005033 for (j = 0; j < digits; ++j) {
5034 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005035 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 endinpos = (s+j+1)-starts;
5037 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 errors, &errorHandler,
5040 "unicodeescape", message,
5041 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005042 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005043 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005046 }
5047 chr = (chr<<4) & ~0xF;
5048 if (c >= '0' && c <= '9')
5049 chr += c - '0';
5050 else if (c >= 'a' && c <= 'f')
5051 chr += 10 + c - 'a';
5052 else
5053 chr += 10 + c - 'A';
5054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005056 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 /* _decoding_error will have already written into the
5058 target buffer. */
5059 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005060 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005061 /* when we get here, chr is a 32-bit unicode character */
5062 if (chr <= 0xffff)
5063 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005064 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005065 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005066 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005067 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005068#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005070#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005071 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5073 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005074#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005075 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005077 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 errors, &errorHandler,
5080 "unicodeescape", "illegal Unicode character",
5081 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005082 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005083 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005084 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005085 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005086 break;
5087
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005089 case 'N':
5090 message = "malformed \\N character escape";
5091 if (ucnhash_CAPI == NULL) {
5092 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005093 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5094 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005095 if (ucnhash_CAPI == NULL)
5096 goto ucnhashError;
5097 }
5098 if (*s == '{') {
5099 const char *start = s+1;
5100 /* look for the closing brace */
5101 while (*s != '}' && s < end)
5102 s++;
5103 if (s > start && s < end && *s == '}') {
5104 /* found a name. look it up in the unicode database */
5105 message = "unknown Unicode character name";
5106 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005107 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5108 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005109 goto store;
5110 }
5111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 errors, &errorHandler,
5116 "unicodeescape", message,
5117 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005118 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005119 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005121 break;
5122
5123 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005124 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005125 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 message = "\\ at end of string";
5127 s--;
5128 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 errors, &errorHandler,
5132 "unicodeescape", message,
5133 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005135 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005137 }
5138 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5140 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005141 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005142 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005147 /* Ensure the length prediction worked in case of ASCII strings */
5148 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5149
5150 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5151 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005153 Py_XDECREF(errorHandler);
5154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005156
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005158 PyErr_SetString(
5159 PyExc_UnicodeError,
5160 "\\N escapes not supported (can't load unicodedata module)"
5161 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005162 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 Py_XDECREF(errorHandler);
5164 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005165 return NULL;
5166
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005169 Py_XDECREF(errorHandler);
5170 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 return NULL;
5172}
5173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005174#undef WRITE_ASCII_OR_WSTR
5175#undef WRITE_WSTR
5176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177/* Return a Unicode-Escape string version of the Unicode object.
5178
5179 If quotes is true, the string is enclosed in u"" or u'' quotes as
5180 appropriate.
5181
5182*/
5183
Walter Dörwald79e913e2007-05-12 11:08:06 +00005184static const char *hexdigits = "0123456789abcdef";
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
5187PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005188 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005190 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005193#ifdef Py_UNICODE_WIDE
5194 const Py_ssize_t expandsize = 10;
5195#else
5196 const Py_ssize_t expandsize = 6;
5197#endif
5198
Thomas Wouters89f507f2006-12-13 04:49:30 +00005199 /* XXX(nnorwitz): rather than over-allocating, it would be
5200 better to choose a different scheme. Perhaps scan the
5201 first N-chars of the string and allocate based on that size.
5202 */
5203 /* Initial allocation is based on the longest-possible unichr
5204 escape.
5205
5206 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5207 unichr, so in this case it's the longest unichr escape. In
5208 narrow (UTF-16) builds this is five chars per source unichr
5209 since there are two unichrs in the surrogate pair, so in narrow
5210 (UTF-16) builds it's not the longest unichr escape.
5211
5212 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5213 so in the narrow (UTF-16) build case it's the longest unichr
5214 escape.
5215 */
5216
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005217 if (size == 0)
5218 return PyBytes_FromStringAndSize(NULL, 0);
5219
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005220 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005222
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005223 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 2
5225 + expandsize*size
5226 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 if (repr == NULL)
5228 return NULL;
5229
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005230 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 while (size-- > 0) {
5233 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005234
Walter Dörwald79e913e2007-05-12 11:08:06 +00005235 /* Escape backslashes */
5236 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 *p++ = '\\';
5238 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005239 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005240 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005241
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005242#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005243 /* Map 21-bit characters to '\U00xxxxxx' */
5244 else if (ch >= 0x10000) {
5245 *p++ = '\\';
5246 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005247 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5248 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5249 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5250 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5251 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5252 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5253 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5254 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005256 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005257#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5259 else if (ch >= 0xD800 && ch < 0xDC00) {
5260 Py_UNICODE ch2;
5261 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005262
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 ch2 = *s++;
5264 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005265 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5267 *p++ = '\\';
5268 *p++ = 'U';
5269 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5270 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5271 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5272 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5273 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5274 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5275 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5276 *p++ = hexdigits[ucs & 0x0000000F];
5277 continue;
5278 }
5279 /* Fall through: isolated surrogates are copied as-is */
5280 s--;
5281 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005282 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005283#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005284
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005286 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 *p++ = '\\';
5288 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005289 *p++ = hexdigits[(ch >> 12) & 0x000F];
5290 *p++ = hexdigits[(ch >> 8) & 0x000F];
5291 *p++ = hexdigits[(ch >> 4) & 0x000F];
5292 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005294
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005295 /* Map special whitespace to '\t', \n', '\r' */
5296 else if (ch == '\t') {
5297 *p++ = '\\';
5298 *p++ = 't';
5299 }
5300 else if (ch == '\n') {
5301 *p++ = '\\';
5302 *p++ = 'n';
5303 }
5304 else if (ch == '\r') {
5305 *p++ = '\\';
5306 *p++ = 'r';
5307 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005308
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005309 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005310 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005312 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005313 *p++ = hexdigits[(ch >> 4) & 0x000F];
5314 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005315 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005316
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 /* Copy everything else as-is */
5318 else
5319 *p++ = (char) ch;
5320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005322 assert(p - PyBytes_AS_STRING(repr) > 0);
5323 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5324 return NULL;
5325 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326}
5327
Alexander Belopolsky40018472011-02-26 01:02:56 +00005328PyObject *
5329PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005331 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 if (!PyUnicode_Check(unicode)) {
5333 PyErr_BadArgument();
5334 return NULL;
5335 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005336 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5337 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005338 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339}
5340
/* --- Raw Unicode Escape Codec ------------------------------------------- */
5342
Alexander Belopolsky40018472011-02-26 01:02:56 +00005343PyObject *
5344PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005345 Py_ssize_t size,
5346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005349 Py_ssize_t startinpos;
5350 Py_ssize_t endinpos;
5351 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 const char *end;
5355 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 PyObject *errorHandler = NULL;
5357 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005358
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 /* Escaped strings will always be longer than the resulting
5360 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 length after conversion to the true value. (But decoding error
5362 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 v = _PyUnicode_New(size);
5364 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 end = s + size;
5370 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 unsigned char c;
5372 Py_UCS4 x;
5373 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005374 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 /* Non-escape characters are interpreted as Unicode ordinals */
5377 if (*s != '\\') {
5378 *p++ = (unsigned char)*s++;
5379 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 startinpos = s-starts;
5382
5383 /* \u-escapes are only interpreted iff the number of leading
5384 backslashes if odd */
5385 bs = s;
5386 for (;s < end;) {
5387 if (*s != '\\')
5388 break;
5389 *p++ = (unsigned char)*s++;
5390 }
5391 if (((s - bs) & 1) == 0 ||
5392 s >= end ||
5393 (*s != 'u' && *s != 'U')) {
5394 continue;
5395 }
5396 p--;
5397 count = *s=='u' ? 4 : 8;
5398 s++;
5399
5400 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5401 outpos = p-PyUnicode_AS_UNICODE(v);
5402 for (x = 0, i = 0; i < count; ++i, ++s) {
5403 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005404 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 endinpos = s-starts;
5406 if (unicode_decode_call_errorhandler(
5407 errors, &errorHandler,
5408 "rawunicodeescape", "truncated \\uXXXX",
5409 &starts, &end, &startinpos, &endinpos, &exc, &s,
5410 &v, &outpos, &p))
5411 goto onError;
5412 goto nextByte;
5413 }
5414 x = (x<<4) & ~0xF;
5415 if (c >= '0' && c <= '9')
5416 x += c - '0';
5417 else if (c >= 'a' && c <= 'f')
5418 x += 10 + c - 'a';
5419 else
5420 x += 10 + c - 'A';
5421 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005422 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 /* UCS-2 character */
5424 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005425 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 /* UCS-4 character. Either store directly, or as
5427 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005428#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005430#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 x -= 0x10000L;
5432 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5433 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005434#endif
5435 } else {
5436 endinpos = s-starts;
5437 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005438 if (unicode_decode_call_errorhandler(
5439 errors, &errorHandler,
5440 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 &starts, &end, &startinpos, &endinpos, &exc, &s,
5442 &v, &outpos, &p))
5443 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005444 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 nextByte:
5446 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005448 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 Py_XDECREF(errorHandler);
5451 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 if (PyUnicode_READY(v) == -1) {
5453 Py_DECREF(v);
5454 return NULL;
5455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return NULL;
5463}
5464
Alexander Belopolsky40018472011-02-26 01:02:56 +00005465PyObject *
5466PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005467 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 char *p;
5471 char *q;
5472
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005473#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005474 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005475#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005477#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005478
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005479 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005481
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005482 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 if (repr == NULL)
5484 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005485 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005486 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005488 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 while (size-- > 0) {
5490 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005491#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 /* Map 32-bit characters to '\Uxxxxxxxx' */
5493 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005494 *p++ = '\\';
5495 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005496 *p++ = hexdigits[(ch >> 28) & 0xf];
5497 *p++ = hexdigits[(ch >> 24) & 0xf];
5498 *p++ = hexdigits[(ch >> 20) & 0xf];
5499 *p++ = hexdigits[(ch >> 16) & 0xf];
5500 *p++ = hexdigits[(ch >> 12) & 0xf];
5501 *p++ = hexdigits[(ch >> 8) & 0xf];
5502 *p++ = hexdigits[(ch >> 4) & 0xf];
5503 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005504 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005505 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005506#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5508 if (ch >= 0xD800 && ch < 0xDC00) {
5509 Py_UNICODE ch2;
5510 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005511
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 ch2 = *s++;
5513 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005514 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5516 *p++ = '\\';
5517 *p++ = 'U';
5518 *p++ = hexdigits[(ucs >> 28) & 0xf];
5519 *p++ = hexdigits[(ucs >> 24) & 0xf];
5520 *p++ = hexdigits[(ucs >> 20) & 0xf];
5521 *p++ = hexdigits[(ucs >> 16) & 0xf];
5522 *p++ = hexdigits[(ucs >> 12) & 0xf];
5523 *p++ = hexdigits[(ucs >> 8) & 0xf];
5524 *p++ = hexdigits[(ucs >> 4) & 0xf];
5525 *p++ = hexdigits[ucs & 0xf];
5526 continue;
5527 }
5528 /* Fall through: isolated surrogates are copied as-is */
5529 s--;
5530 size++;
5531 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005532#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 /* Map 16-bit characters to '\uxxxx' */
5534 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 *p++ = '\\';
5536 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005537 *p++ = hexdigits[(ch >> 12) & 0xf];
5538 *p++ = hexdigits[(ch >> 8) & 0xf];
5539 *p++ = hexdigits[(ch >> 4) & 0xf];
5540 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 /* Copy everything else as-is */
5543 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 *p++ = (char) ch;
5545 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005546 size = p - q;
5547
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005548 assert(size > 0);
5549 if (_PyBytes_Resize(&repr, size) < 0)
5550 return NULL;
5551 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552}
5553
Alexander Belopolsky40018472011-02-26 01:02:56 +00005554PyObject *
5555PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005557 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005559 PyErr_BadArgument();
5560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005562 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5563 PyUnicode_GET_SIZE(unicode));
5564
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005565 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566}
5567
/* --- Unicode Internal Codec ------------------------------------------- */
5569
Alexander Belopolsky40018472011-02-26 01:02:56 +00005570PyObject *
5571_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005572 Py_ssize_t size,
5573 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005574{
5575 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005576 Py_ssize_t startinpos;
5577 Py_ssize_t endinpos;
5578 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005579 PyUnicodeObject *v;
5580 Py_UNICODE *p;
5581 const char *end;
5582 const char *reason;
5583 PyObject *errorHandler = NULL;
5584 PyObject *exc = NULL;
5585
Neal Norwitzd43069c2006-01-08 01:12:10 +00005586#ifdef Py_UNICODE_WIDE
5587 Py_UNICODE unimax = PyUnicode_GetMax();
5588#endif
5589
Thomas Wouters89f507f2006-12-13 04:49:30 +00005590 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005591 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5592 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5595 as string was created with the old API. */
5596 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005598 p = PyUnicode_AS_UNICODE(v);
5599 end = s + size;
5600
5601 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005602 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005603 /* We have to sanity check the raw data, otherwise doom looms for
5604 some malformed UCS-4 data. */
5605 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005606#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005607 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005608#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005609 end-s < Py_UNICODE_SIZE
5610 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005612 startinpos = s - starts;
5613 if (end-s < Py_UNICODE_SIZE) {
5614 endinpos = end-starts;
5615 reason = "truncated input";
5616 }
5617 else {
5618 endinpos = s - starts + Py_UNICODE_SIZE;
5619 reason = "illegal code point (> 0x10FFFF)";
5620 }
5621 outpos = p - PyUnicode_AS_UNICODE(v);
5622 if (unicode_decode_call_errorhandler(
5623 errors, &errorHandler,
5624 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005625 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005626 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005627 goto onError;
5628 }
5629 }
5630 else {
5631 p++;
5632 s += Py_UNICODE_SIZE;
5633 }
5634 }
5635
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005636 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005637 goto onError;
5638 Py_XDECREF(errorHandler);
5639 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 if (PyUnicode_READY(v) == -1) {
5641 Py_DECREF(v);
5642 return NULL;
5643 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005644 return (PyObject *)v;
5645
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005647 Py_XDECREF(v);
5648 Py_XDECREF(errorHandler);
5649 Py_XDECREF(exc);
5650 return NULL;
5651}
5652
/* --- Latin-1 Codec ------------------------------------------------------ */
5654
Alexander Belopolsky40018472011-02-26 01:02:56 +00005655PyObject *
5656PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005657 Py_ssize_t size,
5658 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005661 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662}
5663
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005665static void
5666make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005667 const char *encoding,
5668 const Py_UNICODE *unicode, Py_ssize_t size,
5669 Py_ssize_t startpos, Py_ssize_t endpos,
5670 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 *exceptionObject = PyUnicodeEncodeError_Create(
5674 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 }
5676 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5678 goto onError;
5679 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5680 goto onError;
5681 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5682 goto onError;
5683 return;
5684 onError:
5685 Py_DECREF(*exceptionObject);
5686 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 }
5688}
5689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005691static void
5692raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005693 const char *encoding,
5694 const Py_UNICODE *unicode, Py_ssize_t size,
5695 Py_ssize_t startpos, Py_ssize_t endpos,
5696 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697{
5698 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702}
5703
5704/* error handling callback helper:
5705 build arguments, call the callback and check the arguments,
5706 put the result into newpos and return the replacement string, which
5707 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005708static PyObject *
5709unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005710 PyObject **errorHandler,
5711 const char *encoding, const char *reason,
5712 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5713 Py_ssize_t startpos, Py_ssize_t endpos,
5714 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005716 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005717
5718 PyObject *restuple;
5719 PyObject *resunicode;
5720
5721 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 }
5726
5727 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731
5732 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005737 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 Py_DECREF(restuple);
5739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005741 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 &resunicode, newpos)) {
5743 Py_DECREF(restuple);
5744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005746 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5747 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5748 Py_DECREF(restuple);
5749 return NULL;
5750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005753 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5755 Py_DECREF(restuple);
5756 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_INCREF(resunicode);
5759 Py_DECREF(restuple);
5760 return resunicode;
5761}
5762
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763static PyObject *
5764unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005765 Py_ssize_t size,
5766 const char *errors,
5767 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768{
5769 /* output object */
5770 PyObject *res;
5771 /* pointers to the beginning and end+1 of input */
5772 const Py_UNICODE *startp = p;
5773 const Py_UNICODE *endp = p + size;
5774 /* pointer to the beginning of the unencodable characters */
5775 /* const Py_UNICODE *badp = NULL; */
5776 /* pointer into the output */
5777 char *str;
5778 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005780 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5781 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 PyObject *errorHandler = NULL;
5783 PyObject *exc = NULL;
5784 /* the following variable is used for caching string comparisons
5785 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5786 int known_errorHandler = -1;
5787
5788 /* allocate enough for a simple encoding without
5789 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005790 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005791 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005792 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005794 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005795 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 ressize = size;
5797
5798 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* can we encode this? */
5802 if (c<limit) {
5803 /* no overflow check, because we know that the space is enough */
5804 *str++ = (char)c;
5805 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 else {
5808 Py_ssize_t unicodepos = p-startp;
5809 Py_ssize_t requiredsize;
5810 PyObject *repunicode;
5811 Py_ssize_t repsize;
5812 Py_ssize_t newpos;
5813 Py_ssize_t respos;
5814 Py_UNICODE *uni2;
5815 /* startpos for collecting unencodable chars */
5816 const Py_UNICODE *collstart = p;
5817 const Py_UNICODE *collend = p;
5818 /* find all unecodable characters */
5819 while ((collend < endp) && ((*collend)>=limit))
5820 ++collend;
5821 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5822 if (known_errorHandler==-1) {
5823 if ((errors==NULL) || (!strcmp(errors, "strict")))
5824 known_errorHandler = 1;
5825 else if (!strcmp(errors, "replace"))
5826 known_errorHandler = 2;
5827 else if (!strcmp(errors, "ignore"))
5828 known_errorHandler = 3;
5829 else if (!strcmp(errors, "xmlcharrefreplace"))
5830 known_errorHandler = 4;
5831 else
5832 known_errorHandler = 0;
5833 }
5834 switch (known_errorHandler) {
5835 case 1: /* strict */
5836 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5837 goto onError;
5838 case 2: /* replace */
5839 while (collstart++<collend)
5840 *str++ = '?'; /* fall through */
5841 case 3: /* ignore */
5842 p = collend;
5843 break;
5844 case 4: /* xmlcharrefreplace */
5845 respos = str - PyBytes_AS_STRING(res);
5846 /* determine replacement size (temporarily (mis)uses p) */
5847 for (p = collstart, repsize = 0; p < collend; ++p) {
5848 if (*p<10)
5849 repsize += 2+1+1;
5850 else if (*p<100)
5851 repsize += 2+2+1;
5852 else if (*p<1000)
5853 repsize += 2+3+1;
5854 else if (*p<10000)
5855 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005856#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 else
5858 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005859#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 else if (*p<100000)
5861 repsize += 2+5+1;
5862 else if (*p<1000000)
5863 repsize += 2+6+1;
5864 else
5865 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005866#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 }
5868 requiredsize = respos+repsize+(endp-collend);
5869 if (requiredsize > ressize) {
5870 if (requiredsize<2*ressize)
5871 requiredsize = 2*ressize;
5872 if (_PyBytes_Resize(&res, requiredsize))
5873 goto onError;
5874 str = PyBytes_AS_STRING(res) + respos;
5875 ressize = requiredsize;
5876 }
5877 /* generate replacement (temporarily (mis)uses p) */
5878 for (p = collstart; p < collend; ++p) {
5879 str += sprintf(str, "&#%d;", (int)*p);
5880 }
5881 p = collend;
5882 break;
5883 default:
5884 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5885 encoding, reason, startp, size, &exc,
5886 collstart-startp, collend-startp, &newpos);
5887 if (repunicode == NULL)
5888 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005889 if (PyBytes_Check(repunicode)) {
5890 /* Directly copy bytes result to output. */
5891 repsize = PyBytes_Size(repunicode);
5892 if (repsize > 1) {
5893 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005894 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005895 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5896 Py_DECREF(repunicode);
5897 goto onError;
5898 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005899 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005900 ressize += repsize-1;
5901 }
5902 memcpy(str, PyBytes_AsString(repunicode), repsize);
5903 str += repsize;
5904 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005905 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005906 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005907 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 /* need more space? (at least enough for what we
5909 have+the replacement+the rest of the string, so
5910 we won't have to check space for encodable characters) */
5911 respos = str - PyBytes_AS_STRING(res);
5912 repsize = PyUnicode_GET_SIZE(repunicode);
5913 requiredsize = respos+repsize+(endp-collend);
5914 if (requiredsize > ressize) {
5915 if (requiredsize<2*ressize)
5916 requiredsize = 2*ressize;
5917 if (_PyBytes_Resize(&res, requiredsize)) {
5918 Py_DECREF(repunicode);
5919 goto onError;
5920 }
5921 str = PyBytes_AS_STRING(res) + respos;
5922 ressize = requiredsize;
5923 }
5924 /* check if there is anything unencodable in the replacement
5925 and copy it to the output */
5926 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5927 c = *uni2;
5928 if (c >= limit) {
5929 raise_encode_exception(&exc, encoding, startp, size,
5930 unicodepos, unicodepos+1, reason);
5931 Py_DECREF(repunicode);
5932 goto onError;
5933 }
5934 *str = (char)c;
5935 }
5936 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005937 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005939 }
5940 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005941 /* Resize if we allocated to much */
5942 size = str - PyBytes_AS_STRING(res);
5943 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005944 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005945 if (_PyBytes_Resize(&res, size) < 0)
5946 goto onError;
5947 }
5948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 Py_XDECREF(errorHandler);
5950 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005951 return res;
5952
5953 onError:
5954 Py_XDECREF(res);
5955 Py_XDECREF(errorHandler);
5956 Py_XDECREF(exc);
5957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958}
5959
Alexander Belopolsky40018472011-02-26 01:02:56 +00005960PyObject *
5961PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005962 Py_ssize_t size,
5963 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966}
5967
Alexander Belopolsky40018472011-02-26 01:02:56 +00005968PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005969_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970{
5971 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 PyErr_BadArgument();
5973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 if (PyUnicode_READY(unicode) == -1)
5976 return NULL;
5977 /* Fast path: if it is a one-byte string, construct
5978 bytes object directly. */
5979 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5980 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5981 PyUnicode_GET_LENGTH(unicode));
5982 /* Non-Latin-1 characters present. Defer to above function to
5983 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 errors);
5987}
5988
5989PyObject*
5990PyUnicode_AsLatin1String(PyObject *unicode)
5991{
5992 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993}
5994
5995/* --- 7-bit ASCII Codec -------------------------------------------------- */
5996
Alexander Belopolsky40018472011-02-26 01:02:56 +00005997PyObject *
5998PyUnicode_DecodeASCII(const char *s,
5999 Py_ssize_t size,
6000 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 PyUnicodeObject *v;
6004 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 Py_ssize_t startinpos;
6006 Py_ssize_t endinpos;
6007 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006009 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 PyObject *errorHandler = NULL;
6011 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006013
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006015 if (size == 1 && *(unsigned char*)s < 128)
6016 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6017
6018 /* Fast path. Assume the input actually *is* ASCII, and allocate
6019 a single-block Unicode object with that assumption. If there is
6020 an error, drop the object and start over. */
6021 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6022 if (v == NULL)
6023 goto onError;
6024 d = PyUnicode_1BYTE_DATA(v);
6025 for (i = 0; i < size; i++) {
6026 unsigned char ch = ((unsigned char*)s)[i];
6027 if (ch < 128)
6028 d[i] = ch;
6029 else
6030 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006032 if (i == size)
6033 return (PyObject*)v;
6034 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006035
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 v = _PyUnicode_New(size);
6037 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 e = s + size;
6043 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 register unsigned char c = (unsigned char)*s;
6045 if (c < 128) {
6046 *p++ = c;
6047 ++s;
6048 }
6049 else {
6050 startinpos = s-starts;
6051 endinpos = startinpos + 1;
6052 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6053 if (unicode_decode_call_errorhandler(
6054 errors, &errorHandler,
6055 "ascii", "ordinal not in range(128)",
6056 &starts, &e, &startinpos, &endinpos, &exc, &s,
6057 &v, &outpos, &p))
6058 goto onError;
6059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006061 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 Py_XDECREF(errorHandler);
6065 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006066 if (PyUnicode_READY(v) == -1) {
6067 Py_DECREF(v);
6068 return NULL;
6069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 Py_XDECREF(errorHandler);
6075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 return NULL;
6077}
6078
Alexander Belopolsky40018472011-02-26 01:02:56 +00006079PyObject *
6080PyUnicode_EncodeASCII(const Py_UNICODE *p,
6081 Py_ssize_t size,
6082 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006088_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089{
6090 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 PyErr_BadArgument();
6092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006094 if (PyUnicode_READY(unicode) == -1)
6095 return NULL;
6096 /* Fast path: if it is an ASCII-only string, construct bytes object
6097 directly. Else defer to above function to raise the exception. */
6098 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6099 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6100 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006103 errors);
6104}
6105
6106PyObject *
6107PyUnicode_AsASCIIString(PyObject *unicode)
6108{
6109 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110}
6111
Victor Stinner99b95382011-07-04 14:23:54 +02006112#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006113
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006114/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006115
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006116#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006117#define NEED_RETRY
6118#endif
6119
6120/* XXX This code is limited to "true" double-byte encodings, as
6121 a) it assumes an incomplete character consists of a single byte, and
6122 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006124
/* Return non-zero when the byte at s[offset] should be treated as a
   DBCS lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back (it returns `curr` itself), when the
   previous character does not start with a lead byte, or when the
   previous character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6136
6137/*
6138 * Decode MBCS string into unicode object. If 'final' is set, converts
6139 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6140 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141static int
6142decode_mbcs(PyUnicodeObject **v,
6143 const char *s, /* MBCS string */
6144 int size, /* sizeof MBCS string */
6145 int final,
6146 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006147{
6148 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006149 Py_ssize_t n;
6150 DWORD usize;
6151 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006152
6153 assert(size >= 0);
6154
Victor Stinner554f3f02010-06-16 23:33:54 +00006155 /* check and handle 'errors' arg */
6156 if (errors==NULL || strcmp(errors, "strict")==0)
6157 flags = MB_ERR_INVALID_CHARS;
6158 else if (strcmp(errors, "ignore")==0)
6159 flags = 0;
6160 else {
6161 PyErr_Format(PyExc_ValueError,
6162 "mbcs encoding does not support errors='%s'",
6163 errors);
6164 return -1;
6165 }
6166
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006167 /* Skip trailing lead-byte unless 'final' is set */
6168 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006170
6171 /* First get the size of the result */
6172 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006173 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6174 if (usize==0)
6175 goto mbcs_decode_error;
6176 } else
6177 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006178
6179 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 /* Create unicode object */
6181 *v = _PyUnicode_New(usize);
6182 if (*v == NULL)
6183 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006184 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006185 }
6186 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 /* Extend unicode object */
6188 n = PyUnicode_GET_SIZE(*v);
6189 if (_PyUnicode_Resize(v, n + usize) < 0)
6190 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006191 }
6192
6193 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006194 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006196 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6197 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006199 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006200 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006201
6202mbcs_decode_error:
6203 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6204 we raise a UnicodeDecodeError - else it is a 'generic'
6205 windows error
6206 */
6207 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6208 /* Ideally, we should get reason from FormatMessage - this
6209 is the Windows 2000 English version of the message
6210 */
6211 PyObject *exc = NULL;
6212 const char *reason = "No mapping for the Unicode character exists "
6213 "in the target multi-byte code page.";
6214 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6215 if (exc != NULL) {
6216 PyCodec_StrictErrors(exc);
6217 Py_DECREF(exc);
6218 }
6219 } else {
6220 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6221 }
6222 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006223}
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
6226PyUnicode_DecodeMBCSStateful(const char *s,
6227 Py_ssize_t size,
6228 const char *errors,
6229 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006230{
6231 PyUnicodeObject *v = NULL;
6232 int done;
6233
6234 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006236
6237#ifdef NEED_RETRY
6238 retry:
6239 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006240 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006241 else
6242#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006243 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244
6245 if (done < 0) {
6246 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006248 }
6249
6250 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006252
6253#ifdef NEED_RETRY
6254 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 s += done;
6256 size -= done;
6257 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006258 }
6259#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006260 if (PyUnicode_READY(v) == -1) {
6261 Py_DECREF(v);
6262 return NULL;
6263 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006264 return (PyObject *)v;
6265}
6266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
6268PyUnicode_DecodeMBCS(const char *s,
6269 Py_ssize_t size,
6270 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006271{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006272 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6273}
6274
6275/*
6276 * Convert unicode into string object (MBCS).
6277 * Returns 0 if succeed, -1 otherwise.
6278 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006279static int
6280encode_mbcs(PyObject **repr,
6281 const Py_UNICODE *p, /* unicode */
6282 int size, /* size of unicode */
6283 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006284{
Victor Stinner554f3f02010-06-16 23:33:54 +00006285 BOOL usedDefaultChar = FALSE;
6286 BOOL *pusedDefaultChar;
6287 int mbcssize;
6288 Py_ssize_t n;
6289 PyObject *exc = NULL;
6290 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006291
6292 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006293
Victor Stinner554f3f02010-06-16 23:33:54 +00006294 /* check and handle 'errors' arg */
6295 if (errors==NULL || strcmp(errors, "strict")==0) {
6296 flags = WC_NO_BEST_FIT_CHARS;
6297 pusedDefaultChar = &usedDefaultChar;
6298 } else if (strcmp(errors, "replace")==0) {
6299 flags = 0;
6300 pusedDefaultChar = NULL;
6301 } else {
6302 PyErr_Format(PyExc_ValueError,
6303 "mbcs encoding does not support errors='%s'",
6304 errors);
6305 return -1;
6306 }
6307
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006308 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006309 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006310 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6311 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 if (mbcssize == 0) {
6313 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6314 return -1;
6315 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006316 /* If we used a default char, then we failed! */
6317 if (pusedDefaultChar && *pusedDefaultChar)
6318 goto mbcs_encode_error;
6319 } else {
6320 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006321 }
6322
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006323 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 /* Create string object */
6325 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6326 if (*repr == NULL)
6327 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006328 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006329 }
6330 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 /* Extend string object */
6332 n = PyBytes_Size(*repr);
6333 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6334 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006335 }
6336
6337 /* Do the conversion */
6338 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006340 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6341 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6343 return -1;
6344 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006345 if (pusedDefaultChar && *pusedDefaultChar)
6346 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006347 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006349
6350mbcs_encode_error:
6351 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6352 Py_XDECREF(exc);
6353 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006354}
6355
Alexander Belopolsky40018472011-02-26 01:02:56 +00006356PyObject *
6357PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6358 Py_ssize_t size,
6359 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006360{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 PyObject *repr = NULL;
6362 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006363
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006364#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006366 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006367 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368 else
6369#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006370 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006371
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006372 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 Py_XDECREF(repr);
6374 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006375 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006376
6377#ifdef NEED_RETRY
6378 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 p += INT_MAX;
6380 size -= INT_MAX;
6381 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006382 }
6383#endif
6384
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006385 return repr;
6386}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006387
Alexander Belopolsky40018472011-02-26 01:02:56 +00006388PyObject *
6389PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006390{
6391 if (!PyUnicode_Check(unicode)) {
6392 PyErr_BadArgument();
6393 return NULL;
6394 }
6395 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 PyUnicode_GET_SIZE(unicode),
6397 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006398}
6399
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006400#undef NEED_RETRY
6401
Victor Stinner99b95382011-07-04 14:23:54 +02006402#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006403
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404/* --- Character Mapping Codec -------------------------------------------- */
6405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
6407PyUnicode_DecodeCharmap(const char *s,
6408 Py_ssize_t size,
6409 PyObject *mapping,
6410 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006413 Py_ssize_t startinpos;
6414 Py_ssize_t endinpos;
6415 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 PyUnicodeObject *v;
6418 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006419 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 PyObject *errorHandler = NULL;
6421 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006422 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006423 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 /* Default to Latin-1 */
6426 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428
6429 v = _PyUnicode_New(size);
6430 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006436 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 mapstring = PyUnicode_AS_UNICODE(mapping);
6438 maplen = PyUnicode_GET_SIZE(mapping);
6439 while (s < e) {
6440 unsigned char ch = *s;
6441 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 if (ch < maplen)
6444 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 if (x == 0xfffe) {
6447 /* undefined mapping */
6448 outpos = p-PyUnicode_AS_UNICODE(v);
6449 startinpos = s-starts;
6450 endinpos = startinpos+1;
6451 if (unicode_decode_call_errorhandler(
6452 errors, &errorHandler,
6453 "charmap", "character maps to <undefined>",
6454 &starts, &e, &startinpos, &endinpos, &exc, &s,
6455 &v, &outpos, &p)) {
6456 goto onError;
6457 }
6458 continue;
6459 }
6460 *p++ = x;
6461 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006463 }
6464 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 while (s < e) {
6466 unsigned char ch = *s;
6467 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6470 w = PyLong_FromLong((long)ch);
6471 if (w == NULL)
6472 goto onError;
6473 x = PyObject_GetItem(mapping, w);
6474 Py_DECREF(w);
6475 if (x == NULL) {
6476 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6477 /* No mapping found means: mapping is undefined. */
6478 PyErr_Clear();
6479 x = Py_None;
6480 Py_INCREF(x);
6481 } else
6482 goto onError;
6483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* Apply mapping */
6486 if (PyLong_Check(x)) {
6487 long value = PyLong_AS_LONG(x);
6488 if (value < 0 || value > 65535) {
6489 PyErr_SetString(PyExc_TypeError,
6490 "character mapping must be in range(65536)");
6491 Py_DECREF(x);
6492 goto onError;
6493 }
6494 *p++ = (Py_UNICODE)value;
6495 }
6496 else if (x == Py_None) {
6497 /* undefined mapping */
6498 outpos = p-PyUnicode_AS_UNICODE(v);
6499 startinpos = s-starts;
6500 endinpos = startinpos+1;
6501 if (unicode_decode_call_errorhandler(
6502 errors, &errorHandler,
6503 "charmap", "character maps to <undefined>",
6504 &starts, &e, &startinpos, &endinpos, &exc, &s,
6505 &v, &outpos, &p)) {
6506 Py_DECREF(x);
6507 goto onError;
6508 }
6509 Py_DECREF(x);
6510 continue;
6511 }
6512 else if (PyUnicode_Check(x)) {
6513 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006514
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 if (targetsize == 1)
6516 /* 1-1 mapping */
6517 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006518
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 else if (targetsize > 1) {
6520 /* 1-n mapping */
6521 if (targetsize > extrachars) {
6522 /* resize first */
6523 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6524 Py_ssize_t needed = (targetsize - extrachars) + \
6525 (targetsize << 2);
6526 extrachars += needed;
6527 /* XXX overflow detection missing */
6528 if (_PyUnicode_Resize(&v,
6529 PyUnicode_GET_SIZE(v) + needed) < 0) {
6530 Py_DECREF(x);
6531 goto onError;
6532 }
6533 p = PyUnicode_AS_UNICODE(v) + oldpos;
6534 }
6535 Py_UNICODE_COPY(p,
6536 PyUnicode_AS_UNICODE(x),
6537 targetsize);
6538 p += targetsize;
6539 extrachars -= targetsize;
6540 }
6541 /* 1-0 mapping: skip the character */
6542 }
6543 else {
6544 /* wrong return value */
6545 PyErr_SetString(PyExc_TypeError,
6546 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006547 Py_DECREF(x);
6548 goto onError;
6549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 Py_DECREF(x);
6551 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 }
6554 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6556 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006557 Py_XDECREF(errorHandler);
6558 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006559 if (PyUnicode_READY(v) == -1) {
6560 Py_DECREF(v);
6561 return NULL;
6562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006564
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 Py_XDECREF(errorHandler);
6567 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 Py_XDECREF(v);
6569 return NULL;
6570}
6571
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006572/* Charmap encoding: the lookup table */
6573
Alexander Belopolsky40018472011-02-26 01:02:56 +00006574struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 PyObject_HEAD
6576 unsigned char level1[32];
6577 int count2, count3;
6578 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006579};
6580
6581static PyObject*
6582encoding_map_size(PyObject *obj, PyObject* args)
6583{
6584 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006587}
6588
6589static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006590 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 PyDoc_STR("Return the size (in bytes) of this object") },
6592 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006593};
6594
6595static void
6596encoding_map_dealloc(PyObject* o)
6597{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006599}
6600
6601static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006602 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 "EncodingMap", /*tp_name*/
6604 sizeof(struct encoding_map), /*tp_basicsize*/
6605 0, /*tp_itemsize*/
6606 /* methods */
6607 encoding_map_dealloc, /*tp_dealloc*/
6608 0, /*tp_print*/
6609 0, /*tp_getattr*/
6610 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006611 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 0, /*tp_repr*/
6613 0, /*tp_as_number*/
6614 0, /*tp_as_sequence*/
6615 0, /*tp_as_mapping*/
6616 0, /*tp_hash*/
6617 0, /*tp_call*/
6618 0, /*tp_str*/
6619 0, /*tp_getattro*/
6620 0, /*tp_setattro*/
6621 0, /*tp_as_buffer*/
6622 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6623 0, /*tp_doc*/
6624 0, /*tp_traverse*/
6625 0, /*tp_clear*/
6626 0, /*tp_richcompare*/
6627 0, /*tp_weaklistoffset*/
6628 0, /*tp_iter*/
6629 0, /*tp_iternext*/
6630 encoding_map_methods, /*tp_methods*/
6631 0, /*tp_members*/
6632 0, /*tp_getset*/
6633 0, /*tp_base*/
6634 0, /*tp_dict*/
6635 0, /*tp_descr_get*/
6636 0, /*tp_descr_set*/
6637 0, /*tp_dictoffset*/
6638 0, /*tp_init*/
6639 0, /*tp_alloc*/
6640 0, /*tp_new*/
6641 0, /*tp_free*/
6642 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006643};
6644
6645PyObject*
6646PyUnicode_BuildEncodingMap(PyObject* string)
6647{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006648 PyObject *result;
6649 struct encoding_map *mresult;
6650 int i;
6651 int need_dict = 0;
6652 unsigned char level1[32];
6653 unsigned char level2[512];
6654 unsigned char *mlevel1, *mlevel2, *mlevel3;
6655 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656 int kind;
6657 void *data;
6658 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006661 PyErr_BadArgument();
6662 return NULL;
6663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 kind = PyUnicode_KIND(string);
6665 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006666 memset(level1, 0xFF, sizeof level1);
6667 memset(level2, 0xFF, sizeof level2);
6668
6669 /* If there isn't a one-to-one mapping of NULL to \0,
6670 or if there are non-BMP characters, we need to use
6671 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006673 need_dict = 1;
6674 for (i = 1; i < 256; i++) {
6675 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676 ch = PyUnicode_READ(kind, data, i);
6677 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006678 need_dict = 1;
6679 break;
6680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006681 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006682 /* unmapped character */
6683 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006684 l1 = ch >> 11;
6685 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006686 if (level1[l1] == 0xFF)
6687 level1[l1] = count2++;
6688 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006689 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006690 }
6691
6692 if (count2 >= 0xFF || count3 >= 0xFF)
6693 need_dict = 1;
6694
6695 if (need_dict) {
6696 PyObject *result = PyDict_New();
6697 PyObject *key, *value;
6698 if (!result)
6699 return NULL;
6700 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006701 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006702 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006703 if (!key || !value)
6704 goto failed1;
6705 if (PyDict_SetItem(result, key, value) == -1)
6706 goto failed1;
6707 Py_DECREF(key);
6708 Py_DECREF(value);
6709 }
6710 return result;
6711 failed1:
6712 Py_XDECREF(key);
6713 Py_XDECREF(value);
6714 Py_DECREF(result);
6715 return NULL;
6716 }
6717
6718 /* Create a three-level trie */
6719 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6720 16*count2 + 128*count3 - 1);
6721 if (!result)
6722 return PyErr_NoMemory();
6723 PyObject_Init(result, &EncodingMapType);
6724 mresult = (struct encoding_map*)result;
6725 mresult->count2 = count2;
6726 mresult->count3 = count3;
6727 mlevel1 = mresult->level1;
6728 mlevel2 = mresult->level23;
6729 mlevel3 = mresult->level23 + 16*count2;
6730 memcpy(mlevel1, level1, 32);
6731 memset(mlevel2, 0xFF, 16*count2);
6732 memset(mlevel3, 0, 128*count3);
6733 count3 = 0;
6734 for (i = 1; i < 256; i++) {
6735 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006736 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006737 /* unmapped character */
6738 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006739 o1 = PyUnicode_READ(kind, data, i)>>11;
6740 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006741 i2 = 16*mlevel1[o1] + o2;
6742 if (mlevel2[i2] == 0xFF)
6743 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006745 i3 = 128*mlevel2[i2] + o3;
6746 mlevel3[i3] = i;
6747 }
6748 return result;
6749}
6750
6751static int
6752encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6753{
6754 struct encoding_map *map = (struct encoding_map*)mapping;
6755 int l1 = c>>11;
6756 int l2 = (c>>7) & 0xF;
6757 int l3 = c & 0x7F;
6758 int i;
6759
6760#ifdef Py_UNICODE_WIDE
6761 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006763 }
6764#endif
6765 if (c == 0)
6766 return 0;
6767 /* level 1*/
6768 i = map->level1[l1];
6769 if (i == 0xFF) {
6770 return -1;
6771 }
6772 /* level 2*/
6773 i = map->level23[16*i+l2];
6774 if (i == 0xFF) {
6775 return -1;
6776 }
6777 /* level 3 */
6778 i = map->level23[16*map->count2 + 128*i + l3];
6779 if (i == 0) {
6780 return -1;
6781 }
6782 return i;
6783}
6784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006785/* Lookup the character ch in the mapping. If the character
6786 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006787 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006788static PyObject *
6789charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
Christian Heimes217cfd12007-12-02 14:31:20 +00006791 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006792 PyObject *x;
6793
6794 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796 x = PyObject_GetItem(mapping, w);
6797 Py_DECREF(w);
6798 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6800 /* No mapping found means: mapping is undefined. */
6801 PyErr_Clear();
6802 x = Py_None;
6803 Py_INCREF(x);
6804 return x;
6805 } else
6806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006808 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006810 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 long value = PyLong_AS_LONG(x);
6812 if (value < 0 || value > 255) {
6813 PyErr_SetString(PyExc_TypeError,
6814 "character mapping must be in range(256)");
6815 Py_DECREF(x);
6816 return NULL;
6817 }
6818 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006820 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 /* wrong return value */
6824 PyErr_Format(PyExc_TypeError,
6825 "character mapping must return integer, bytes or None, not %.400s",
6826 x->ob_type->tp_name);
6827 Py_DECREF(x);
6828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 }
6830}
6831
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006832static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006833charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006834{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006835 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6836 /* exponentially overallocate to minimize reallocations */
6837 if (requiredsize < 2*outsize)
6838 requiredsize = 2*outsize;
6839 if (_PyBytes_Resize(outobj, requiredsize))
6840 return -1;
6841 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006842}
6843
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was encoded and written to the output */
    enc_FAILED,     /* mapping is undefined for the character; nothing written */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006848 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849 space is available. Return a new reference to the object that
6850 was put in the output buffer, or Py_None, if the mapping was undefined
6851 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006852 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853static charmapencode_result
6854charmapencode_output(Py_UNICODE c, PyObject *mapping,
6855 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006857 PyObject *rep;
6858 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006859 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006860
Christian Heimes90aa7642007-12-19 02:45:37 +00006861 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006862 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 if (res == -1)
6865 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 if (outsize<requiredsize)
6867 if (charmapencode_resize(outobj, outpos, requiredsize))
6868 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006869 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 outstart[(*outpos)++] = (char)res;
6871 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006872 }
6873
6874 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006877 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 Py_DECREF(rep);
6879 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006880 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 if (PyLong_Check(rep)) {
6882 Py_ssize_t requiredsize = *outpos+1;
6883 if (outsize<requiredsize)
6884 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6885 Py_DECREF(rep);
6886 return enc_EXCEPTION;
6887 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006888 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 else {
6892 const char *repchars = PyBytes_AS_STRING(rep);
6893 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6894 Py_ssize_t requiredsize = *outpos+repsize;
6895 if (outsize<requiredsize)
6896 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6897 Py_DECREF(rep);
6898 return enc_EXCEPTION;
6899 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006900 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 memcpy(outstart + *outpos, repchars, repsize);
6902 *outpos += repsize;
6903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006905 Py_DECREF(rep);
6906 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907}
6908
6909/* handle an error in PyUnicode_EncodeCharmap
6910 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911static int
6912charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006913 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006915 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006916 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917{
6918 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t repsize;
6920 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921 Py_UNICODE *uni2;
6922 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006923 Py_ssize_t collstartpos = *inpos;
6924 Py_ssize_t collendpos = *inpos+1;
6925 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 char *encoding = "charmap";
6927 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006928 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 /* find all unencodable characters */
6931 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006932 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006933 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 int res = encoding_map_lookup(p[collendpos], mapping);
6935 if (res != -1)
6936 break;
6937 ++collendpos;
6938 continue;
6939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 rep = charmapencode_lookup(p[collendpos], mapping);
6942 if (rep==NULL)
6943 return -1;
6944 else if (rep!=Py_None) {
6945 Py_DECREF(rep);
6946 break;
6947 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006948 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 }
6951 /* cache callback name lookup
6952 * (if not done yet, i.e. it's the first error) */
6953 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 if ((errors==NULL) || (!strcmp(errors, "strict")))
6955 *known_errorHandler = 1;
6956 else if (!strcmp(errors, "replace"))
6957 *known_errorHandler = 2;
6958 else if (!strcmp(errors, "ignore"))
6959 *known_errorHandler = 3;
6960 else if (!strcmp(errors, "xmlcharrefreplace"))
6961 *known_errorHandler = 4;
6962 else
6963 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 }
6965 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006966 case 1: /* strict */
6967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6968 return -1;
6969 case 2: /* replace */
6970 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 x = charmapencode_output('?', mapping, res, respos);
6972 if (x==enc_EXCEPTION) {
6973 return -1;
6974 }
6975 else if (x==enc_FAILED) {
6976 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6977 return -1;
6978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006979 }
6980 /* fall through */
6981 case 3: /* ignore */
6982 *inpos = collendpos;
6983 break;
6984 case 4: /* xmlcharrefreplace */
6985 /* generate replacement (temporarily (mis)uses p) */
6986 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 char buffer[2+29+1+1];
6988 char *cp;
6989 sprintf(buffer, "&#%d;", (int)p[collpos]);
6990 for (cp = buffer; *cp; ++cp) {
6991 x = charmapencode_output(*cp, mapping, res, respos);
6992 if (x==enc_EXCEPTION)
6993 return -1;
6994 else if (x==enc_FAILED) {
6995 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6996 return -1;
6997 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006998 }
6999 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000 *inpos = collendpos;
7001 break;
7002 default:
7003 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 encoding, reason, p, size, exceptionObject,
7005 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007006 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007008 if (PyBytes_Check(repunicode)) {
7009 /* Directly copy bytes result to output. */
7010 Py_ssize_t outsize = PyBytes_Size(*res);
7011 Py_ssize_t requiredsize;
7012 repsize = PyBytes_Size(repunicode);
7013 requiredsize = *respos + repsize;
7014 if (requiredsize > outsize)
7015 /* Make room for all additional bytes. */
7016 if (charmapencode_resize(res, respos, requiredsize)) {
7017 Py_DECREF(repunicode);
7018 return -1;
7019 }
7020 memcpy(PyBytes_AsString(*res) + *respos,
7021 PyBytes_AsString(repunicode), repsize);
7022 *respos += repsize;
7023 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007024 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007025 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027 /* generate replacement */
7028 repsize = PyUnicode_GET_SIZE(repunicode);
7029 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 x = charmapencode_output(*uni2, mapping, res, respos);
7031 if (x==enc_EXCEPTION) {
7032 return -1;
7033 }
7034 else if (x==enc_FAILED) {
7035 Py_DECREF(repunicode);
7036 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7037 return -1;
7038 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007039 }
7040 *inpos = newpos;
7041 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 }
7043 return 0;
7044}
7045
Alexander Belopolsky40018472011-02-26 01:02:56 +00007046PyObject *
7047PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7048 Py_ssize_t size,
7049 PyObject *mapping,
7050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 /* output object */
7053 PyObject *res = NULL;
7054 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007055 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007057 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058 PyObject *errorHandler = NULL;
7059 PyObject *exc = NULL;
7060 /* the following variable is used for caching string comparisons
7061 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7062 * 3=ignore, 4=xmlcharrefreplace */
7063 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064
7065 /* Default to Latin-1 */
7066 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007069 /* allocate enough for a simple encoding without
7070 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007071 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 if (res == NULL)
7073 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007074 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 /* try to encode it */
7079 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7080 if (x==enc_EXCEPTION) /* error */
7081 goto onError;
7082 if (x==enc_FAILED) { /* unencodable character */
7083 if (charmap_encoding_error(p, size, &inpos, mapping,
7084 &exc,
7085 &known_errorHandler, &errorHandler, errors,
7086 &res, &respos)) {
7087 goto onError;
7088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007089 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 else
7091 /* done with this character => adjust input position */
7092 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007096 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007097 if (_PyBytes_Resize(&res, respos) < 0)
7098 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007099
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 Py_XDECREF(exc);
7101 Py_XDECREF(errorHandler);
7102 return res;
7103
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007105 Py_XDECREF(res);
7106 Py_XDECREF(exc);
7107 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 return NULL;
7109}
7110
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111PyObject *
7112PyUnicode_AsCharmapString(PyObject *unicode,
7113 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114{
7115 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 PyErr_BadArgument();
7117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 }
7119 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 PyUnicode_GET_SIZE(unicode),
7121 mapping,
7122 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123}
7124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007125/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007126static void
7127make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129 Py_ssize_t startpos, Py_ssize_t endpos,
7130 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007132 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007133 *exceptionObject = _PyUnicodeTranslateError_Create(
7134 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 }
7136 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7138 goto onError;
7139 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7140 goto onError;
7141 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7142 goto onError;
7143 return;
7144 onError:
7145 Py_DECREF(*exceptionObject);
7146 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 }
7148}
7149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007150/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151static void
7152raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007153 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007154 Py_ssize_t startpos, Py_ssize_t endpos,
7155 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007156{
7157 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007158 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007159 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161}
7162
7163/* error handling callback helper:
7164 build arguments, call the callback and check the arguments,
7165 put the result into newpos and return the replacement string, which
7166 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static PyObject *
7168unicode_translate_call_errorhandler(const char *errors,
7169 PyObject **errorHandler,
7170 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007171 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007172 Py_ssize_t startpos, Py_ssize_t endpos,
7173 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007174{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007175 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007177 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178 PyObject *restuple;
7179 PyObject *resunicode;
7180
7181 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007183 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185 }
7186
7187 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007188 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191
7192 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007196 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007197 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 Py_DECREF(restuple);
7199 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200 }
7201 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 &resunicode, &i_newpos)) {
7203 Py_DECREF(restuple);
7204 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007206 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007207 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 else
7209 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007210 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7212 Py_DECREF(restuple);
7213 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215 Py_INCREF(resunicode);
7216 Py_DECREF(restuple);
7217 return resunicode;
7218}
7219
7220/* Lookup the character ch in the mapping and put the result in result,
7221 which must be decrefed by the caller.
7222 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007223static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225{
Christian Heimes217cfd12007-12-02 14:31:20 +00007226 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227 PyObject *x;
7228
7229 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 x = PyObject_GetItem(mapping, w);
7232 Py_DECREF(w);
7233 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7235 /* No mapping found means: use 1:1 mapping. */
7236 PyErr_Clear();
7237 *result = NULL;
7238 return 0;
7239 } else
7240 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241 }
7242 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 *result = x;
7244 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007246 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 long value = PyLong_AS_LONG(x);
7248 long max = PyUnicode_GetMax();
7249 if (value < 0 || value > max) {
7250 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007251 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 Py_DECREF(x);
7253 return -1;
7254 }
7255 *result = x;
7256 return 0;
7257 }
7258 else if (PyUnicode_Check(x)) {
7259 *result = x;
7260 return 0;
7261 }
7262 else {
7263 /* wrong return value */
7264 PyErr_SetString(PyExc_TypeError,
7265 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007266 Py_DECREF(x);
7267 return -1;
7268 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269}
7270/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 if not reallocate and adjust various state variables.
7272 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007273static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007274charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007277 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007278 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 /* exponentially overallocate to minimize reallocations */
7280 if (requiredsize < 2 * oldsize)
7281 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7283 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 }
7287 return 0;
7288}
7289/* lookup the character, put the result in the output string and adjust
7290 various state variables. Return a new reference to the object that
7291 was put in the output buffer in *result, or Py_None, if the mapping was
7292 undefined (in which case no character was written).
7293 The called must decref result.
7294 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007295static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7297 PyObject *mapping, Py_UCS4 **output,
7298 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007299 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007301 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7302 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007304 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007306 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 }
7308 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007310 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007313 }
7314 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 Py_ssize_t repsize;
7316 if (PyUnicode_READY(*res) == -1)
7317 return -1;
7318 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 if (repsize==1) {
7320 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007321 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 }
7323 else if (repsize!=0) {
7324 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 Py_ssize_t requiredsize = *opos +
7326 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328 Py_ssize_t i;
7329 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007331 for(i = 0; i < repsize; i++)
7332 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007334 }
7335 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007337 return 0;
7338}
7339
Alexander Belopolsky40018472011-02-26 01:02:56 +00007340PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007341_PyUnicode_TranslateCharmap(PyObject *input,
7342 PyObject *mapping,
7343 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007345 /* input object */
7346 char *idata;
7347 Py_ssize_t size, i;
7348 int kind;
7349 /* output buffer */
7350 Py_UCS4 *output = NULL;
7351 Py_ssize_t osize;
7352 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007354 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355 char *reason = "character maps to <undefined>";
7356 PyObject *errorHandler = NULL;
7357 PyObject *exc = NULL;
7358 /* the following variable is used for caching string comparisons
7359 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7360 * 3=ignore, 4=xmlcharrefreplace */
7361 int known_errorHandler = -1;
7362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 PyErr_BadArgument();
7365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368 if (PyUnicode_READY(input) == -1)
7369 return NULL;
7370 idata = (char*)PyUnicode_DATA(input);
7371 kind = PyUnicode_KIND(input);
7372 size = PyUnicode_GET_LENGTH(input);
7373 i = 0;
7374
7375 if (size == 0) {
7376 Py_INCREF(input);
7377 return input;
7378 }
7379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007380 /* allocate enough for a simple 1:1 translation without
7381 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007382 osize = size;
7383 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7384 opos = 0;
7385 if (output == NULL) {
7386 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 /* try to encode it */
7392 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393 if (charmaptranslate_output(input, i, mapping,
7394 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 Py_XDECREF(x);
7396 goto onError;
7397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007398 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007400 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 else { /* untranslatable character */
7402 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7403 Py_ssize_t repsize;
7404 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007405 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407 Py_ssize_t collstart = i;
7408 Py_ssize_t collend = i+1;
7409 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007412 while (collend < size) {
7413 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 goto onError;
7415 Py_XDECREF(x);
7416 if (x!=Py_None)
7417 break;
7418 ++collend;
7419 }
7420 /* cache callback name lookup
7421 * (if not done yet, i.e. it's the first error) */
7422 if (known_errorHandler==-1) {
7423 if ((errors==NULL) || (!strcmp(errors, "strict")))
7424 known_errorHandler = 1;
7425 else if (!strcmp(errors, "replace"))
7426 known_errorHandler = 2;
7427 else if (!strcmp(errors, "ignore"))
7428 known_errorHandler = 3;
7429 else if (!strcmp(errors, "xmlcharrefreplace"))
7430 known_errorHandler = 4;
7431 else
7432 known_errorHandler = 0;
7433 }
7434 switch (known_errorHandler) {
7435 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007436 raise_translate_exception(&exc, input, collstart,
7437 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007438 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 case 2: /* replace */
7440 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 for (coll = collstart; coll<collend; coll++)
7442 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 /* fall through */
7444 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 break;
7447 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 /* generate replacement (temporarily (mis)uses i) */
7449 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 char buffer[2+29+1+1];
7451 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7453 if (charmaptranslate_makespace(&output, &osize,
7454 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 goto onError;
7456 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 break;
7461 default:
7462 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 reason, input, &exc,
7464 collstart, collend, &newpos);
7465 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
7467 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468 repsize = PyUnicode_GET_LENGTH(repunicode);
7469 if (charmaptranslate_makespace(&output, &osize,
7470 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 Py_DECREF(repunicode);
7472 goto onError;
7473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 for (uni2 = 0; repsize-->0; ++uni2)
7475 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7476 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 }
7480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007481 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7482 if (!res)
7483 goto onError;
7484 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485 Py_XDECREF(exc);
7486 Py_XDECREF(errorHandler);
7487 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007490 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007491 Py_XDECREF(exc);
7492 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 return NULL;
7494}
7495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496/* Deprecated. Use PyUnicode_Translate instead. */
7497PyObject *
7498PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7499 Py_ssize_t size,
7500 PyObject *mapping,
7501 const char *errors)
7502{
7503 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7504 if (!unicode)
7505 return NULL;
7506 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7507}
7508
Alexander Belopolsky40018472011-02-26 01:02:56 +00007509PyObject *
7510PyUnicode_Translate(PyObject *str,
7511 PyObject *mapping,
7512 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513{
7514 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007515
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 str = PyUnicode_FromObject(str);
7517 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007519 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 Py_DECREF(str);
7521 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007522
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 Py_XDECREF(str);
7525 return NULL;
7526}
Tim Petersced69f82003-09-16 20:30:58 +00007527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007528static Py_UCS4
7529fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7530{
7531 /* No need to call PyUnicode_READY(self) because this function is only
7532 called as a callback from fixup() which does it already. */
7533 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7534 const int kind = PyUnicode_KIND(self);
7535 void *data = PyUnicode_DATA(self);
7536 Py_UCS4 maxchar = 0, ch, fixed;
7537 Py_ssize_t i;
7538
7539 for (i = 0; i < len; ++i) {
7540 ch = PyUnicode_READ(kind, data, i);
7541 fixed = 0;
7542 if (ch > 127) {
7543 if (Py_UNICODE_ISSPACE(ch))
7544 fixed = ' ';
7545 else {
7546 const int decimal = Py_UNICODE_TODECIMAL(ch);
7547 if (decimal >= 0)
7548 fixed = '0' + decimal;
7549 }
7550 if (fixed != 0) {
7551 if (fixed > maxchar)
7552 maxchar = fixed;
7553 PyUnicode_WRITE(kind, data, i, fixed);
7554 }
7555 else if (ch > maxchar)
7556 maxchar = ch;
7557 }
7558 else if (ch > maxchar)
7559 maxchar = ch;
7560 }
7561
7562 return maxchar;
7563}
7564
7565PyObject *
7566_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7567{
7568 if (!PyUnicode_Check(unicode)) {
7569 PyErr_BadInternalCall();
7570 return NULL;
7571 }
7572 if (PyUnicode_READY(unicode) == -1)
7573 return NULL;
7574 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7575 /* If the string is already ASCII, just return the same string */
7576 Py_INCREF(unicode);
7577 return unicode;
7578 }
7579 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7580}
7581
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007582PyObject *
7583PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7584 Py_ssize_t length)
7585{
7586 PyObject *result;
7587 Py_UNICODE *p; /* write pointer into result */
7588 Py_ssize_t i;
7589 /* Copy to a new string */
7590 result = (PyObject *)_PyUnicode_New(length);
7591 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7592 if (result == NULL)
7593 return result;
7594 p = PyUnicode_AS_UNICODE(result);
7595 /* Iterate over code points */
7596 for (i = 0; i < length; i++) {
7597 Py_UNICODE ch =s[i];
7598 if (ch > 127) {
7599 int decimal = Py_UNICODE_TODECIMAL(ch);
7600 if (decimal >= 0)
7601 p[i] = '0' + decimal;
7602 }
7603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007604 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7605 Py_DECREF(result);
7606 return NULL;
7607 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007608 return result;
7609}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007610/* --- Decimal Encoder ---------------------------------------------------- */
7611
Alexander Belopolsky40018472011-02-26 01:02:56 +00007612int
7613PyUnicode_EncodeDecimal(Py_UNICODE *s,
7614 Py_ssize_t length,
7615 char *output,
7616 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007617{
7618 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619 PyObject *errorHandler = NULL;
7620 PyObject *exc = NULL;
7621 const char *encoding = "decimal";
7622 const char *reason = "invalid decimal Unicode string";
7623 /* the following variable is used for caching string comparisons
7624 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7625 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007626
7627 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 PyErr_BadArgument();
7629 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007630 }
7631
7632 p = s;
7633 end = s + length;
7634 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 register Py_UNICODE ch = *p;
7636 int decimal;
7637 PyObject *repunicode;
7638 Py_ssize_t repsize;
7639 Py_ssize_t newpos;
7640 Py_UNICODE *uni2;
7641 Py_UNICODE *collstart;
7642 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007643
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007645 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 ++p;
7647 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 decimal = Py_UNICODE_TODECIMAL(ch);
7650 if (decimal >= 0) {
7651 *output++ = '0' + decimal;
7652 ++p;
7653 continue;
7654 }
7655 if (0 < ch && ch < 256) {
7656 *output++ = (char)ch;
7657 ++p;
7658 continue;
7659 }
7660 /* All other characters are considered unencodable */
7661 collstart = p;
7662 collend = p+1;
7663 while (collend < end) {
7664 if ((0 < *collend && *collend < 256) ||
7665 !Py_UNICODE_ISSPACE(*collend) ||
7666 Py_UNICODE_TODECIMAL(*collend))
7667 break;
7668 }
7669 /* cache callback name lookup
7670 * (if not done yet, i.e. it's the first error) */
7671 if (known_errorHandler==-1) {
7672 if ((errors==NULL) || (!strcmp(errors, "strict")))
7673 known_errorHandler = 1;
7674 else if (!strcmp(errors, "replace"))
7675 known_errorHandler = 2;
7676 else if (!strcmp(errors, "ignore"))
7677 known_errorHandler = 3;
7678 else if (!strcmp(errors, "xmlcharrefreplace"))
7679 known_errorHandler = 4;
7680 else
7681 known_errorHandler = 0;
7682 }
7683 switch (known_errorHandler) {
7684 case 1: /* strict */
7685 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7686 goto onError;
7687 case 2: /* replace */
7688 for (p = collstart; p < collend; ++p)
7689 *output++ = '?';
7690 /* fall through */
7691 case 3: /* ignore */
7692 p = collend;
7693 break;
7694 case 4: /* xmlcharrefreplace */
7695 /* generate replacement (temporarily (mis)uses p) */
7696 for (p = collstart; p < collend; ++p)
7697 output += sprintf(output, "&#%d;", (int)*p);
7698 p = collend;
7699 break;
7700 default:
7701 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7702 encoding, reason, s, length, &exc,
7703 collstart-s, collend-s, &newpos);
7704 if (repunicode == NULL)
7705 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007706 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007707 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007708 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7709 Py_DECREF(repunicode);
7710 goto onError;
7711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 /* generate replacement */
7713 repsize = PyUnicode_GET_SIZE(repunicode);
7714 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7715 Py_UNICODE ch = *uni2;
7716 if (Py_UNICODE_ISSPACE(ch))
7717 *output++ = ' ';
7718 else {
7719 decimal = Py_UNICODE_TODECIMAL(ch);
7720 if (decimal >= 0)
7721 *output++ = '0' + decimal;
7722 else if (0 < ch && ch < 256)
7723 *output++ = (char)ch;
7724 else {
7725 Py_DECREF(repunicode);
7726 raise_encode_exception(&exc, encoding,
7727 s, length, collstart-s, collend-s, reason);
7728 goto onError;
7729 }
7730 }
7731 }
7732 p = s + newpos;
7733 Py_DECREF(repunicode);
7734 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007735 }
7736 /* 0-terminate the output string */
7737 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007738 Py_XDECREF(exc);
7739 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007740 return 0;
7741
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743 Py_XDECREF(exc);
7744 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007745 return -1;
7746}
7747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748/* --- Helpers ------------------------------------------------------------ */
7749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750#include "stringlib/ucs1lib.h"
7751#include "stringlib/fastsearch.h"
7752#include "stringlib/partition.h"
7753#include "stringlib/split.h"
7754#include "stringlib/count.h"
7755#include "stringlib/find.h"
7756#include "stringlib/localeutil.h"
7757#include "stringlib/undef.h"
7758
7759#include "stringlib/ucs2lib.h"
7760#include "stringlib/fastsearch.h"
7761#include "stringlib/partition.h"
7762#include "stringlib/split.h"
7763#include "stringlib/count.h"
7764#include "stringlib/find.h"
7765#include "stringlib/localeutil.h"
7766#include "stringlib/undef.h"
7767
7768#include "stringlib/ucs4lib.h"
7769#include "stringlib/fastsearch.h"
7770#include "stringlib/partition.h"
7771#include "stringlib/split.h"
7772#include "stringlib/count.h"
7773#include "stringlib/find.h"
7774#include "stringlib/localeutil.h"
7775#include "stringlib/undef.h"
7776
7777static Py_ssize_t
7778any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7779 const Py_UCS1*, Py_ssize_t,
7780 Py_ssize_t, Py_ssize_t),
7781 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7782 const Py_UCS2*, Py_ssize_t,
7783 Py_ssize_t, Py_ssize_t),
7784 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7785 const Py_UCS4*, Py_ssize_t,
7786 Py_ssize_t, Py_ssize_t),
7787 PyObject* s1, PyObject* s2,
7788 Py_ssize_t start,
7789 Py_ssize_t end)
7790{
7791 int kind1, kind2, kind;
7792 void *buf1, *buf2;
7793 Py_ssize_t len1, len2, result;
7794
7795 kind1 = PyUnicode_KIND(s1);
7796 kind2 = PyUnicode_KIND(s2);
7797 kind = kind1 > kind2 ? kind1 : kind2;
7798 buf1 = PyUnicode_DATA(s1);
7799 buf2 = PyUnicode_DATA(s2);
7800 if (kind1 != kind)
7801 buf1 = _PyUnicode_AsKind(s1, kind);
7802 if (!buf1)
7803 return -2;
7804 if (kind2 != kind)
7805 buf2 = _PyUnicode_AsKind(s2, kind);
7806 if (!buf2) {
7807 if (kind1 != kind) PyMem_Free(buf1);
7808 return -2;
7809 }
7810 len1 = PyUnicode_GET_LENGTH(s1);
7811 len2 = PyUnicode_GET_LENGTH(s2);
7812
7813 switch(kind) {
7814 case PyUnicode_1BYTE_KIND:
7815 result = ucs1(buf1, len1, buf2, len2, start, end);
7816 break;
7817 case PyUnicode_2BYTE_KIND:
7818 result = ucs2(buf1, len1, buf2, len2, start, end);
7819 break;
7820 case PyUnicode_4BYTE_KIND:
7821 result = ucs4(buf1, len1, buf2, len2, start, end);
7822 break;
7823 default:
7824 assert(0); result = -2;
7825 }
7826
7827 if (kind1 != kind)
7828 PyMem_Free(buf1);
7829 if (kind2 != kind)
7830 PyMem_Free(buf2);
7831
7832 return result;
7833}
7834
7835Py_ssize_t
7836_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7837 Py_ssize_t n_buffer,
7838 void *digits, Py_ssize_t n_digits,
7839 Py_ssize_t min_width,
7840 const char *grouping,
7841 const char *thousands_sep)
7842{
7843 switch(kind) {
7844 case PyUnicode_1BYTE_KIND:
7845 return _PyUnicode_ucs1_InsertThousandsGrouping(
7846 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7847 min_width, grouping, thousands_sep);
7848 case PyUnicode_2BYTE_KIND:
7849 return _PyUnicode_ucs2_InsertThousandsGrouping(
7850 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7851 min_width, grouping, thousands_sep);
7852 case PyUnicode_4BYTE_KIND:
7853 return _PyUnicode_ucs4_InsertThousandsGrouping(
7854 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7855 min_width, grouping, thousands_sep);
7856 }
7857 assert(0);
7858 return -1;
7859}
7860
7861
Eric Smith8c663262007-08-25 02:26:07 +00007862#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007863#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007864
Thomas Wouters477c8d52006-05-27 19:21:47 +00007865#include "stringlib/count.h"
7866#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007867
/* ADJUST_INDICES(start, end, len): normalize a start/end slice pair
   against a sequence of length `len`, modifying `start` and `end` in
   place (both must be assignable lvalues).

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and is then
       floored at 0.

   `start` is NOT clamped against `len`, so callers still have to cope
   with start > end.  The expansion is a bare sequence of if-statements
   (it is not wrapped in do { } while (0)) and `len` is evaluated more
   than once, so use it only as a full statement and only with a
   side-effect-free `len` expression. */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007868/* helper macro to fixup start/end slice values */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007869#define ADJUST_INDICES(start, end, len) \
7870 if (end > len) \
7871 end = len; \
7872 else if (end < 0) { \
7873 end += len; \
7874 if (end < 0) \
7875 end = 0; \
7876 } \
7877 if (start < 0) { \
7878 start += len; \
7879 if (start < 0) \
7880 start = 0; \
7881 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007882
Alexander Belopolsky40018472011-02-26 01:02:56 +00007883Py_ssize_t
7884PyUnicode_Count(PyObject *str,
7885 PyObject *substr,
7886 Py_ssize_t start,
7887 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007889 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007890 PyUnicodeObject* str_obj;
7891 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 int kind1, kind2, kind;
7893 void *buf1 = NULL, *buf2 = NULL;
7894 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007895
Thomas Wouters477c8d52006-05-27 19:21:47 +00007896 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007897 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007899 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 Py_DECREF(str_obj);
7902 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 }
Tim Petersced69f82003-09-16 20:30:58 +00007904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905 kind1 = PyUnicode_KIND(str_obj);
7906 kind2 = PyUnicode_KIND(sub_obj);
7907 kind = kind1 > kind2 ? kind1 : kind2;
7908 buf1 = PyUnicode_DATA(str_obj);
7909 if (kind1 != kind)
7910 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7911 if (!buf1)
7912 goto onError;
7913 buf2 = PyUnicode_DATA(sub_obj);
7914 if (kind2 != kind)
7915 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7916 if (!buf2)
7917 goto onError;
7918 len1 = PyUnicode_GET_LENGTH(str_obj);
7919 len2 = PyUnicode_GET_LENGTH(sub_obj);
7920
7921 ADJUST_INDICES(start, end, len1);
7922 switch(kind) {
7923 case PyUnicode_1BYTE_KIND:
7924 result = ucs1lib_count(
7925 ((Py_UCS1*)buf1) + start, end - start,
7926 buf2, len2, PY_SSIZE_T_MAX
7927 );
7928 break;
7929 case PyUnicode_2BYTE_KIND:
7930 result = ucs2lib_count(
7931 ((Py_UCS2*)buf1) + start, end - start,
7932 buf2, len2, PY_SSIZE_T_MAX
7933 );
7934 break;
7935 case PyUnicode_4BYTE_KIND:
7936 result = ucs4lib_count(
7937 ((Py_UCS4*)buf1) + start, end - start,
7938 buf2, len2, PY_SSIZE_T_MAX
7939 );
7940 break;
7941 default:
7942 assert(0); result = 0;
7943 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007944
7945 Py_DECREF(sub_obj);
7946 Py_DECREF(str_obj);
7947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007948 if (kind1 != kind)
7949 PyMem_Free(buf1);
7950 if (kind2 != kind)
7951 PyMem_Free(buf2);
7952
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007954 onError:
7955 Py_DECREF(sub_obj);
7956 Py_DECREF(str_obj);
7957 if (kind1 != kind && buf1)
7958 PyMem_Free(buf1);
7959 if (kind2 != kind && buf2)
7960 PyMem_Free(buf2);
7961 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962}
7963
Alexander Belopolsky40018472011-02-26 01:02:56 +00007964Py_ssize_t
7965PyUnicode_Find(PyObject *str,
7966 PyObject *sub,
7967 Py_ssize_t start,
7968 Py_ssize_t end,
7969 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007971 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007972
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007976 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 Py_DECREF(str);
7979 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 }
Tim Petersced69f82003-09-16 20:30:58 +00007981
Thomas Wouters477c8d52006-05-27 19:21:47 +00007982 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 result = any_find_slice(
7984 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7985 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007986 );
7987 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 result = any_find_slice(
7989 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7990 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007991 );
7992
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007994 Py_DECREF(sub);
7995
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 return result;
7997}
7998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999Py_ssize_t
8000PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8001 Py_ssize_t start, Py_ssize_t end,
8002 int direction)
8003{
8004 char *result;
8005 int kind;
8006 if (PyUnicode_READY(str) == -1)
8007 return -2;
8008 if (end > PyUnicode_GET_LENGTH(str))
8009 end = PyUnicode_GET_LENGTH(str);
8010 kind = PyUnicode_KIND(str);
8011 result = findchar(PyUnicode_1BYTE_DATA(str)
8012 + PyUnicode_KIND_SIZE(kind, start),
8013 kind,
8014 end-start, ch, direction);
8015 if (!result)
8016 return -1;
8017 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8018}
8019
Alexander Belopolsky40018472011-02-26 01:02:56 +00008020static int
8021tailmatch(PyUnicodeObject *self,
8022 PyUnicodeObject *substring,
8023 Py_ssize_t start,
8024 Py_ssize_t end,
8025 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 int kind_self;
8028 int kind_sub;
8029 void *data_self;
8030 void *data_sub;
8031 Py_ssize_t offset;
8032 Py_ssize_t i;
8033 Py_ssize_t end_sub;
8034
8035 if (PyUnicode_READY(self) == -1 ||
8036 PyUnicode_READY(substring) == -1)
8037 return 0;
8038
8039 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 return 1;
8041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008042 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8043 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008047 kind_self = PyUnicode_KIND(self);
8048 data_self = PyUnicode_DATA(self);
8049 kind_sub = PyUnicode_KIND(substring);
8050 data_sub = PyUnicode_DATA(substring);
8051 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8052
8053 if (direction > 0)
8054 offset = end;
8055 else
8056 offset = start;
8057
8058 if (PyUnicode_READ(kind_self, data_self, offset) ==
8059 PyUnicode_READ(kind_sub, data_sub, 0) &&
8060 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8061 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8062 /* If both are of the same kind, memcmp is sufficient */
8063 if (kind_self == kind_sub) {
8064 return ! memcmp((char *)data_self +
8065 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8066 data_sub,
8067 PyUnicode_GET_LENGTH(substring) *
8068 PyUnicode_CHARACTER_SIZE(substring));
8069 }
8070 /* otherwise we have to compare each character by first accesing it */
8071 else {
8072 /* We do not need to compare 0 and len(substring)-1 because
8073 the if statement above ensured already that they are equal
8074 when we end up here. */
8075 // TODO: honor direction and do a forward or backwards search
8076 for (i = 1; i < end_sub; ++i) {
8077 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8078 PyUnicode_READ(kind_sub, data_sub, i))
8079 return 0;
8080 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
8084
8085 return 0;
8086}
8087
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088Py_ssize_t
8089PyUnicode_Tailmatch(PyObject *str,
8090 PyObject *substr,
8091 Py_ssize_t start,
8092 Py_ssize_t end,
8093 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008095 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008096
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 str = PyUnicode_FromObject(str);
8098 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100 substr = PyUnicode_FromObject(substr);
8101 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_DECREF(str);
8103 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 }
Tim Petersced69f82003-09-16 20:30:58 +00008105
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 (PyUnicodeObject *)substr,
8108 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 Py_DECREF(str);
8110 Py_DECREF(substr);
8111 return result;
8112}
8113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114/* Apply fixfct filter to the Unicode object self and return a
8115 reference to the modified object */
8116
Alexander Belopolsky40018472011-02-26 01:02:56 +00008117static PyObject *
8118fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 PyObject *u;
8122 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 if (PyUnicode_READY(self) == -1)
8125 return NULL;
8126 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8127 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8128 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8133 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 /* fix functions return the new maximum character in a string,
8136 if the kind of the resulting unicode object does not change,
8137 everything is fine. Otherwise we need to change the string kind
8138 and re-run the fix function. */
8139 maxchar_new = fixfct((PyUnicodeObject*)u);
8140 if (maxchar_new == 0)
8141 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8142 else if (maxchar_new <= 127)
8143 maxchar_new = 127;
8144 else if (maxchar_new <= 255)
8145 maxchar_new = 255;
8146 else if (maxchar_new <= 65535)
8147 maxchar_new = 65535;
8148 else
8149 maxchar_new = 1114111; /* 0x10ffff */
8150
8151 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 /* fixfct should return TRUE if it modified the buffer. If
8153 FALSE, return a reference to the original buffer instead
8154 (to save space, not time) */
8155 Py_INCREF(self);
8156 Py_DECREF(u);
8157 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 else if (maxchar_new == maxchar_old) {
8160 return u;
8161 }
8162 else {
8163 /* In case the maximum character changed, we need to
8164 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008165 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 if (v == NULL) {
8167 Py_DECREF(u);
8168 return NULL;
8169 }
8170 if (maxchar_new > maxchar_old) {
8171 /* If the maxchar increased so that the kind changed, not all
8172 characters are representable anymore and we need to fix the
8173 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008174 if (PyUnicode_CopyCharacters(v, 0,
8175 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008176 PyUnicode_GET_LENGTH(self)) < 0)
8177 {
8178 Py_DECREF(u);
8179 return NULL;
8180 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 maxchar_old = fixfct((PyUnicodeObject*)v);
8182 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8183 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008184 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008185 if (PyUnicode_CopyCharacters(v, 0,
8186 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008187 PyUnicode_GET_LENGTH(self)) < 0)
8188 {
8189 Py_DECREF(u);
8190 return NULL;
8191 }
8192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193
8194 Py_DECREF(u);
8195 return v;
8196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197}
8198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008200fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 /* No need to call PyUnicode_READY(self) because this function is only
8203 called as a callback from fixup() which does it already. */
8204 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8205 const int kind = PyUnicode_KIND(self);
8206 void *data = PyUnicode_DATA(self);
8207 int touched = 0;
8208 Py_UCS4 maxchar = 0;
8209 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 for (i = 0; i < len; ++i) {
8212 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8213 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8214 if (up != ch) {
8215 if (up > maxchar)
8216 maxchar = up;
8217 PyUnicode_WRITE(kind, data, i, up);
8218 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 else if (ch > maxchar)
8221 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222 }
8223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 if (touched)
8225 return maxchar;
8226 else
8227 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228}
8229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8234 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8235 const int kind = PyUnicode_KIND(self);
8236 void *data = PyUnicode_DATA(self);
8237 int touched = 0;
8238 Py_UCS4 maxchar = 0;
8239 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 for(i = 0; i < len; ++i) {
8242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8243 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8244 if (lo != ch) {
8245 if (lo > maxchar)
8246 maxchar = lo;
8247 PyUnicode_WRITE(kind, data, i, lo);
8248 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 else if (ch > maxchar)
8251 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
8253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 if (touched)
8255 return maxchar;
8256 else
8257 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258}
8259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8264 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8265 const int kind = PyUnicode_KIND(self);
8266 void *data = PyUnicode_DATA(self);
8267 int touched = 0;
8268 Py_UCS4 maxchar = 0;
8269 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 for(i = 0; i < len; ++i) {
8272 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8273 Py_UCS4 nu = 0;
8274
8275 if (Py_UNICODE_ISUPPER(ch))
8276 nu = Py_UNICODE_TOLOWER(ch);
8277 else if (Py_UNICODE_ISLOWER(ch))
8278 nu = Py_UNICODE_TOUPPER(ch);
8279
8280 if (nu != 0) {
8281 if (nu > maxchar)
8282 maxchar = nu;
8283 PyUnicode_WRITE(kind, data, i, nu);
8284 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 else if (ch > maxchar)
8287 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
8289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 if (touched)
8291 return maxchar;
8292 else
8293 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008297fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8300 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8301 const int kind = PyUnicode_KIND(self);
8302 void *data = PyUnicode_DATA(self);
8303 int touched = 0;
8304 Py_UCS4 maxchar = 0;
8305 Py_ssize_t i = 0;
8306 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008307
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008308 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310
8311 ch = PyUnicode_READ(kind, data, i);
8312 if (!Py_UNICODE_ISUPPER(ch)) {
8313 maxchar = Py_UNICODE_TOUPPER(ch);
8314 PyUnicode_WRITE(kind, data, i, maxchar);
8315 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 ++i;
8318 for(; i < len; ++i) {
8319 ch = PyUnicode_READ(kind, data, i);
8320 if (!Py_UNICODE_ISLOWER(ch)) {
8321 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8322 if (lo > maxchar)
8323 maxchar = lo;
8324 PyUnicode_WRITE(kind, data, i, lo);
8325 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 else if (ch > maxchar)
8328 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330
8331 if (touched)
8332 return maxchar;
8333 else
8334 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335}
8336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8341 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8342 const int kind = PyUnicode_KIND(self);
8343 void *data = PyUnicode_DATA(self);
8344 Py_UCS4 maxchar = 0;
8345 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 int previous_is_cased;
8347
8348 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 if (len == 1) {
8350 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8351 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8352 if (ti != ch) {
8353 PyUnicode_WRITE(kind, data, i, ti);
8354 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 }
8356 else
8357 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 for(; i < len; ++i) {
8361 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8362 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008363
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 nu = Py_UNICODE_TOTITLE(ch);
8368
8369 if (nu > maxchar)
8370 maxchar = nu;
8371 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008372
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 if (Py_UNICODE_ISLOWER(ch) ||
8374 Py_UNICODE_ISUPPER(ch) ||
8375 Py_UNICODE_ISTITLE(ch))
8376 previous_is_cased = 1;
8377 else
8378 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381}
8382
Tim Peters8ce9f162004-08-27 01:49:32 +00008383PyObject *
8384PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008387 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008389 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008390 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8391 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008392 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 Py_ssize_t sz, i, res_offset;
8394 Py_UCS4 maxchar = 0;
8395 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396
Tim Peters05eba1f2004-08-27 21:32:02 +00008397 fseq = PySequence_Fast(seq, "");
8398 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008399 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008400 }
8401
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008402 /* NOTE: the following code can't call back into Python code,
8403 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008404 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008405
Tim Peters05eba1f2004-08-27 21:32:02 +00008406 seqlen = PySequence_Fast_GET_SIZE(fseq);
8407 /* If empty sequence, return u"". */
8408 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008411 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008412 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008413 /* If singleton sequence with an exact Unicode, return that. */
8414 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 item = items[0];
8416 if (PyUnicode_CheckExact(item)) {
8417 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 goto Done;
8420 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008421 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008422 else {
8423 /* Set up sep and seplen */
8424 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 /* fall back to a blank space separator */
8426 sep = PyUnicode_FromOrdinal(' ');
8427 if (!sep || PyUnicode_READY(sep) == -1)
8428 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008429 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008430 else {
8431 if (!PyUnicode_Check(separator)) {
8432 PyErr_Format(PyExc_TypeError,
8433 "separator: expected str instance,"
8434 " %.80s found",
8435 Py_TYPE(separator)->tp_name);
8436 goto onError;
8437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 if (PyUnicode_READY(separator) == -1)
8439 goto onError;
8440 sep = separator;
8441 seplen = PyUnicode_GET_LENGTH(separator);
8442 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8443 /* inc refcount to keep this code path symetric with the
8444 above case of a blank separator */
8445 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008446 }
8447 }
8448
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008449 /* There are at least two things to join, or else we have a subclass
8450 * of str in the sequence.
8451 * Do a pre-pass to figure out the total amount of space we'll
8452 * need (sz), and see whether all argument are strings.
8453 */
8454 sz = 0;
8455 for (i = 0; i < seqlen; i++) {
8456 const Py_ssize_t old_sz = sz;
8457 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 if (!PyUnicode_Check(item)) {
8459 PyErr_Format(PyExc_TypeError,
8460 "sequence item %zd: expected str instance,"
8461 " %.80s found",
8462 i, Py_TYPE(item)->tp_name);
8463 goto onError;
8464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 if (PyUnicode_READY(item) == -1)
8466 goto onError;
8467 sz += PyUnicode_GET_LENGTH(item);
8468 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8469 if (item_maxchar > maxchar)
8470 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008471 if (i != 0)
8472 sz += seplen;
8473 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8474 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008476 goto onError;
8477 }
8478 }
Tim Petersced69f82003-09-16 20:30:58 +00008479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008481 if (res == NULL)
8482 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008483
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008484 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008486 Py_ssize_t itemlen;
8487 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 /* Copy item, and maybe the separator. */
8490 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008491 if (PyUnicode_CopyCharacters(res, res_offset,
8492 sep, 0, seplen) < 0)
8493 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008496 if (PyUnicode_CopyCharacters(res, res_offset,
8497 item, 0, itemlen) < 0)
8498 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008502
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008504 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_XDECREF(sep);
8506 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008509 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008511 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 return NULL;
8513}
8514
/* FILL(kind, data, value, start, length)

   Store `length` copies of the code point `value` into the character
   buffer `data`, beginning at character index `start`.  `kind` selects the
   element width of the buffer (1, 2 or 4 bytes per character) and must not
   be PyUnicode_WCHAR_KIND.  `value` is truncated to the element width, so
   the caller has to pass a value that fits.

   `value` and `length` are evaluated exactly once; all arguments are
   parenthesized so that arbitrary expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 value_ = (value); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < length_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < length_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        } \
    } while (0)
8537
Alexander Belopolsky40018472011-02-26 01:02:56 +00008538static PyUnicodeObject *
8539pad(PyUnicodeObject *self,
8540 Py_ssize_t left,
8541 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 PyObject *u;
8545 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008546 int kind;
8547 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548
8549 if (left < 0)
8550 left = 0;
8551 if (right < 0)
8552 right = 0;
8553
Tim Peters7a29bd52001-09-12 03:03:31 +00008554 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 Py_INCREF(self);
8556 return self;
8557 }
8558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8560 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008561 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8562 return NULL;
8563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8565 if (fill > maxchar)
8566 maxchar = fill;
8567 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008568 if (!u)
8569 return NULL;
8570
8571 kind = PyUnicode_KIND(u);
8572 data = PyUnicode_DATA(u);
8573 if (left)
8574 FILL(kind, data, fill, 0, left);
8575 if (right)
8576 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008577 if (PyUnicode_CopyCharacters(u, left,
8578 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008579 _PyUnicode_LENGTH(self)) < 0)
8580 {
8581 Py_DECREF(u);
8582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 }
8584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Alexander Belopolsky40018472011-02-26 01:02:56 +00008589PyObject *
8590PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
8594 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 switch(PyUnicode_KIND(string)) {
8599 case PyUnicode_1BYTE_KIND:
8600 list = ucs1lib_splitlines(
8601 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8602 PyUnicode_GET_LENGTH(string), keepends);
8603 break;
8604 case PyUnicode_2BYTE_KIND:
8605 list = ucs2lib_splitlines(
8606 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8607 PyUnicode_GET_LENGTH(string), keepends);
8608 break;
8609 case PyUnicode_4BYTE_KIND:
8610 list = ucs4lib_splitlines(
8611 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8612 PyUnicode_GET_LENGTH(string), keepends);
8613 break;
8614 default:
8615 assert(0);
8616 list = 0;
8617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 Py_DECREF(string);
8619 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620}
8621
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622static PyObject *
8623split(PyUnicodeObject *self,
8624 PyUnicodeObject *substring,
8625 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 int kind1, kind2, kind;
8628 void *buf1, *buf2;
8629 Py_ssize_t len1, len2;
8630 PyObject* out;
8631
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008633 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 if (PyUnicode_READY(self) == -1)
8636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 if (substring == NULL)
8639 switch(PyUnicode_KIND(self)) {
8640 case PyUnicode_1BYTE_KIND:
8641 return ucs1lib_split_whitespace(
8642 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8643 PyUnicode_GET_LENGTH(self), maxcount
8644 );
8645 case PyUnicode_2BYTE_KIND:
8646 return ucs2lib_split_whitespace(
8647 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8648 PyUnicode_GET_LENGTH(self), maxcount
8649 );
8650 case PyUnicode_4BYTE_KIND:
8651 return ucs4lib_split_whitespace(
8652 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8653 PyUnicode_GET_LENGTH(self), maxcount
8654 );
8655 default:
8656 assert(0);
8657 return NULL;
8658 }
8659
8660 if (PyUnicode_READY(substring) == -1)
8661 return NULL;
8662
8663 kind1 = PyUnicode_KIND(self);
8664 kind2 = PyUnicode_KIND(substring);
8665 kind = kind1 > kind2 ? kind1 : kind2;
8666 buf1 = PyUnicode_DATA(self);
8667 buf2 = PyUnicode_DATA(substring);
8668 if (kind1 != kind)
8669 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8670 if (!buf1)
8671 return NULL;
8672 if (kind2 != kind)
8673 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8674 if (!buf2) {
8675 if (kind1 != kind) PyMem_Free(buf1);
8676 return NULL;
8677 }
8678 len1 = PyUnicode_GET_LENGTH(self);
8679 len2 = PyUnicode_GET_LENGTH(substring);
8680
8681 switch(kind) {
8682 case PyUnicode_1BYTE_KIND:
8683 out = ucs1lib_split(
8684 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8685 break;
8686 case PyUnicode_2BYTE_KIND:
8687 out = ucs2lib_split(
8688 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8689 break;
8690 case PyUnicode_4BYTE_KIND:
8691 out = ucs4lib_split(
8692 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8693 break;
8694 default:
8695 out = NULL;
8696 }
8697 if (kind1 != kind)
8698 PyMem_Free(buf1);
8699 if (kind2 != kind)
8700 PyMem_Free(buf2);
8701 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702}
8703
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704static PyObject *
8705rsplit(PyUnicodeObject *self,
8706 PyUnicodeObject *substring,
8707 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 int kind1, kind2, kind;
8710 void *buf1, *buf2;
8711 Py_ssize_t len1, len2;
8712 PyObject* out;
8713
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008714 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008715 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 if (PyUnicode_READY(self) == -1)
8718 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 if (substring == NULL)
8721 switch(PyUnicode_KIND(self)) {
8722 case PyUnicode_1BYTE_KIND:
8723 return ucs1lib_rsplit_whitespace(
8724 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8725 PyUnicode_GET_LENGTH(self), maxcount
8726 );
8727 case PyUnicode_2BYTE_KIND:
8728 return ucs2lib_rsplit_whitespace(
8729 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8730 PyUnicode_GET_LENGTH(self), maxcount
8731 );
8732 case PyUnicode_4BYTE_KIND:
8733 return ucs4lib_rsplit_whitespace(
8734 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8735 PyUnicode_GET_LENGTH(self), maxcount
8736 );
8737 default:
8738 assert(0);
8739 return NULL;
8740 }
8741
8742 if (PyUnicode_READY(substring) == -1)
8743 return NULL;
8744
8745 kind1 = PyUnicode_KIND(self);
8746 kind2 = PyUnicode_KIND(substring);
8747 kind = kind1 > kind2 ? kind1 : kind2;
8748 buf1 = PyUnicode_DATA(self);
8749 buf2 = PyUnicode_DATA(substring);
8750 if (kind1 != kind)
8751 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8752 if (!buf1)
8753 return NULL;
8754 if (kind2 != kind)
8755 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8756 if (!buf2) {
8757 if (kind1 != kind) PyMem_Free(buf1);
8758 return NULL;
8759 }
8760 len1 = PyUnicode_GET_LENGTH(self);
8761 len2 = PyUnicode_GET_LENGTH(substring);
8762
8763 switch(kind) {
8764 case PyUnicode_1BYTE_KIND:
8765 out = ucs1lib_rsplit(
8766 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8767 break;
8768 case PyUnicode_2BYTE_KIND:
8769 out = ucs2lib_rsplit(
8770 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8771 break;
8772 case PyUnicode_4BYTE_KIND:
8773 out = ucs4lib_rsplit(
8774 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8775 break;
8776 default:
8777 out = NULL;
8778 }
8779 if (kind1 != kind)
8780 PyMem_Free(buf1);
8781 if (kind2 != kind)
8782 PyMem_Free(buf2);
8783 return out;
8784}
8785
8786static Py_ssize_t
8787anylib_find(int kind, void *buf1, Py_ssize_t len1,
8788 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8789{
8790 switch(kind) {
8791 case PyUnicode_1BYTE_KIND:
8792 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8793 case PyUnicode_2BYTE_KIND:
8794 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8795 case PyUnicode_4BYTE_KIND:
8796 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8797 }
8798 assert(0);
8799 return -1;
8800}
8801
8802static Py_ssize_t
8803anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8804 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8805{
8806 switch(kind) {
8807 case PyUnicode_1BYTE_KIND:
8808 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8809 case PyUnicode_2BYTE_KIND:
8810 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8811 case PyUnicode_4BYTE_KIND:
8812 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8813 }
8814 assert(0);
8815 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008816}
8817
Alexander Belopolsky40018472011-02-26 01:02:56 +00008818static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819replace(PyObject *self, PyObject *str1,
8820 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 PyObject *u;
8823 char *sbuf = PyUnicode_DATA(self);
8824 char *buf1 = PyUnicode_DATA(str1);
8825 char *buf2 = PyUnicode_DATA(str2);
8826 int srelease = 0, release1 = 0, release2 = 0;
8827 int skind = PyUnicode_KIND(self);
8828 int kind1 = PyUnicode_KIND(str1);
8829 int kind2 = PyUnicode_KIND(str2);
8830 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8831 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8832 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833
8834 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008837 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 if (skind < kind1)
8840 /* substring too wide to be present */
8841 goto nothing;
8842
8843 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008844 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008845 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008847 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008849 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 Py_UCS4 u1, u2, maxchar;
8851 int mayshrink, rkind;
8852 u1 = PyUnicode_READ_CHAR(str1, 0);
8853 if (!findchar(sbuf, PyUnicode_KIND(self),
8854 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008855 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 u2 = PyUnicode_READ_CHAR(str2, 0);
8857 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8858 /* Replacing u1 with u2 may cause a maxchar reduction in the
8859 result string. */
8860 mayshrink = maxchar > 127;
8861 if (u2 > maxchar) {
8862 maxchar = u2;
8863 mayshrink = 0;
8864 }
8865 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008866 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008868 if (PyUnicode_CopyCharacters(u, 0,
8869 (PyObject*)self, 0, slen) < 0)
8870 {
8871 Py_DECREF(u);
8872 return NULL;
8873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 rkind = PyUnicode_KIND(u);
8875 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8876 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008877 if (--maxcount < 0)
8878 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 if (mayshrink) {
8882 PyObject *tmp = u;
8883 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8884 PyUnicode_GET_LENGTH(tmp));
8885 Py_DECREF(tmp);
8886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 int rkind = skind;
8889 char *res;
8890 if (kind1 < rkind) {
8891 /* widen substring */
8892 buf1 = _PyUnicode_AsKind(str1, rkind);
8893 if (!buf1) goto error;
8894 release1 = 1;
8895 }
8896 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008897 if (i < 0)
8898 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 if (rkind > kind2) {
8900 /* widen replacement */
8901 buf2 = _PyUnicode_AsKind(str2, rkind);
8902 if (!buf2) goto error;
8903 release2 = 1;
8904 }
8905 else if (rkind < kind2) {
8906 /* widen self and buf1 */
8907 rkind = kind2;
8908 if (release1) PyMem_Free(buf1);
8909 sbuf = _PyUnicode_AsKind(self, rkind);
8910 if (!sbuf) goto error;
8911 srelease = 1;
8912 buf1 = _PyUnicode_AsKind(str1, rkind);
8913 if (!buf1) goto error;
8914 release1 = 1;
8915 }
8916 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8917 if (!res) {
8918 PyErr_NoMemory();
8919 goto error;
8920 }
8921 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008922 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8924 buf2,
8925 PyUnicode_KIND_SIZE(rkind, len2));
8926 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008927
8928 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8930 slen-i,
8931 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008932 if (i == -1)
8933 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8935 buf2,
8936 PyUnicode_KIND_SIZE(rkind, len2));
8937 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939
8940 u = PyUnicode_FromKindAndData(rkind, res, slen);
8941 PyMem_Free(res);
8942 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 Py_ssize_t n, i, j, ires;
8947 Py_ssize_t product, new_size;
8948 int rkind = skind;
8949 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 if (kind1 < rkind) {
8952 buf1 = _PyUnicode_AsKind(str1, rkind);
8953 if (!buf1) goto error;
8954 release1 = 1;
8955 }
8956 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008957 if (n == 0)
8958 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 if (kind2 < rkind) {
8960 buf2 = _PyUnicode_AsKind(str2, rkind);
8961 if (!buf2) goto error;
8962 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 else if (kind2 > rkind) {
8965 rkind = kind2;
8966 sbuf = _PyUnicode_AsKind(self, rkind);
8967 if (!sbuf) goto error;
8968 srelease = 1;
8969 if (release1) PyMem_Free(buf1);
8970 buf1 = _PyUnicode_AsKind(str1, rkind);
8971 if (!buf1) goto error;
8972 release1 = 1;
8973 }
8974 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8975 PyUnicode_GET_LENGTH(str1))); */
8976 product = n * (len2-len1);
8977 if ((product / (len2-len1)) != n) {
8978 PyErr_SetString(PyExc_OverflowError,
8979 "replace string is too long");
8980 goto error;
8981 }
8982 new_size = slen + product;
8983 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8984 PyErr_SetString(PyExc_OverflowError,
8985 "replace string is too long");
8986 goto error;
8987 }
8988 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8989 if (!res)
8990 goto error;
8991 ires = i = 0;
8992 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008993 while (n-- > 0) {
8994 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 j = anylib_find(rkind,
8996 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8997 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008998 if (j == -1)
8999 break;
9000 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9003 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9004 PyUnicode_KIND_SIZE(rkind, j-i));
9005 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009006 }
9007 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 if (len2 > 0) {
9009 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9010 buf2,
9011 PyUnicode_KIND_SIZE(rkind, len2));
9012 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009017 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9019 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9020 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009021 } else {
9022 /* interleave */
9023 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9025 buf2,
9026 PyUnicode_KIND_SIZE(rkind, len2));
9027 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009028 if (--n <= 0)
9029 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9031 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9032 PyUnicode_KIND_SIZE(rkind, 1));
9033 ires++;
9034 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9037 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9038 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 if (srelease)
9043 PyMem_FREE(sbuf);
9044 if (release1)
9045 PyMem_FREE(buf1);
9046 if (release2)
9047 PyMem_FREE(buf2);
9048 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (srelease)
9053 PyMem_FREE(sbuf);
9054 if (release1)
9055 PyMem_FREE(buf1);
9056 if (release2)
9057 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058 if (PyUnicode_CheckExact(self)) {
9059 Py_INCREF(self);
9060 return (PyObject *) self;
9061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9063 PyUnicode_DATA(self),
9064 PyUnicode_GET_LENGTH(self));
9065 error:
9066 if (srelease && sbuf)
9067 PyMem_FREE(sbuf);
9068 if (release1 && buf1)
9069 PyMem_FREE(buf1);
9070 if (release2 && buf2)
9071 PyMem_FREE(buf2);
9072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073}
9074
9075/* --- Unicode Object Methods --------------------------------------------- */
9076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009077PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079\n\
9080Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009081characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082
9083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009084unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 return fixup(self, fixtitle);
9087}
9088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009089PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091\n\
9092Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009093have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094
9095static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009096unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return fixup(self, fixcapitalize);
9099}
9100
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split self
   into words, replace every word by its capitalized form and join the
   words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9138
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009139/* Argument converter. Coerces to a single unicode character */
9140
9141static int
9142convert_uc(PyObject *obj, void *addr)
9143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009145 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009146
Benjamin Peterson14339b62009-01-31 16:36:08 +00009147 uniobj = PyUnicode_FromObject(obj);
9148 if (uniobj == NULL) {
9149 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009151 return 0;
9152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009154 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 Py_DECREF(uniobj);
9157 return 0;
9158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 if (PyUnicode_READY(uniobj)) {
9160 Py_DECREF(uniobj);
9161 return 0;
9162 }
9163 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 Py_DECREF(uniobj);
9165 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009166}
9167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009168PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009171Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009172done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
9174static PyObject *
9175unicode_center(PyUnicodeObject *self, PyObject *args)
9176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009177 Py_ssize_t marg, left;
9178 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 Py_UCS4 fillchar = ' ';
9180
9181 if (PyUnicode_READY(self) == -1)
9182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Thomas Woutersde017742006-02-16 19:34:37 +00009184 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 return NULL;
9186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188 Py_INCREF(self);
9189 return (PyObject*) self;
9190 }
9191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 left = marg / 2 + (marg & width & 1);
9194
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009195 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196}
9197
Marc-André Lemburge5034372000-08-08 08:04:29 +00009198#if 0
9199
9200/* This code should go into some future Unicode collation support
9201 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009202 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009203
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009204/* speedy UTF-16 code point order comparison */
9205/* gleaned from: */
9206/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9207
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009208static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009209{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009210 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009211 0, 0, 0, 0, 0, 0, 0, 0,
9212 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009213 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009214};
9215
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216static int
9217unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009219 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 Py_UNICODE *s1 = str1->str;
9222 Py_UNICODE *s2 = str2->str;
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 len1 = str1->_base._base.length;
9225 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009226
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009228 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009229
9230 c1 = *s1++;
9231 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009232
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 if (c1 > (1<<11) * 26)
9234 c1 += utf16Fixup[c1>>11];
9235 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009236 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009237 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009238
9239 if (c1 != c2)
9240 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009241
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009242 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243 }
9244
9245 return (len1 < len2) ? -1 : (len1 != len2);
9246}
9247
Marc-André Lemburge5034372000-08-08 08:04:29 +00009248#else
9249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250/* This function assumes that str1 and str2 are readied by the caller. */
9251
Marc-André Lemburge5034372000-08-08 08:04:29 +00009252static int
9253unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 int kind1, kind2;
9256 void *data1, *data2;
9257 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 kind1 = PyUnicode_KIND(str1);
9260 kind2 = PyUnicode_KIND(str2);
9261 data1 = PyUnicode_DATA(str1);
9262 data2 = PyUnicode_DATA(str2);
9263 len1 = PyUnicode_GET_LENGTH(str1);
9264 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 for (i = 0; i < len1 && i < len2; ++i) {
9267 Py_UCS4 c1, c2;
9268 c1 = PyUnicode_READ(kind1, data1, i);
9269 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009270
9271 if (c1 != c2)
9272 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009273 }
9274
9275 return (len1 < len2) ? -1 : (len1 != len2);
9276}
9277
9278#endif
9279
Alexander Belopolsky40018472011-02-26 01:02:56 +00009280int
9281PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9284 if (PyUnicode_READY(left) == -1 ||
9285 PyUnicode_READY(right) == -1)
9286 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009287 return unicode_compare((PyUnicodeObject *)left,
9288 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009290 PyErr_Format(PyExc_TypeError,
9291 "Can't compare %.100s and %.100s",
9292 left->ob_type->tp_name,
9293 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 return -1;
9295}
9296
Martin v. Löwis5b222132007-06-10 09:51:05 +00009297int
9298PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 Py_ssize_t i;
9301 int kind;
9302 void *data;
9303 Py_UCS4 chr;
9304
Martin v. Löwis5b222132007-06-10 09:51:05 +00009305 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (PyUnicode_READY(uni) == -1)
9307 return -1;
9308 kind = PyUnicode_KIND(uni);
9309 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009310 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9312 if (chr != str[i])
9313 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009314 /* This check keeps Python strings that end in '\0' from comparing equal
9315 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009318 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009320 return 0;
9321}
9322
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009323
Benjamin Peterson29060642009-01-31 22:14:21 +00009324#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009325 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009326
Alexander Belopolsky40018472011-02-26 01:02:56 +00009327PyObject *
9328PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009329{
9330 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009331
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009332 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9333 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 if (PyUnicode_READY(left) == -1 ||
9335 PyUnicode_READY(right) == -1)
9336 return NULL;
9337 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9338 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009339 if (op == Py_EQ) {
9340 Py_INCREF(Py_False);
9341 return Py_False;
9342 }
9343 if (op == Py_NE) {
9344 Py_INCREF(Py_True);
9345 return Py_True;
9346 }
9347 }
9348 if (left == right)
9349 result = 0;
9350 else
9351 result = unicode_compare((PyUnicodeObject *)left,
9352 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009353
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009354 /* Convert the return value to a Boolean */
9355 switch (op) {
9356 case Py_EQ:
9357 v = TEST_COND(result == 0);
9358 break;
9359 case Py_NE:
9360 v = TEST_COND(result != 0);
9361 break;
9362 case Py_LE:
9363 v = TEST_COND(result <= 0);
9364 break;
9365 case Py_GE:
9366 v = TEST_COND(result >= 0);
9367 break;
9368 case Py_LT:
9369 v = TEST_COND(result == -1);
9370 break;
9371 case Py_GT:
9372 v = TEST_COND(result == 1);
9373 break;
9374 default:
9375 PyErr_BadArgument();
9376 return NULL;
9377 }
9378 Py_INCREF(v);
9379 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381
Brian Curtindfc80e32011-08-10 20:28:54 -05009382 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009383}
9384
Alexander Belopolsky40018472011-02-26 01:02:56 +00009385int
9386PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009387{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009388 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 int kind1, kind2, kind;
9390 void *buf1, *buf2;
9391 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009392 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009393
9394 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009395 sub = PyUnicode_FromObject(element);
9396 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 PyErr_Format(PyExc_TypeError,
9398 "'in <string>' requires string as left operand, not %s",
9399 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (PyUnicode_READY(sub) == -1)
9403 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009404
Thomas Wouters477c8d52006-05-27 19:21:47 +00009405 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 Py_DECREF(sub);
9408 return -1;
9409 }
9410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 kind1 = PyUnicode_KIND(str);
9412 kind2 = PyUnicode_KIND(sub);
9413 kind = kind1 > kind2 ? kind1 : kind2;
9414 buf1 = PyUnicode_DATA(str);
9415 buf2 = PyUnicode_DATA(sub);
9416 if (kind1 != kind)
9417 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9418 if (!buf1) {
9419 Py_DECREF(sub);
9420 return -1;
9421 }
9422 if (kind2 != kind)
9423 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9424 if (!buf2) {
9425 Py_DECREF(sub);
9426 if (kind1 != kind) PyMem_Free(buf1);
9427 return -1;
9428 }
9429 len1 = PyUnicode_GET_LENGTH(str);
9430 len2 = PyUnicode_GET_LENGTH(sub);
9431
9432 switch(kind) {
9433 case PyUnicode_1BYTE_KIND:
9434 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9435 break;
9436 case PyUnicode_2BYTE_KIND:
9437 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9438 break;
9439 case PyUnicode_4BYTE_KIND:
9440 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9441 break;
9442 default:
9443 result = -1;
9444 assert(0);
9445 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009446
9447 Py_DECREF(str);
9448 Py_DECREF(sub);
9449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 if (kind1 != kind)
9451 PyMem_Free(buf1);
9452 if (kind2 != kind)
9453 PyMem_Free(buf2);
9454
Guido van Rossum403d68b2000-03-13 15:55:09 +00009455 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009456}
9457
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458/* Concat to string or Unicode object giving a new Unicode object. */
9459
Alexander Belopolsky40018472011-02-26 01:02:56 +00009460PyObject *
9461PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 PyObject *u = NULL, *v = NULL, *w;
9464 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465
9466 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473
9474 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 }
9483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9485 goto onError;
9486
9487 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009488 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 w = PyUnicode_New(
9492 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9493 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009496 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9497 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009498 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009499 v, 0,
9500 PyUnicode_GET_LENGTH(v)) < 0)
9501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 Py_DECREF(u);
9503 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 Py_XDECREF(u);
9508 Py_XDECREF(v);
9509 return NULL;
9510}
9511
Walter Dörwald1ab83302007-05-18 17:15:44 +00009512void
9513PyUnicode_Append(PyObject **pleft, PyObject *right)
9514{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009515 PyObject *new;
9516 if (*pleft == NULL)
9517 return;
9518 if (right == NULL || !PyUnicode_Check(*pleft)) {
9519 Py_DECREF(*pleft);
9520 *pleft = NULL;
9521 return;
9522 }
9523 new = PyUnicode_Concat(*pleft, right);
9524 Py_DECREF(*pleft);
9525 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009526}
9527
9528void
9529PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9530{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009531 PyUnicode_Append(pleft, right);
9532 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009533}
9534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009535PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009538Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009539string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009540interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541
9542static PyObject *
9543unicode_count(PyUnicodeObject *self, PyObject *args)
9544{
9545 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009546 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009547 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 int kind1, kind2, kind;
9550 void *buf1, *buf2;
9551 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552
Jesus Ceaac451502011-04-20 17:09:23 +02009553 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9554 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 kind1 = PyUnicode_KIND(self);
9558 kind2 = PyUnicode_KIND(substring);
9559 kind = kind1 > kind2 ? kind1 : kind2;
9560 buf1 = PyUnicode_DATA(self);
9561 buf2 = PyUnicode_DATA(substring);
9562 if (kind1 != kind)
9563 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9564 if (!buf1) {
9565 Py_DECREF(substring);
9566 return NULL;
9567 }
9568 if (kind2 != kind)
9569 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9570 if (!buf2) {
9571 Py_DECREF(substring);
9572 if (kind1 != kind) PyMem_Free(buf1);
9573 return NULL;
9574 }
9575 len1 = PyUnicode_GET_LENGTH(self);
9576 len2 = PyUnicode_GET_LENGTH(substring);
9577
9578 ADJUST_INDICES(start, end, len1);
9579 switch(kind) {
9580 case PyUnicode_1BYTE_KIND:
9581 iresult = ucs1lib_count(
9582 ((Py_UCS1*)buf1) + start, end - start,
9583 buf2, len2, PY_SSIZE_T_MAX
9584 );
9585 break;
9586 case PyUnicode_2BYTE_KIND:
9587 iresult = ucs2lib_count(
9588 ((Py_UCS2*)buf1) + start, end - start,
9589 buf2, len2, PY_SSIZE_T_MAX
9590 );
9591 break;
9592 case PyUnicode_4BYTE_KIND:
9593 iresult = ucs4lib_count(
9594 ((Py_UCS4*)buf1) + start, end - start,
9595 buf2, len2, PY_SSIZE_T_MAX
9596 );
9597 break;
9598 default:
9599 assert(0); iresult = 0;
9600 }
9601
9602 result = PyLong_FromSsize_t(iresult);
9603
9604 if (kind1 != kind)
9605 PyMem_Free(buf1);
9606 if (kind2 != kind)
9607 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608
9609 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009610
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 return result;
9612}
9613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009614PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009615 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009617Encode S using the codec registered for encoding. Default encoding\n\
9618is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009619handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009620a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9621'xmlcharrefreplace' as well as any other name registered with\n\
9622codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623
9624static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009625unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009627 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 char *encoding = NULL;
9629 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009630
Benjamin Peterson308d6372009-09-18 21:42:35 +00009631 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9632 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009634 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009635}
9636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009637PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009638 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639\n\
9640Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009641If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642
9643static PyObject*
9644unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9645{
9646 Py_UNICODE *e;
9647 Py_UNICODE *p;
9648 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009649 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 PyUnicodeObject *u;
9652 int tabsize = 8;
9653
9654 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9658 return NULL;
9659
Thomas Wouters7e474022000-07-16 12:04:32 +00009660 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009661 i = 0; /* chars up to and including most recent \n or \r */
9662 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9664 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009666 if (tabsize > 0) {
9667 incr = tabsize - (j % tabsize); /* cannot overflow */
9668 if (j > PY_SSIZE_T_MAX - incr)
9669 goto overflow1;
9670 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 if (j > PY_SSIZE_T_MAX - 1)
9675 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676 j++;
9677 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 if (i > PY_SSIZE_T_MAX - j)
9679 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009681 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 }
9683 }
9684
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009685 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009687
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 /* Second pass: create output string and fill it */
9689 u = _PyUnicode_New(i + j);
9690 if (!u)
9691 return NULL;
9692
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009693 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 q = _PyUnicode_WSTR(u); /* next output char */
9695 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009699 if (tabsize > 0) {
9700 i = tabsize - (j % tabsize);
9701 j += i;
9702 while (i--) {
9703 if (q >= qe)
9704 goto overflow2;
9705 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009708 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009709 else {
9710 if (q >= qe)
9711 goto overflow2;
9712 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009713 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 if (*p == '\n' || *p == '\r')
9715 j = 0;
9716 }
9717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 if (PyUnicode_READY(u) == -1) {
9719 Py_DECREF(u);
9720 return NULL;
9721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009723
9724 overflow2:
9725 Py_DECREF(u);
9726 overflow1:
9727 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729}
9730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009731PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733\n\
9734Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009735such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736arguments start and end are interpreted as in slice notation.\n\
9737\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009738Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739
9740static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742{
Jesus Ceaac451502011-04-20 17:09:23 +02009743 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009744 Py_ssize_t start;
9745 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009746 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747
Jesus Ceaac451502011-04-20 17:09:23 +02009748 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9749 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (PyUnicode_READY(self) == -1)
9753 return NULL;
9754 if (PyUnicode_READY(substring) == -1)
9755 return NULL;
9756
9757 result = any_find_slice(
9758 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9759 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009760 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 if (result == -2)
9765 return NULL;
9766
Christian Heimes217cfd12007-12-02 14:31:20 +00009767 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768}
9769
9770static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009771unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 Py_UCS4 ch;
9774
9775 if (PyUnicode_READY(self) == -1)
9776 return NULL;
9777 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778 PyErr_SetString(PyExc_IndexError, "string index out of range");
9779 return NULL;
9780 }
9781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9783 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784}
9785
Guido van Rossumc2504932007-09-18 19:42:40 +00009786/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009787 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009788static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009789unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790{
Guido van Rossumc2504932007-09-18 19:42:40 +00009791 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009792 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (_PyUnicode_HASH(self) != -1)
9795 return _PyUnicode_HASH(self);
9796 if (PyUnicode_READY(self) == -1)
9797 return -1;
9798 len = PyUnicode_GET_LENGTH(self);
9799
9800 /* The hash function as a macro, gets expanded three times below. */
9801#define HASH(P) \
9802 x = (Py_uhash_t)*P << 7; \
9803 while (--len >= 0) \
9804 x = (1000003*x) ^ (Py_uhash_t)*P++;
9805
9806 switch (PyUnicode_KIND(self)) {
9807 case PyUnicode_1BYTE_KIND: {
9808 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9809 HASH(c);
9810 break;
9811 }
9812 case PyUnicode_2BYTE_KIND: {
9813 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9814 HASH(s);
9815 break;
9816 }
9817 default: {
9818 Py_UCS4 *l;
9819 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9820 "Impossible switch case in unicode_hash");
9821 l = PyUnicode_4BYTE_DATA(self);
9822 HASH(l);
9823 break;
9824 }
9825 }
9826 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9827
Guido van Rossumc2504932007-09-18 19:42:40 +00009828 if (x == -1)
9829 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009831 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009835PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009836 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009838Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839
9840static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009843 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009844 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009845 Py_ssize_t start;
9846 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847
Jesus Ceaac451502011-04-20 17:09:23 +02009848 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9849 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (PyUnicode_READY(self) == -1)
9853 return NULL;
9854 if (PyUnicode_READY(substring) == -1)
9855 return NULL;
9856
9857 result = any_find_slice(
9858 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9859 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009860 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
9862 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (result == -2)
9865 return NULL;
9866
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 if (result < 0) {
9868 PyErr_SetString(PyExc_ValueError, "substring not found");
9869 return NULL;
9870 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009871
Christian Heimes217cfd12007-12-02 14:31:20 +00009872 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873}
9874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009878Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009879at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880
9881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009882unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 Py_ssize_t i, length;
9885 int kind;
9886 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 int cased;
9888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (PyUnicode_READY(self) == -1)
9890 return NULL;
9891 length = PyUnicode_GET_LENGTH(self);
9892 kind = PyUnicode_KIND(self);
9893 data = PyUnicode_DATA(self);
9894
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (length == 1)
9897 return PyBool_FromLong(
9898 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009900 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009903
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 for (i = 0; i < length; i++) {
9906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009907
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9909 return PyBool_FromLong(0);
9910 else if (!cased && Py_UNICODE_ISLOWER(ch))
9911 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009913 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914}
9915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009916PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009919Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009920at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921
9922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009923unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 Py_ssize_t i, length;
9926 int kind;
9927 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928 int cased;
9929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (PyUnicode_READY(self) == -1)
9931 return NULL;
9932 length = PyUnicode_GET_LENGTH(self);
9933 kind = PyUnicode_KIND(self);
9934 data = PyUnicode_DATA(self);
9935
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 if (length == 1)
9938 return PyBool_FromLong(
9939 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009941 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009944
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 for (i = 0; i < length; i++) {
9947 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009948
Benjamin Peterson29060642009-01-31 22:14:21 +00009949 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9950 return PyBool_FromLong(0);
9951 else if (!cased && Py_UNICODE_ISUPPER(ch))
9952 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009954 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955}
9956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009957PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009958 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009960Return True if S is a titlecased string and there is at least one\n\
9961character in S, i.e. upper- and titlecase characters may only\n\
9962follow uncased characters and lowercase characters only cased ones.\n\
9963Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964
9965static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009966unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 Py_ssize_t i, length;
9969 int kind;
9970 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971 int cased, previous_is_cased;
9972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (PyUnicode_READY(self) == -1)
9974 return NULL;
9975 length = PyUnicode_GET_LENGTH(self);
9976 kind = PyUnicode_KIND(self);
9977 data = PyUnicode_DATA(self);
9978
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 if (length == 1) {
9981 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9982 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9983 (Py_UNICODE_ISUPPER(ch) != 0));
9984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009986 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009989
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 cased = 0;
9991 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 for (i = 0; i < length; i++) {
9993 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009994
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9996 if (previous_is_cased)
9997 return PyBool_FromLong(0);
9998 previous_is_cased = 1;
9999 cased = 1;
10000 }
10001 else if (Py_UNICODE_ISLOWER(ch)) {
10002 if (!previous_is_cased)
10003 return PyBool_FromLong(0);
10004 previous_is_cased = 1;
10005 cased = 1;
10006 }
10007 else
10008 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010010 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011}
10012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010013PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010016Return True if all characters in S are whitespace\n\
10017and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018
10019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010020unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 Py_ssize_t i, length;
10023 int kind;
10024 void *data;
10025
10026 if (PyUnicode_READY(self) == -1)
10027 return NULL;
10028 length = PyUnicode_GET_LENGTH(self);
10029 kind = PyUnicode_KIND(self);
10030 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (length == 1)
10034 return PyBool_FromLong(
10035 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010037 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 for (i = 0; i < length; i++) {
10042 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010043 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010044 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010046 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047}
10048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010049PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010051\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010052Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010053and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010054
10055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010056unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 Py_ssize_t i, length;
10059 int kind;
10060 void *data;
10061
10062 if (PyUnicode_READY(self) == -1)
10063 return NULL;
10064 length = PyUnicode_GET_LENGTH(self);
10065 kind = PyUnicode_KIND(self);
10066 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010067
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (length == 1)
10070 return PyBool_FromLong(
10071 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010072
10073 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010075 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 for (i = 0; i < length; i++) {
10078 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010081 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010082}
10083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010084PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010086\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010087Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010088and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010089
10090static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010091unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 int kind;
10094 void *data;
10095 Py_ssize_t len, i;
10096
10097 if (PyUnicode_READY(self) == -1)
10098 return NULL;
10099
10100 kind = PyUnicode_KIND(self);
10101 data = PyUnicode_DATA(self);
10102 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010103
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (len == 1) {
10106 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10107 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10108 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010109
10110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 for (i = 0; i < len; i++) {
10115 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010116 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010117 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010118 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010119 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010120}
10121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010122PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010123 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010125Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010126False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127
10128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010129unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 Py_ssize_t i, length;
10132 int kind;
10133 void *data;
10134
10135 if (PyUnicode_READY(self) == -1)
10136 return NULL;
10137 length = PyUnicode_GET_LENGTH(self);
10138 kind = PyUnicode_KIND(self);
10139 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (length == 1)
10143 return PyBool_FromLong(
10144 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010146 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010148 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 for (i = 0; i < length; i++) {
10151 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010154 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155}
10156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010157PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010158 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010160Return True if all characters in S are digits\n\
10161and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162
10163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010164unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 Py_ssize_t i, length;
10167 int kind;
10168 void *data;
10169
10170 if (PyUnicode_READY(self) == -1)
10171 return NULL;
10172 length = PyUnicode_GET_LENGTH(self);
10173 kind = PyUnicode_KIND(self);
10174 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (length == 1) {
10178 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10179 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010182 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010184 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 for (i = 0; i < length; i++) {
10187 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010188 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010190 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191}
10192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010193PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010194 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010196Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010197False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
10199static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010200unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 Py_ssize_t i, length;
10203 int kind;
10204 void *data;
10205
10206 if (PyUnicode_READY(self) == -1)
10207 return NULL;
10208 length = PyUnicode_GET_LENGTH(self);
10209 kind = PyUnicode_KIND(self);
10210 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 if (length == 1)
10214 return PyBool_FromLong(
10215 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010217 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010219 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 for (i = 0; i < length; i++) {
10222 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010225 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226}
10227
Martin v. Löwis47383402007-08-15 07:32:56 +000010228int
10229PyUnicode_IsIdentifier(PyObject *self)
10230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 int kind;
10232 void *data;
10233 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010234 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 if (PyUnicode_READY(self) == -1) {
10237 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010238 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 }
10240
10241 /* Special case for empty strings */
10242 if (PyUnicode_GET_LENGTH(self) == 0)
10243 return 0;
10244 kind = PyUnicode_KIND(self);
10245 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010246
10247 /* PEP 3131 says that the first character must be in
10248 XID_Start and subsequent characters in XID_Continue,
10249 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010250 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010251 letters, digits, underscore). However, given the current
10252 definition of XID_Start and XID_Continue, it is sufficient
10253 to check just for these, except that _ must be allowed
10254 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010256 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010257 return 0;
10258
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010259 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010262 return 1;
10263}
10264
10265PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010266 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010267\n\
10268Return True if S is a valid identifier according\n\
10269to the language definition.");
10270
10271static PyObject*
10272unicode_isidentifier(PyObject *self)
10273{
10274 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10275}
10276
Georg Brandl559e5d72008-06-11 18:37:52 +000010277PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010279\n\
10280Return True if all characters in S are considered\n\
10281printable in repr() or S is empty, False otherwise.");
10282
10283static PyObject*
10284unicode_isprintable(PyObject *self)
10285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 Py_ssize_t i, length;
10287 int kind;
10288 void *data;
10289
10290 if (PyUnicode_READY(self) == -1)
10291 return NULL;
10292 length = PyUnicode_GET_LENGTH(self);
10293 kind = PyUnicode_KIND(self);
10294 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010295
10296 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (length == 1)
10298 return PyBool_FromLong(
10299 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 for (i = 0; i < length; i++) {
10302 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010303 Py_RETURN_FALSE;
10304 }
10305 }
10306 Py_RETURN_TRUE;
10307}
10308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010309PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010310 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311\n\
10312Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010313iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010316unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010318 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319}
10320
Martin v. Löwis18e16552006-02-15 17:27:45 +000010321static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322unicode_length(PyUnicodeObject *self)
10323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (PyUnicode_READY(self) == -1)
10325 return -1;
10326 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010329PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010330 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010332Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010333done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334
10335static PyObject *
10336unicode_ljust(PyUnicodeObject *self, PyObject *args)
10337{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010338 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 Py_UCS4 fillchar = ' ';
10340
10341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010343
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010344 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345 return NULL;
10346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348 Py_INCREF(self);
10349 return (PyObject*) self;
10350 }
10351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353}
10354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010355PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010358Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010361unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 return fixup(self, fixlower);
10364}
10365
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: skips the leading "|O:" of the format */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10374
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010375/* externally visible for str.strip(unicode) */
10376PyObject *
10377_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 void *data;
10380 int kind;
10381 Py_ssize_t i, j, len;
10382 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10385 return NULL;
10386
10387 kind = PyUnicode_KIND(self);
10388 data = PyUnicode_DATA(self);
10389 len = PyUnicode_GET_LENGTH(self);
10390 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10391 PyUnicode_DATA(sepobj),
10392 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010393
Benjamin Peterson14339b62009-01-31 16:36:08 +000010394 i = 0;
10395 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 while (i < len &&
10397 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 i++;
10399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010400 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010401
Benjamin Peterson14339b62009-01-31 16:36:08 +000010402 j = len;
10403 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010404 do {
10405 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 } while (j >= i &&
10407 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010408 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010409 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010410
Benjamin Peterson14339b62009-01-31 16:36:08 +000010411 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010412 Py_INCREF(self);
10413 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010414 }
10415 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010417}
10418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419/* Assumes an already ready self string. */
10420
10421static PyObject *
10422substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10423{
10424 const int kind = PyUnicode_KIND(self);
10425 void *data = PyUnicode_DATA(self);
10426 Py_UCS4 maxchar = 0;
10427 Py_ssize_t i;
10428 PyObject *unicode;
10429
10430 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10431 PyErr_BadInternalCall();
10432 return NULL;
10433 }
10434
10435 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10436 Py_INCREF(self);
10437 return (PyObject*)self;
10438 }
10439
10440 for (i = 0; i < len; ++i) {
10441 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10442 if (ch > maxchar)
10443 maxchar = ch;
10444 }
10445
10446 unicode = PyUnicode_New(len, maxchar);
10447 if (unicode == NULL)
10448 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010449 if (PyUnicode_CopyCharacters(unicode, 0,
10450 (PyObject*)self, start, len) < 0)
10451 {
10452 Py_DECREF(unicode);
10453 return NULL;
10454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 return unicode;
10456}
10457
10458PyObject*
10459PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10460{
10461 unsigned char *data;
10462 int kind;
10463
10464 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10465 && PyUnicode_CheckExact(self))
10466 {
10467 Py_INCREF(self);
10468 return (PyObject *)self;
10469 }
10470
10471 if ((end - start) == 1)
10472 return unicode_getitem((PyUnicodeObject*)self, start);
10473
10474 if (PyUnicode_READY(self) == -1)
10475 return NULL;
10476 kind = PyUnicode_KIND(self);
10477 data = PyUnicode_1BYTE_DATA(self);
10478 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10479 end-start);
10480}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481
10482static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010483do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 int kind;
10486 void *data;
10487 Py_ssize_t len, i, j;
10488
10489 if (PyUnicode_READY(self) == -1)
10490 return NULL;
10491
10492 kind = PyUnicode_KIND(self);
10493 data = PyUnicode_DATA(self);
10494 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010495
Benjamin Peterson14339b62009-01-31 16:36:08 +000010496 i = 0;
10497 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499 i++;
10500 }
10501 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010502
Benjamin Peterson14339b62009-01-31 16:36:08 +000010503 j = len;
10504 if (striptype != LEFTSTRIP) {
10505 do {
10506 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010508 j++;
10509 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010510
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10512 Py_INCREF(self);
10513 return (PyObject*)self;
10514 }
10515 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517}
10518
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010519
10520static PyObject *
10521do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10522{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010524
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10526 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010527
Benjamin Peterson14339b62009-01-31 16:36:08 +000010528 if (sep != NULL && sep != Py_None) {
10529 if (PyUnicode_Check(sep))
10530 return _PyUnicode_XStrip(self, striptype, sep);
10531 else {
10532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 "%s arg must be None or str",
10534 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010535 return NULL;
10536 }
10537 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010538
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540}
10541
10542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010543PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010545\n\
10546Return a copy of the string S with leading and trailing\n\
10547whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010548If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010549
10550static PyObject *
10551unicode_strip(PyUnicodeObject *self, PyObject *args)
10552{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010553 if (PyTuple_GET_SIZE(args) == 0)
10554 return do_strip(self, BOTHSTRIP); /* Common case */
10555 else
10556 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557}
10558
10559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562\n\
10563Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010564If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565
10566static PyObject *
10567unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10568{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 if (PyTuple_GET_SIZE(args) == 0)
10570 return do_strip(self, LEFTSTRIP); /* Common case */
10571 else
10572 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010573}
10574
10575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010576PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578\n\
10579Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010580If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010581
10582static PyObject *
10583unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10584{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585 if (PyTuple_GET_SIZE(args) == 0)
10586 return do_strip(self, RIGHTSTRIP); /* Common case */
10587 else
10588 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010589}
10590
10591
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010593unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594{
10595 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 Py_ssize_t nchars, n;
10597 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Georg Brandl222de0f2009-04-12 12:01:50 +000010599 if (len < 1) {
10600 Py_INCREF(unicode_empty);
10601 return (PyObject *)unicode_empty;
10602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Tim Peters7a29bd52001-09-12 03:03:31 +000010604 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 /* no repeat, return original string */
10606 Py_INCREF(str);
10607 return (PyObject*) str;
10608 }
Tim Peters8f422462000-09-09 06:13:41 +000010609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (PyUnicode_READY(str) == -1)
10611 return NULL;
10612
Tim Peters8f422462000-09-09 06:13:41 +000010613 /* ensure # of chars needed doesn't overflow int and # of bytes
10614 * needed doesn't overflow size_t
10615 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 nchars = len * PyUnicode_GET_LENGTH(str);
10617 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010618 PyErr_SetString(PyExc_OverflowError,
10619 "repeated string is too long");
10620 return NULL;
10621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 char_size = PyUnicode_CHARACTER_SIZE(str);
10623 nbytes = (nchars + 1) * char_size;
10624 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010625 PyErr_SetString(PyExc_OverflowError,
10626 "repeated string is too long");
10627 return NULL;
10628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 if (!u)
10631 return NULL;
10632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 if (PyUnicode_GET_LENGTH(str) == 1) {
10634 const int kind = PyUnicode_KIND(str);
10635 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10636 void *to = PyUnicode_DATA(u);
10637 for (n = 0; n < len; ++n)
10638 PyUnicode_WRITE(kind, to, n, fill_char);
10639 }
10640 else {
10641 /* number of characters copied this far */
10642 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10643 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10644 char *to = (char *) PyUnicode_DATA(u);
10645 Py_MEMCPY(to, PyUnicode_DATA(str),
10646 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 n = (done <= nchars-done) ? done : nchars-done;
10649 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 }
10653
10654 return (PyObject*) u;
10655}
10656
Alexander Belopolsky40018472011-02-26 01:02:56 +000010657PyObject *
10658PyUnicode_Replace(PyObject *obj,
10659 PyObject *subobj,
10660 PyObject *replobj,
10661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662{
10663 PyObject *self;
10664 PyObject *str1;
10665 PyObject *str2;
10666 PyObject *result;
10667
10668 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 Py_DECREF(self);
10674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675 }
10676 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010678 Py_DECREF(self);
10679 Py_DECREF(str1);
10680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 Py_DECREF(self);
10684 Py_DECREF(str1);
10685 Py_DECREF(str2);
10686 return result;
10687}
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010690 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010693old replaced by new. If the optional argument count is\n\
10694given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 PyObject *str1;
10700 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010701 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 PyObject *result;
10703
Martin v. Löwis18e16552006-02-15 17:27:45 +000010704 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 str1 = PyUnicode_FromObject(str1);
10709 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10710 return NULL;
10711 str2 = PyUnicode_FromObject(str2);
10712 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 Py_DECREF(str1);
10714 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
10717 result = replace(self, str1, str2, maxcount);
10718
10719 Py_DECREF(str1);
10720 Py_DECREF(str2);
10721 return result;
10722}
10723
Alexander Belopolsky40018472011-02-26 01:02:56 +000010724static PyObject *
10725unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010727 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 Py_ssize_t isize;
10729 Py_ssize_t osize, squote, dquote, i, o;
10730 Py_UCS4 max, quote;
10731 int ikind, okind;
10732 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010735 return NULL;
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 isize = PyUnicode_GET_LENGTH(unicode);
10738 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 /* Compute length of output, quote characters, and
10741 maximum character */
10742 osize = 2; /* quotes */
10743 max = 127;
10744 squote = dquote = 0;
10745 ikind = PyUnicode_KIND(unicode);
10746 for (i = 0; i < isize; i++) {
10747 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10748 switch (ch) {
10749 case '\'': squote++; osize++; break;
10750 case '"': dquote++; osize++; break;
10751 case '\\': case '\t': case '\r': case '\n':
10752 osize += 2; break;
10753 default:
10754 /* Fast-path ASCII */
10755 if (ch < ' ' || ch == 0x7f)
10756 osize += 4; /* \xHH */
10757 else if (ch < 0x7f)
10758 osize++;
10759 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10760 osize++;
10761 max = ch > max ? ch : max;
10762 }
10763 else if (ch < 0x100)
10764 osize += 4; /* \xHH */
10765 else if (ch < 0x10000)
10766 osize += 6; /* \uHHHH */
10767 else
10768 osize += 10; /* \uHHHHHHHH */
10769 }
10770 }
10771
10772 quote = '\'';
10773 if (squote) {
10774 if (dquote)
10775 /* Both squote and dquote present. Use squote,
10776 and escape them */
10777 osize += squote;
10778 else
10779 quote = '"';
10780 }
10781
10782 repr = PyUnicode_New(osize, max);
10783 if (repr == NULL)
10784 return NULL;
10785 okind = PyUnicode_KIND(repr);
10786 odata = PyUnicode_DATA(repr);
10787
10788 PyUnicode_WRITE(okind, odata, 0, quote);
10789 PyUnicode_WRITE(okind, odata, osize-1, quote);
10790
10791 for (i = 0, o = 1; i < isize; i++) {
10792 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010793
10794 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if ((ch == quote) || (ch == '\\')) {
10796 PyUnicode_WRITE(okind, odata, o++, '\\');
10797 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010798 continue;
10799 }
10800
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010802 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 PyUnicode_WRITE(okind, odata, o++, '\\');
10804 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010805 }
10806 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 PyUnicode_WRITE(okind, odata, o++, '\\');
10808 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010809 }
10810 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 PyUnicode_WRITE(okind, odata, o++, '\\');
10812 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010813 }
10814
10815 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010816 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 PyUnicode_WRITE(okind, odata, o++, '\\');
10818 PyUnicode_WRITE(okind, odata, o++, 'x');
10819 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10820 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010821 }
10822
Georg Brandl559e5d72008-06-11 18:37:52 +000010823 /* Copy ASCII characters as-is */
10824 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010826 }
10827
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010829 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010830 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 (categories Z* and C* except ASCII space)
10832 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010834 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (ch <= 0xff) {
10836 PyUnicode_WRITE(okind, odata, o++, '\\');
10837 PyUnicode_WRITE(okind, odata, o++, 'x');
10838 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10839 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010840 }
10841 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 else if (ch >= 0x10000) {
10843 PyUnicode_WRITE(okind, odata, o++, '\\');
10844 PyUnicode_WRITE(okind, odata, o++, 'U');
10845 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10851 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10852 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010853 }
10854 /* Map 16-bit characters to '\uxxxx' */
10855 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 PyUnicode_WRITE(okind, odata, o++, '\\');
10857 PyUnicode_WRITE(okind, odata, o++, 'u');
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010862 }
10863 }
10864 /* Copy characters as-is */
10865 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010867 }
10868 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010871 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872}
10873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010874PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876\n\
10877Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010878such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879arguments start and end are interpreted as in slice notation.\n\
10880\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010881Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
10883static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Jesus Ceaac451502011-04-20 17:09:23 +020010886 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010887 Py_ssize_t start;
10888 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Jesus Ceaac451502011-04-20 17:09:23 +020010891 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10892 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (PyUnicode_READY(self) == -1)
10896 return NULL;
10897 if (PyUnicode_READY(substring) == -1)
10898 return NULL;
10899
10900 result = any_find_slice(
10901 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10902 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010903 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (result == -2)
10908 return NULL;
10909
Christian Heimes217cfd12007-12-02 14:31:20 +000010910 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911}
10912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010916Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920{
Jesus Ceaac451502011-04-20 17:09:23 +020010921 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010922 Py_ssize_t start;
10923 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Jesus Ceaac451502011-04-20 17:09:23 +020010926 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10927 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (PyUnicode_READY(self) == -1)
10931 return NULL;
10932 if (PyUnicode_READY(substring) == -1)
10933 return NULL;
10934
10935 result = any_find_slice(
10936 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10937 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010938 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
10940 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 if (result == -2)
10943 return NULL;
10944
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 if (result < 0) {
10946 PyErr_SetString(PyExc_ValueError, "substring not found");
10947 return NULL;
10948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949
Christian Heimes217cfd12007-12-02 14:31:20 +000010950 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951}
10952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010956Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010957done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
10960unicode_rjust(PyUnicodeObject *self, PyObject *args)
10961{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010962 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 Py_UCS4 fillchar = ' ';
10964
10965 if (PyUnicode_READY(self) == -1)
10966 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010967
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010968 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 return NULL;
10970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 Py_INCREF(self);
10973 return (PyObject*) self;
10974 }
10975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977}
10978
Alexander Belopolsky40018472011-02-26 01:02:56 +000010979PyObject *
10980PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
10982 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010983
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984 s = PyUnicode_FromObject(s);
10985 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010986 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 if (sep != NULL) {
10988 sep = PyUnicode_FromObject(sep);
10989 if (sep == NULL) {
10990 Py_DECREF(s);
10991 return NULL;
10992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 }
10994
10995 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10996
10997 Py_DECREF(s);
10998 Py_XDECREF(sep);
10999 return result;
11000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004\n\
11005Return a list of the words in S, using sep as the\n\
11006delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011007splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011008whitespace string is a separator and empty strings are\n\
11009removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
11011static PyObject*
11012unicode_split(PyUnicodeObject *self, PyObject *args)
11013{
11014 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011015 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016
Martin v. Löwis18e16552006-02-15 17:27:45 +000011017 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 return NULL;
11019
11020 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026}
11027
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028PyObject *
11029PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11030{
11031 PyObject* str_obj;
11032 PyObject* sep_obj;
11033 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 int kind1, kind2, kind;
11035 void *buf1 = NULL, *buf2 = NULL;
11036 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011037
11038 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011043 Py_DECREF(str_obj);
11044 return NULL;
11045 }
11046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 kind1 = PyUnicode_KIND(str_in);
11048 kind2 = PyUnicode_KIND(sep_obj);
11049 kind = kind1 > kind2 ? kind1 : kind2;
11050 buf1 = PyUnicode_DATA(str_in);
11051 if (kind1 != kind)
11052 buf1 = _PyUnicode_AsKind(str_in, kind);
11053 if (!buf1)
11054 goto onError;
11055 buf2 = PyUnicode_DATA(sep_obj);
11056 if (kind2 != kind)
11057 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11058 if (!buf2)
11059 goto onError;
11060 len1 = PyUnicode_GET_LENGTH(str_obj);
11061 len2 = PyUnicode_GET_LENGTH(sep_obj);
11062
11063 switch(PyUnicode_KIND(str_in)) {
11064 case PyUnicode_1BYTE_KIND:
11065 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11066 break;
11067 case PyUnicode_2BYTE_KIND:
11068 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11069 break;
11070 case PyUnicode_4BYTE_KIND:
11071 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11072 break;
11073 default:
11074 assert(0);
11075 out = 0;
11076 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011077
11078 Py_DECREF(sep_obj);
11079 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (kind1 != kind)
11081 PyMem_Free(buf1);
11082 if (kind2 != kind)
11083 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011084
11085 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 onError:
11087 Py_DECREF(sep_obj);
11088 Py_DECREF(str_obj);
11089 if (kind1 != kind && buf1)
11090 PyMem_Free(buf1);
11091 if (kind2 != kind && buf2)
11092 PyMem_Free(buf2);
11093 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094}
11095
11096
11097PyObject *
11098PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11099{
11100 PyObject* str_obj;
11101 PyObject* sep_obj;
11102 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 int kind1, kind2, kind;
11104 void *buf1 = NULL, *buf2 = NULL;
11105 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011106
11107 str_obj = PyUnicode_FromObject(str_in);
11108 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011110 sep_obj = PyUnicode_FromObject(sep_in);
11111 if (!sep_obj) {
11112 Py_DECREF(str_obj);
11113 return NULL;
11114 }
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 kind1 = PyUnicode_KIND(str_in);
11117 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011118 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 buf1 = PyUnicode_DATA(str_in);
11120 if (kind1 != kind)
11121 buf1 = _PyUnicode_AsKind(str_in, kind);
11122 if (!buf1)
11123 goto onError;
11124 buf2 = PyUnicode_DATA(sep_obj);
11125 if (kind2 != kind)
11126 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11127 if (!buf2)
11128 goto onError;
11129 len1 = PyUnicode_GET_LENGTH(str_obj);
11130 len2 = PyUnicode_GET_LENGTH(sep_obj);
11131
11132 switch(PyUnicode_KIND(str_in)) {
11133 case PyUnicode_1BYTE_KIND:
11134 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11135 break;
11136 case PyUnicode_2BYTE_KIND:
11137 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11138 break;
11139 case PyUnicode_4BYTE_KIND:
11140 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11141 break;
11142 default:
11143 assert(0);
11144 out = 0;
11145 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146
11147 Py_DECREF(sep_obj);
11148 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 if (kind1 != kind)
11150 PyMem_Free(buf1);
11151 if (kind2 != kind)
11152 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011153
11154 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 onError:
11156 Py_DECREF(sep_obj);
11157 Py_DECREF(str_obj);
11158 if (kind1 != kind && buf1)
11159 PyMem_Free(buf1);
11160 if (kind2 != kind && buf2)
11161 PyMem_Free(buf2);
11162 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163}
11164
11165PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011168Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011170found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
11172static PyObject*
11173unicode_partition(PyUnicodeObject *self, PyObject *separator)
11174{
11175 return PyUnicode_Partition((PyObject *)self, separator);
11176}
11177
11178PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011179 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011181Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011183separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184
11185static PyObject*
11186unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11187{
11188 return PyUnicode_RPartition((PyObject *)self, separator);
11189}
11190
Alexander Belopolsky40018472011-02-26 01:02:56 +000011191PyObject *
11192PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011193{
11194 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011195
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011196 s = PyUnicode_FromObject(s);
11197 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011198 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 if (sep != NULL) {
11200 sep = PyUnicode_FromObject(sep);
11201 if (sep == NULL) {
11202 Py_DECREF(s);
11203 return NULL;
11204 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011205 }
11206
11207 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11208
11209 Py_DECREF(s);
11210 Py_XDECREF(sep);
11211 return result;
11212}
11213
11214PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011216\n\
11217Return a list of the words in S, using sep as the\n\
11218delimiter string, starting at the end of the string and\n\
11219working to the front. If maxsplit is given, at most maxsplit\n\
11220splits are done. If sep is not specified, any whitespace string\n\
11221is a separator.");
11222
11223static PyObject*
11224unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11225{
11226 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011227 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011228
Martin v. Löwis18e16552006-02-15 17:27:45 +000011229 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011230 return NULL;
11231
11232 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011234 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011236 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242\n\
11243Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011244Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
11247static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011248unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011250 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011251 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011253 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11254 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 return NULL;
11256
Guido van Rossum86662912000-04-11 15:38:46 +000011257 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
11260static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011261PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Walter Dörwald346737f2007-05-31 10:44:43 +000011263 if (PyUnicode_CheckExact(self)) {
11264 Py_INCREF(self);
11265 return self;
11266 } else
11267 /* Subtype -- return genuine unicode string with the same value. */
11268 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11269 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270}
11271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274\n\
11275Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
11278static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011279unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 return fixup(self, fixswapcase);
11282}
11283
Georg Brandlceee0772007-11-27 23:48:05 +000011284PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011286\n\
11287Return a translation table usable for str.translate().\n\
11288If there is only one argument, it must be a dictionary mapping Unicode\n\
11289ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011290Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011291If there are two arguments, they must be strings of equal length, and\n\
11292in the resulting dictionary, each character in x will be mapped to the\n\
11293character at the same position in y. If there is a third argument, it\n\
11294must be a string, whose characters will be mapped to None in the result.");
11295
11296static PyObject*
11297unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11298{
11299 PyObject *x, *y = NULL, *z = NULL;
11300 PyObject *new = NULL, *key, *value;
11301 Py_ssize_t i = 0;
11302 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011303
Georg Brandlceee0772007-11-27 23:48:05 +000011304 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11305 return NULL;
11306 new = PyDict_New();
11307 if (!new)
11308 return NULL;
11309 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 int x_kind, y_kind, z_kind;
11311 void *x_data, *y_data, *z_data;
11312
Georg Brandlceee0772007-11-27 23:48:05 +000011313 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011314 if (!PyUnicode_Check(x)) {
11315 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11316 "be a string if there is a second argument");
11317 goto err;
11318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011320 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11321 "arguments must have equal length");
11322 goto err;
11323 }
11324 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 x_kind = PyUnicode_KIND(x);
11326 y_kind = PyUnicode_KIND(y);
11327 x_data = PyUnicode_DATA(x);
11328 y_data = PyUnicode_DATA(y);
11329 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11330 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11331 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011332 if (!key || !value)
11333 goto err;
11334 res = PyDict_SetItem(new, key, value);
11335 Py_DECREF(key);
11336 Py_DECREF(value);
11337 if (res < 0)
11338 goto err;
11339 }
11340 /* create entries for deleting chars in z */
11341 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 z_kind = PyUnicode_KIND(z);
11343 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011344 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011346 if (!key)
11347 goto err;
11348 res = PyDict_SetItem(new, key, Py_None);
11349 Py_DECREF(key);
11350 if (res < 0)
11351 goto err;
11352 }
11353 }
11354 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 int kind;
11356 void *data;
11357
Georg Brandlceee0772007-11-27 23:48:05 +000011358 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011359 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011360 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11361 "to maketrans it must be a dict");
11362 goto err;
11363 }
11364 /* copy entries into the new dict, converting string keys to int keys */
11365 while (PyDict_Next(x, &i, &key, &value)) {
11366 if (PyUnicode_Check(key)) {
11367 /* convert string keys to integer keys */
11368 PyObject *newkey;
11369 if (PyUnicode_GET_SIZE(key) != 1) {
11370 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11371 "table must be of length 1");
11372 goto err;
11373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 kind = PyUnicode_KIND(key);
11375 data = PyUnicode_DATA(key);
11376 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011377 if (!newkey)
11378 goto err;
11379 res = PyDict_SetItem(new, newkey, value);
11380 Py_DECREF(newkey);
11381 if (res < 0)
11382 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011383 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011384 /* just keep integer keys */
11385 if (PyDict_SetItem(new, key, value) < 0)
11386 goto err;
11387 } else {
11388 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11389 "be strings or integers");
11390 goto err;
11391 }
11392 }
11393 }
11394 return new;
11395 err:
11396 Py_DECREF(new);
11397 return NULL;
11398}
11399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402\n\
11403Return a copy of the string S, where all characters have been mapped\n\
11404through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011405Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011406Unmapped characters are left untouched. Characters mapped to None\n\
11407are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
11409static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011421unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return fixup(self, fixupper);
11424}
11425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011429Pad a numeric string S with zeros on the left, to fill a field\n\
11430of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
11432static PyObject *
11433unicode_zfill(PyUnicodeObject *self, PyObject *args)
11434{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011435 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011437 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 int kind;
11439 void *data;
11440 Py_UCS4 chr;
11441
11442 if (PyUnicode_READY(self) == -1)
11443 return NULL;
11444
Martin v. Löwis18e16552006-02-15 17:27:45 +000011445 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 return NULL;
11447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011449 if (PyUnicode_CheckExact(self)) {
11450 Py_INCREF(self);
11451 return (PyObject*) self;
11452 }
11453 else
11454 return PyUnicode_FromUnicode(
11455 PyUnicode_AS_UNICODE(self),
11456 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 }
11459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462 u = pad(self, fill, 0, '0');
11463
Walter Dörwald068325e2002-04-15 13:36:47 +000011464 if (u == NULL)
11465 return NULL;
11466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 kind = PyUnicode_KIND(u);
11468 data = PyUnicode_DATA(u);
11469 chr = PyUnicode_READ(kind, data, fill);
11470
11471 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 PyUnicode_WRITE(kind, data, 0, chr);
11474 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
11476
11477 return (PyObject*) u;
11478}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
#if 0
/* Compiled out.  Thin wrapper that would return the result of
 * PyUnicode_TransformDecimalAndSpaceToASCII() applied to `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011491Return True if S starts with the specified prefix, False otherwise.\n\
11492With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011493With optional end, stop comparing S at that position.\n\
11494prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject *
11497unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011500 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011502 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011503 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011504 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
Jesus Ceaac451502011-04-20 17:09:23 +020011506 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011508 if (PyTuple_Check(subobj)) {
11509 Py_ssize_t i;
11510 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11511 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011513 if (substring == NULL)
11514 return NULL;
11515 result = tailmatch(self, substring, start, end, -1);
11516 Py_DECREF(substring);
11517 if (result) {
11518 Py_RETURN_TRUE;
11519 }
11520 }
11521 /* nothing matched */
11522 Py_RETURN_FALSE;
11523 }
11524 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011525 if (substring == NULL) {
11526 if (PyErr_ExceptionMatches(PyExc_TypeError))
11527 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11528 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011530 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011531 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011533 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534}
11535
11536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011537PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011540Return True if S ends with the specified suffix, False otherwise.\n\
11541With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542With optional end, stop comparing S at that position.\n\
11543suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
11545static PyObject *
11546unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011549 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011551 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011552 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011553 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
Jesus Ceaac451502011-04-20 17:09:23 +020011555 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011557 if (PyTuple_Check(subobj)) {
11558 Py_ssize_t i;
11559 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11560 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011562 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011564 result = tailmatch(self, substring, start, end, +1);
11565 Py_DECREF(substring);
11566 if (result) {
11567 Py_RETURN_TRUE;
11568 }
11569 }
11570 Py_RETURN_FALSE;
11571 }
11572 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011573 if (substring == NULL) {
11574 if (PyErr_ExceptionMatches(PyExc_TypeError))
11575 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11576 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011578 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011579 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011581 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582}
11583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011585
11586PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011588\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011589Return a formatted version of S, using substitutions from args and kwargs.\n\
11590The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011591
Eric Smith27bbca62010-11-04 17:06:58 +000011592PyDoc_STRVAR(format_map__doc__,
11593 "S.format_map(mapping) -> str\n\
11594\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011595Return a formatted version of S, using substitutions from mapping.\n\
11596The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011597
Eric Smith4a7d76d2008-05-30 18:10:19 +000011598static PyObject *
11599unicode__format__(PyObject* self, PyObject* args)
11600{
11601 PyObject *format_spec;
11602
11603 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11604 return NULL;
11605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11607 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011608}
11609
Eric Smith8c663262007-08-25 02:26:07 +000011610PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011612\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011613Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011614
11615static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011616unicode__sizeof__(PyUnicodeObject *v)
11617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 Py_ssize_t size;
11619
11620 /* If it's a compact object, account for base structure +
11621 character data. */
11622 if (PyUnicode_IS_COMPACT_ASCII(v))
11623 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11624 else if (PyUnicode_IS_COMPACT(v))
11625 size = sizeof(PyCompactUnicodeObject) +
11626 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11627 else {
11628 /* If it is a two-block object, account for base object, and
11629 for character block if present. */
11630 size = sizeof(PyUnicodeObject);
11631 if (v->data.any)
11632 size += (PyUnicode_GET_LENGTH(v) + 1) *
11633 PyUnicode_CHARACTER_SIZE(v);
11634 }
11635 /* If the wstr pointer is present, account for it unless it is shared
11636 with the data pointer. Since PyUnicode_DATA will crash if the object
11637 is not ready, check whether it's either not ready (in which case the
11638 data is entirely in wstr) or if the data is not shared. */
11639 if (_PyUnicode_WSTR(v) &&
11640 (!PyUnicode_IS_READY(v) ||
11641 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11642 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11643 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11644 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11645
11646 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011647}
11648
11649PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011651
11652static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011653unicode_getnewargs(PyUnicodeObject *v)
11654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 PyObject *copy;
11656 unsigned char *data;
11657 int kind;
11658 if (PyUnicode_READY(v) == -1)
11659 return NULL;
11660 kind = PyUnicode_KIND(v);
11661 data = PyUnicode_1BYTE_DATA(v);
11662 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11663 if (!copy)
11664 return NULL;
11665 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011666}
11667
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668static PyMethodDef unicode_methods[] = {
11669
11670 /* Order is according to common usage: often used methods should
11671 appear first, since lookup is done sequentially. */
11672
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011673 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011674 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11675 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011676 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011677 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11678 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11679 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11680 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11681 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11682 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11683 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011684 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011685 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11686 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11687 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011688 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011689 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11690 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11691 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011693 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011694 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011695 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011696 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11697 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11698 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11699 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11700 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11701 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11702 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11703 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11704 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11705 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11706 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11707 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11708 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11709 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011710 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011711 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011712 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011713 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011714 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011715 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011716 {"maketrans", (PyCFunction) unicode_maketrans,
11717 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011718 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011719#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011720 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721#endif
11722
11723#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011724 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011725 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726#endif
11727
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729 {NULL, NULL}
11730};
11731
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011732static PyObject *
11733unicode_mod(PyObject *v, PyObject *w)
11734{
Brian Curtindfc80e32011-08-10 20:28:54 -050011735 if (!PyUnicode_Check(v))
11736 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011738}
11739
11740static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011741 0, /*nb_add*/
11742 0, /*nb_subtract*/
11743 0, /*nb_multiply*/
11744 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011745};
11746
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 (lenfunc) unicode_length, /* sq_length */
11749 PyUnicode_Concat, /* sq_concat */
11750 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11751 (ssizeargfunc) unicode_getitem, /* sq_item */
11752 0, /* sq_slice */
11753 0, /* sq_ass_item */
11754 0, /* sq_ass_slice */
11755 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756};
11757
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011758static PyObject*
11759unicode_subscript(PyUnicodeObject* self, PyObject* item)
11760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (PyUnicode_READY(self) == -1)
11762 return NULL;
11763
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011764 if (PyIndex_Check(item)) {
11765 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011766 if (i == -1 && PyErr_Occurred())
11767 return NULL;
11768 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011770 return unicode_getitem(self, i);
11771 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011772 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011774 Py_UNICODE* result_buf;
11775 PyObject* result;
11776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011779 return NULL;
11780 }
11781
11782 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 return PyUnicode_New(0, 0);
11784 } else if (start == 0 && step == 1 &&
11785 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011786 PyUnicode_CheckExact(self)) {
11787 Py_INCREF(self);
11788 return (PyObject *)self;
11789 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011791 } else {
11792 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011793 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11794 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 if (result_buf == NULL)
11797 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011798
11799 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11800 result_buf[i] = source_buf[cur];
11801 }
Tim Petersced69f82003-09-16 20:30:58 +000011802
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011803 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011804 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011805 return result;
11806 }
11807 } else {
11808 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11809 return NULL;
11810 }
11811}
11812
11813static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 (lenfunc)unicode_length, /* mp_length */
11815 (binaryfunc)unicode_subscript, /* mp_subscript */
11816 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011817};
11818
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820/* Helpers for PyUnicode_Format() */
11821
11822static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011823getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011825 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 (*p_argidx)++;
11828 if (arglen < 0)
11829 return args;
11830 else
11831 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 }
11833 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 return NULL;
11836}
11837
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011838/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011840static PyObject *
11841formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011843 char *p;
11844 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 x = PyFloat_AsDouble(v);
11848 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011849 return NULL;
11850
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011853
Eric Smith0923d1d2009-04-16 20:16:10 +000011854 p = PyOS_double_to_string(x, type, prec,
11855 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011856 if (p == NULL)
11857 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011859 PyMem_Free(p);
11860 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861}
11862
Tim Peters38fd5b62000-09-21 05:43:11 +000011863static PyObject*
11864formatlong(PyObject *val, int flags, int prec, int type)
11865{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011866 char *buf;
11867 int len;
11868 PyObject *str; /* temporary string object. */
11869 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011870
Benjamin Peterson14339b62009-01-31 16:36:08 +000011871 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11872 if (!str)
11873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 Py_DECREF(str);
11876 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011877}
11878
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011881 size_t buflen,
11882 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011884 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011885 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (PyUnicode_GET_LENGTH(v) == 1) {
11887 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 buf[1] = '\0';
11889 return 1;
11890 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 goto onError;
11892 }
11893 else {
11894 /* Integer input truncated to a character */
11895 long x;
11896 x = PyLong_AsLong(v);
11897 if (x == -1 && PyErr_Occurred())
11898 goto onError;
11899
11900 if (x < 0 || x > 0x10ffff) {
11901 PyErr_SetString(PyExc_OverflowError,
11902 "%c arg not in range(0x110000)");
11903 return -1;
11904 }
11905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011907 buf[1] = '\0';
11908 return 1;
11909 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011910
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011912 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011914 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is wrapped in parentheses so that it remains a single
   operand wherever the macro is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011921
Alexander Belopolsky40018472011-02-26 01:02:56 +000011922PyObject *
11923PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 void *fmt;
11926 int fmtkind;
11927 PyObject *result;
11928 Py_UCS4 *res, *res0;
11929 Py_UCS4 max;
11930 int kind;
11931 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011935
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 PyErr_BadInternalCall();
11938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11941 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 fmt = PyUnicode_DATA(uformat);
11944 fmtkind = PyUnicode_KIND(uformat);
11945 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11946 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
11948 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11950 if (res0 == NULL) {
11951 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
11955 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 arglen = PyTuple_Size(args);
11957 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 }
11959 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 arglen = -1;
11961 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011963 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011964 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
11967 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 if (--rescnt < 0) {
11970 rescnt = fmtcnt + 100;
11971 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11973 if (res0 == NULL){
11974 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 }
11977 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 }
11982 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 /* Got a format specifier */
11984 int flags = 0;
11985 Py_ssize_t width = -1;
11986 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 Py_UCS4 c = '\0';
11988 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 int isnumok;
11990 PyObject *v = NULL;
11991 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 void *pbuf;
11993 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 Py_ssize_t len, len1;
11996 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 fmtpos++;
11999 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12000 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 Py_ssize_t keylen;
12002 PyObject *key;
12003 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012004
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 if (dict == NULL) {
12006 PyErr_SetString(PyExc_TypeError,
12007 "format requires a mapping");
12008 goto onError;
12009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 /* Skip over balanced parentheses */
12014 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 if (fmtcnt < 0 || pcount > 0) {
12023 PyErr_SetString(PyExc_ValueError,
12024 "incomplete format key");
12025 goto onError;
12026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 if (key == NULL)
12029 goto onError;
12030 if (args_owned) {
12031 Py_DECREF(args);
12032 args_owned = 0;
12033 }
12034 args = PyObject_GetItem(dict, key);
12035 Py_DECREF(key);
12036 if (args == NULL) {
12037 goto onError;
12038 }
12039 args_owned = 1;
12040 arglen = -1;
12041 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012042 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 case '-': flags |= F_LJUST; continue;
12046 case '+': flags |= F_SIGN; continue;
12047 case ' ': flags |= F_BLANK; continue;
12048 case '#': flags |= F_ALT; continue;
12049 case '0': flags |= F_ZERO; continue;
12050 }
12051 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012052 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 if (c == '*') {
12054 v = getnextarg(args, arglen, &argidx);
12055 if (v == NULL)
12056 goto onError;
12057 if (!PyLong_Check(v)) {
12058 PyErr_SetString(PyExc_TypeError,
12059 "* wants int");
12060 goto onError;
12061 }
12062 width = PyLong_AsLong(v);
12063 if (width == -1 && PyErr_Occurred())
12064 goto onError;
12065 if (width < 0) {
12066 flags |= F_LJUST;
12067 width = -width;
12068 }
12069 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 }
12072 else if (c >= '0' && c <= '9') {
12073 width = c - '0';
12074 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 if (c < '0' || c > '9')
12077 break;
12078 if ((width*10) / 10 != width) {
12079 PyErr_SetString(PyExc_ValueError,
12080 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012081 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 }
12083 width = width*10 + (c - '0');
12084 }
12085 }
12086 if (c == '.') {
12087 prec = 0;
12088 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 if (c == '*') {
12091 v = getnextarg(args, arglen, &argidx);
12092 if (v == NULL)
12093 goto onError;
12094 if (!PyLong_Check(v)) {
12095 PyErr_SetString(PyExc_TypeError,
12096 "* wants int");
12097 goto onError;
12098 }
12099 prec = PyLong_AsLong(v);
12100 if (prec == -1 && PyErr_Occurred())
12101 goto onError;
12102 if (prec < 0)
12103 prec = 0;
12104 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 }
12107 else if (c >= '0' && c <= '9') {
12108 prec = c - '0';
12109 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 if (c < '0' || c > '9')
12112 break;
12113 if ((prec*10) / 10 != prec) {
12114 PyErr_SetString(PyExc_ValueError,
12115 "prec too big");
12116 goto onError;
12117 }
12118 prec = prec*10 + (c - '0');
12119 }
12120 }
12121 } /* prec */
12122 if (fmtcnt >= 0) {
12123 if (c == 'h' || c == 'l' || c == 'L') {
12124 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 }
12127 }
12128 if (fmtcnt < 0) {
12129 PyErr_SetString(PyExc_ValueError,
12130 "incomplete format");
12131 goto onError;
12132 }
12133 if (c != '%') {
12134 v = getnextarg(args, arglen, &argidx);
12135 if (v == NULL)
12136 goto onError;
12137 }
12138 sign = 0;
12139 fill = ' ';
12140 switch (c) {
12141
12142 case '%':
12143 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 len = 1;
12148 break;
12149
12150 case 's':
12151 case 'r':
12152 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012153 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 temp = v;
12155 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012156 }
12157 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 if (c == 's')
12159 temp = PyObject_Str(v);
12160 else if (c == 'r')
12161 temp = PyObject_Repr(v);
12162 else
12163 temp = PyObject_ASCII(v);
12164 if (temp == NULL)
12165 goto onError;
12166 if (PyUnicode_Check(temp))
12167 /* nothing to do */;
12168 else {
12169 Py_DECREF(temp);
12170 PyErr_SetString(PyExc_TypeError,
12171 "%s argument has non-string str()");
12172 goto onError;
12173 }
12174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (PyUnicode_READY(temp) == -1) {
12176 Py_CLEAR(temp);
12177 goto onError;
12178 }
12179 pbuf = PyUnicode_DATA(temp);
12180 kind = PyUnicode_KIND(temp);
12181 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 if (prec >= 0 && len > prec)
12183 len = prec;
12184 break;
12185
12186 case 'i':
12187 case 'd':
12188 case 'u':
12189 case 'o':
12190 case 'x':
12191 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 isnumok = 0;
12193 if (PyNumber_Check(v)) {
12194 PyObject *iobj=NULL;
12195
12196 if (PyLong_Check(v)) {
12197 iobj = v;
12198 Py_INCREF(iobj);
12199 }
12200 else {
12201 iobj = PyNumber_Long(v);
12202 }
12203 if (iobj!=NULL) {
12204 if (PyLong_Check(iobj)) {
12205 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012206 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 Py_DECREF(iobj);
12208 if (!temp)
12209 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (PyUnicode_READY(temp) == -1) {
12211 Py_CLEAR(temp);
12212 goto onError;
12213 }
12214 pbuf = PyUnicode_DATA(temp);
12215 kind = PyUnicode_KIND(temp);
12216 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 sign = 1;
12218 }
12219 else {
12220 Py_DECREF(iobj);
12221 }
12222 }
12223 }
12224 if (!isnumok) {
12225 PyErr_Format(PyExc_TypeError,
12226 "%%%c format: a number is required, "
12227 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12228 goto onError;
12229 }
12230 if (flags & F_ZERO)
12231 fill = '0';
12232 break;
12233
12234 case 'e':
12235 case 'E':
12236 case 'f':
12237 case 'F':
12238 case 'g':
12239 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012240 temp = formatfloat(v, flags, prec, c);
12241 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (PyUnicode_READY(temp) == -1) {
12244 Py_CLEAR(temp);
12245 goto onError;
12246 }
12247 pbuf = PyUnicode_DATA(temp);
12248 kind = PyUnicode_KIND(temp);
12249 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 sign = 1;
12251 if (flags & F_ZERO)
12252 fill = '0';
12253 break;
12254
12255 case 'c':
12256 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012258 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 if (len < 0)
12260 goto onError;
12261 break;
12262
12263 default:
12264 PyErr_Format(PyExc_ValueError,
12265 "unsupported format character '%c' (0x%x) "
12266 "at index %zd",
12267 (31<=c && c<=126) ? (char)c : '?',
12268 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 goto onError;
12271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 /* pbuf is initialized here. */
12273 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12276 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12277 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 len--;
12279 }
12280 else if (flags & F_SIGN)
12281 sign = '+';
12282 else if (flags & F_BLANK)
12283 sign = ' ';
12284 else
12285 sign = 0;
12286 }
12287 if (width < len)
12288 width = len;
12289 if (rescnt - (sign != 0) < width) {
12290 reslen -= rescnt;
12291 rescnt = width + fmtcnt + 100;
12292 reslen += rescnt;
12293 if (reslen < 0) {
12294 Py_XDECREF(temp);
12295 PyErr_NoMemory();
12296 goto onError;
12297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12299 if (res0 == 0) {
12300 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 Py_XDECREF(temp);
12302 goto onError;
12303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 }
12306 if (sign) {
12307 if (fill != ' ')
12308 *res++ = sign;
12309 rescnt--;
12310 if (width > len)
12311 width--;
12312 }
12313 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12315 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12318 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 }
12320 rescnt -= 2;
12321 width -= 2;
12322 if (width < 0)
12323 width = 0;
12324 len -= 2;
12325 }
12326 if (width > len && !(flags & F_LJUST)) {
12327 do {
12328 --rescnt;
12329 *res++ = fill;
12330 } while (--width > len);
12331 }
12332 if (fill == ' ') {
12333 if (sign)
12334 *res++ = sign;
12335 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12337 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12338 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12339 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 }
12341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 /* Copy all characters, preserving len */
12343 len1 = len;
12344 while (len1--) {
12345 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12346 rescnt--;
12347 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 while (--width >= len) {
12349 --rescnt;
12350 *res++ = ' ';
12351 }
12352 if (dict && (argidx < arglen) && c != '%') {
12353 PyErr_SetString(PyExc_TypeError,
12354 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012355 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 goto onError;
12357 }
12358 Py_XDECREF(temp);
12359 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360 } /* until end */
12361 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 PyErr_SetString(PyExc_TypeError,
12363 "not all arguments converted during string formatting");
12364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365 }
12366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367
12368 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12369 if (*res > max)
12370 max = *res;
12371 result = PyUnicode_New(reslen - rescnt, max);
12372 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 kind = PyUnicode_KIND(result);
12375 for (res = res0; res < res0+reslen-rescnt; res++)
12376 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12377 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 }
12381 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 return (PyObject *)result;
12383
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 Py_DECREF(uformat);
12387 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389 }
12390 return NULL;
12391}
12392
Jeremy Hylton938ace62002-07-17 16:30:39 +000012393static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012394unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12395
Tim Peters6d6c1a32001-08-02 04:15:00 +000012396static PyObject *
12397unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12398{
Benjamin Peterson29060642009-01-31 22:14:21 +000012399 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012400 static char *kwlist[] = {"object", "encoding", "errors", 0};
12401 char *encoding = NULL;
12402 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012403
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 if (type != &PyUnicode_Type)
12405 return unicode_subtype_new(type, args, kwds);
12406 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 return NULL;
12409 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012411 if (encoding == NULL && errors == NULL)
12412 return PyObject_Str(x);
12413 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012415}
12416
Guido van Rossume023fe02001-08-30 03:12:59 +000012417static PyObject *
12418unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12419{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012420 PyUnicodeObject *tmp, *pnew;
12421 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012423
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12425 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12426 if (tmp == NULL)
12427 return NULL;
12428 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12430 // it seems kind of strange that tp_alloc gets passed the size
12431 // of the unicode string because there will follow another
12432 // malloc.
12433 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12434 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 if (pnew == NULL) {
12436 Py_DECREF(tmp);
12437 return NULL;
12438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12440 if (_PyUnicode_WSTR(pnew) == NULL) {
12441 err = PyErr_NoMemory();
12442 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12445 _PyUnicode_WSTR_LENGTH(pnew) = n;
12446 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12447 _PyUnicode_STATE(pnew).interned = 0;
12448 _PyUnicode_STATE(pnew).kind = 0;
12449 _PyUnicode_STATE(pnew).compact = 0;
12450 _PyUnicode_STATE(pnew).ready = 0;
12451 _PyUnicode_STATE(pnew).ascii = 0;
12452 pnew->data.any = NULL;
12453 _PyUnicode_LENGTH(pnew) = 0;
12454 pnew->_base.utf8 = NULL;
12455 pnew->_base.utf8_length = 0;
12456
12457 if (PyUnicode_READY(pnew) == -1) {
12458 PyObject_FREE(_PyUnicode_WSTR(pnew));
12459 goto onError;
12460 }
12461
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 Py_DECREF(tmp);
12463 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464
12465 onError:
12466 _Py_ForgetReference((PyObject *)pnew);
12467 PyObject_Del(pnew);
12468 Py_DECREF(tmp);
12469 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012470}
12471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012472PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012474\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012475Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012476encoding defaults to the current default string encoding.\n\
12477errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012478
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012479static PyObject *unicode_iter(PyObject *seq);
12480
/* Type object for str.  Slots holding 0 are left unset in this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
12524
12525/* Initialize the Unicode implementation */
12526
Thomas Wouters78890102000-07-22 19:25:51 +000012527void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012529 int i;
12530
Thomas Wouters477c8d52006-05-27 19:21:47 +000012531 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012533 0x000A, /* LINE FEED */
12534 0x000D, /* CARRIAGE RETURN */
12535 0x001C, /* FILE SEPARATOR */
12536 0x001D, /* GROUP SEPARATOR */
12537 0x001E, /* RECORD SEPARATOR */
12538 0x0085, /* NEXT LINE */
12539 0x2028, /* LINE SEPARATOR */
12540 0x2029, /* PARAGRAPH SEPARATOR */
12541 };
12542
Fred Drakee4315f52000-05-09 19:53:39 +000012543 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012545 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012547
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012548 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012550 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012552
12553 /* initialize the linebreak bloom filter */
12554 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012556 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012557
12558 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559}
12560
12561/* Finalize the Unicode implementation */
12562
/* Nothing is released by this implementation; the reported count is
   always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12568
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569void
Thomas Wouters78890102000-07-22 19:25:51 +000012570_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012572 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012574 Py_XDECREF(unicode_empty);
12575 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012576
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012577 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 if (unicode_latin1[i]) {
12579 Py_DECREF(unicode_latin1[i]);
12580 unicode_latin1[i] = NULL;
12581 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012582 }
Christian Heimesa156e092008-02-16 07:38:31 +000012583 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012585
Walter Dörwald16807132007-05-25 13:52:07 +000012586void
12587PyUnicode_InternInPlace(PyObject **p)
12588{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012589 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12590 PyObject *t;
12591 if (s == NULL || !PyUnicode_Check(s))
12592 Py_FatalError(
12593 "PyUnicode_InternInPlace: unicode strings only please!");
12594 /* If it's a subclass, we don't really know what putting
12595 it in the interned dict might do. */
12596 if (!PyUnicode_CheckExact(s))
12597 return;
12598 if (PyUnicode_CHECK_INTERNED(s))
12599 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 if (PyUnicode_READY(s) == -1) {
12601 assert(0 && "ready fail in intern...");
12602 return;
12603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012604 if (interned == NULL) {
12605 interned = PyDict_New();
12606 if (interned == NULL) {
12607 PyErr_Clear(); /* Don't leave an exception */
12608 return;
12609 }
12610 }
12611 /* It might be that the GetItem call fails even
12612 though the key is present in the dictionary,
12613 namely when this happens during a stack overflow. */
12614 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012616 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012617
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 if (t) {
12619 Py_INCREF(t);
12620 Py_DECREF(*p);
12621 *p = t;
12622 return;
12623 }
Walter Dörwald16807132007-05-25 13:52:07 +000012624
Benjamin Peterson14339b62009-01-31 16:36:08 +000012625 PyThreadState_GET()->recursion_critical = 1;
12626 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12627 PyErr_Clear();
12628 PyThreadState_GET()->recursion_critical = 0;
12629 return;
12630 }
12631 PyThreadState_GET()->recursion_critical = 0;
12632 /* The two references in interned are not counted by refcnt.
12633 The deallocator will take care of this */
12634 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012636}
12637
12638void
12639PyUnicode_InternImmortal(PyObject **p)
12640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12642
Benjamin Peterson14339b62009-01-31 16:36:08 +000012643 PyUnicode_InternInPlace(p);
12644 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 Py_INCREF(*p);
12647 }
Walter Dörwald16807132007-05-25 13:52:07 +000012648}
12649
12650PyObject *
12651PyUnicode_InternFromString(const char *cp)
12652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012653 PyObject *s = PyUnicode_FromString(cp);
12654 if (s == NULL)
12655 return NULL;
12656 PyUnicode_InternInPlace(&s);
12657 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012658}
12659
Alexander Belopolsky40018472011-02-26 01:02:56 +000012660void
12661_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012662{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012663 PyObject *keys;
12664 PyUnicodeObject *s;
12665 Py_ssize_t i, n;
12666 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012667
Benjamin Peterson14339b62009-01-31 16:36:08 +000012668 if (interned == NULL || !PyDict_Check(interned))
12669 return;
12670 keys = PyDict_Keys(interned);
12671 if (keys == NULL || !PyList_Check(keys)) {
12672 PyErr_Clear();
12673 return;
12674 }
Walter Dörwald16807132007-05-25 13:52:07 +000012675
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12677 detector, interned unicode strings are not forcibly deallocated;
12678 rather, we give them their stolen references back, and then clear
12679 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012680
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 n = PyList_GET_SIZE(keys);
12682 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 for (i = 0; i < n; i++) {
12685 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 if (PyUnicode_READY(s) == -1)
12687 fprintf(stderr, "could not ready string\n");
12688 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012689 case SSTATE_NOT_INTERNED:
12690 /* XXX Shouldn't happen */
12691 break;
12692 case SSTATE_INTERNED_IMMORTAL:
12693 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 break;
12696 case SSTATE_INTERNED_MORTAL:
12697 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 break;
12700 default:
12701 Py_FatalError("Inconsistent interned string state.");
12702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 }
12705 fprintf(stderr, "total size of all interned strings: "
12706 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12707 "mortal/immortal\n", mortal_size, immortal_size);
12708 Py_DECREF(keys);
12709 PyDict_Clear(interned);
12710 Py_DECREF(interned);
12711 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012712}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012713
12714
12715/********************* Unicode Iterator **************************/
12716
/* State of a str iterator: the string being walked and the index of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
12722
12723static void
12724unicodeiter_dealloc(unicodeiterobject *it)
12725{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 _PyObject_GC_UNTRACK(it);
12727 Py_XDECREF(it->it_seq);
12728 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012729}
12730
12731static int
12732unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12733{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 Py_VISIT(it->it_seq);
12735 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012736}
12737
12738static PyObject *
12739unicodeiter_next(unicodeiterobject *it)
12740{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 PyUnicodeObject *seq;
12742 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012743
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 assert(it != NULL);
12745 seq = it->it_seq;
12746 if (seq == NULL)
12747 return NULL;
12748 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12751 int kind = PyUnicode_KIND(seq);
12752 void *data = PyUnicode_DATA(seq);
12753 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12754 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 if (item != NULL)
12756 ++it->it_index;
12757 return item;
12758 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012759
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 Py_DECREF(seq);
12761 it->it_seq = NULL;
12762 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012763}
12764
12765static PyObject *
12766unicodeiter_len(unicodeiterobject *it)
12767{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012768 Py_ssize_t len = 0;
12769 if (it->it_seq)
12770 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12771 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012772}
12773
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; only __length_hint__ is provided. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
12781
/* Type object of the iterator returned by unicode_iter(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
12814
12815static PyObject *
12816unicode_iter(PyObject *seq)
12817{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012819
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 if (!PyUnicode_Check(seq)) {
12821 PyErr_BadInternalCall();
12822 return NULL;
12823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 if (PyUnicode_READY(seq) == -1)
12825 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12827 if (it == NULL)
12828 return NULL;
12829 it->it_index = 0;
12830 Py_INCREF(seq);
12831 it->it_seq = (PyUnicodeObject *)seq;
12832 _PyObject_GC_TRACK(it);
12833 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012834}
12835
/* uniops.h is included twice with different UNIOP()/UNIOP_t settings:
   first for Py_UNICODE (names prefixed Py_UNICODE_), then for Py_UCS4
   (names prefixed Py_UCS4_).  The macros are undefined again after each
   include.
   NOTE(review): presumably uniops.h is a template that defines one set
   of helpers per inclusion -- confirm against that header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012846
Victor Stinner71133ff2010-09-01 23:43:53 +000012847Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012848PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012849{
12850 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12851 Py_UNICODE *copy;
12852 Py_ssize_t size;
12853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 if (!PyUnicode_Check(unicode)) {
12855 PyErr_BadArgument();
12856 return NULL;
12857 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012858 /* Ensure we won't overflow the size. */
12859 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12860 PyErr_NoMemory();
12861 return NULL;
12862 }
12863 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12864 size *= sizeof(Py_UNICODE);
12865 copy = PyMem_Malloc(size);
12866 if (copy == NULL) {
12867 PyErr_NoMemory();
12868 return NULL;
12869 }
12870 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12871 return copy;
12872}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012873
Georg Brandl66c221e2010-10-14 07:04:07 +000012874/* A _string module, to export formatter_parser and formatter_field_name_split
12875 to the string.Formatter class implemented in Python. */
12876
/* Functions exported by the _string module.  Both take a single object
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
12884
/* Module definition handed to PyModule_Create() by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,
    _string_methods,                        /* method table */
    NULL,                                   /* remaining slots are unused */
    NULL,
    NULL,
    NULL
};
12896
12897PyMODINIT_FUNC
12898PyInit__string(void)
12899{
12900 return PyModule_Create(&_string_module);
12901}
12902
12903
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012904#ifdef __cplusplus
12905}
12906#endif