blob: a02c2227fc1f9fc9b31b59020d7ddfe73c05b4dc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters while converting
   each one from one integer type to another.

   from_type and to_type have to be valid type names.  begin and end are
   pointers delimiting the source characters and should be of type
   "from_type *".  to points to the buffer the converted characters are
   written to; it is accessed as "to_type *".  Exactly (end - begin)
   characters are written; each one is converted with a plain cast. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *conv_src_;                     \
        to_type *conv_dst_;                             \
        conv_src_ = (begin);                            \
        conv_dst_ = (to_type *)(to);                    \
        while (conv_src_ < (end)) {                     \
            *conv_dst_ = (to_type)*conv_src_;           \
            ++conv_src_;                                \
            ++conv_dst_;                                \
        }                                               \
    } while (0)
107
/* Pointer to the UTF-8 data of a string: for compact ASCII strings this is
   the memory directly behind the PyASCIIObject header, for every other
   string it is the utf8 member of PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)

/* Length belonging to _PyUnicode_UTF8(): the length member for compact
   ASCII strings, the utf8_length member otherwise. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Unchecked member access; these expand to lvalues and can be assigned to. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked member access: assert that op is a unicode object first. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) \
    do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters.

   The table is indexed by code point (0x00 .. 0x7F); an entry of 1 marks
   the code point as whitespace.  Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Same kind of lookup table for line breaks.

   Indexed by code point (0x00 .. 0x7F); an entry of 1 marks the code point
   as a line break.  Each row below covers 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (as many as are needed to
   index BLOOM_WIDTH bits) as the bit index. */
240
241/* the linebreak mask is set up by Unicode_Init below */
242
/* Number of bits used by a bloom mask: the largest of 128, 64 or 32 that
   fits into an unsigned long (LONG_BIT bits). */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch.
   BLOOM yields a non-zero value if the bit is set, which means "ch may be
   a member"; zero means "ch is definitely not a member".

   Both macros parenthesize `mask` so that an expression argument such as
   `a | b` is treated as one operand instead of being torn apart by the
   higher precedence of `&`. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break.  Code points below 128 are answered from the
   ascii_linebreak table; everything else is pre-filtered through
   bloom_linebreak before the Py_UNICODE_ISLINEBREAK() check is made. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* True if chr occurs in str.  The bloom mask is consulted first, so the
   PyUnicode_FindChar() search over the whole string only runs when the
   mask does not already rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
286substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
287
288static PyObject *
289fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
382*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
/* Counter incremented by PyUnicode_New() in debug builds. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact/ascii flags and the candidate data
   pointers to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
582#if SIZEOF_WCHAR_T == 2
583/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
584 will decode surrogate pairs, the other conversions are implemented as macros
585 for efficency.
586
587 This function assumes that unicode can hold one more code point than wstr
588 characters for a terminating null character. */
589static int
590unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
591 PyUnicodeObject *unicode)
592{
593 const wchar_t *iter;
594 Py_UCS4 *ucs4_out;
595
596 assert(unicode && PyUnicode_Check(unicode));
597 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
598 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
599
600 for (iter = begin; iter < end; ) {
601 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
602 _PyUnicode_GET_LENGTH(unicode)));
603 if (*iter >= 0xD800 && *iter <= 0xDBFF
604 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
605 {
606 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
607 iter += 2;
608 }
609 else {
610 *ucs4_out++ = *iter;
611 iter++;
612 }
613 }
614 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
615 _PyUnicode_GET_LENGTH(unicode)));
616
617 return 0;
618}
619#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
Victor Stinnerb1536152011-09-30 02:26:10 +0200629 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
630 PyErr_BadInternalCall();
631 return -1;
632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633
634 if (PyUnicode_READY(from))
635 return -1;
636 if (PyUnicode_READY(to))
637 return -1;
638
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200639 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200640 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
641 PyErr_Format(PyExc_ValueError,
642 "Cannot write %zi characters at %zi "
643 "in a string of %zi characters",
644 how_many, to_start, PyUnicode_GET_LENGTH(to));
645 return -1;
646 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200647 if (how_many == 0)
648 return 0;
649
650 if (Py_REFCNT(to) != 1) {
651 PyErr_SetString(PyExc_ValueError,
652 "Cannot modify a string having more than 1 reference");
653 return -1;
654 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200655 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200660 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661
662 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200666 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 + PyUnicode_KIND_SIZE(from_kind, from_start),
668 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 else if (from_kind == PyUnicode_1BYTE_KIND
671 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200680 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200681 && to_kind == PyUnicode_4BYTE_KIND)
682 {
683 _PyUnicode_CONVERT_BYTES(
684 Py_UCS1, Py_UCS4,
685 PyUnicode_1BYTE_DATA(from) + from_start,
686 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
687 PyUnicode_4BYTE_DATA(to) + to_start
688 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200689 }
690 else if (from_kind == PyUnicode_2BYTE_KIND
691 && to_kind == PyUnicode_4BYTE_KIND)
692 {
693 _PyUnicode_CONVERT_BYTES(
694 Py_UCS2, Py_UCS4,
695 PyUnicode_2BYTE_DATA(from) + from_start,
696 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
697 PyUnicode_4BYTE_DATA(to) + to_start
698 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200699 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200700 else {
701 int invalid_kinds;
702 if (from_kind > to_kind) {
703 /* slow path to check for character overflow */
704 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
705 Py_UCS4 ch, maxchar;
706 Py_ssize_t i;
707
708 maxchar = 0;
709 invalid_kinds = 0;
710 for (i=0; i < how_many; i++) {
711 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
712 if (ch > maxchar) {
713 maxchar = ch;
714 if (maxchar > to_maxchar) {
715 invalid_kinds = 1;
716 break;
717 }
718 }
719 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
720 }
721 }
722 else
723 invalid_kinds = 1;
724 if (invalid_kinds) {
725 PyErr_Format(PyExc_ValueError,
726 "Cannot copy UCS%u characters "
727 "into a string of UCS%u characters",
728 1 << (from_kind - 1),
729 1 << (to_kind -1));
730 return -1;
731 }
732 }
733 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734}
735
Victor Stinner17222162011-09-28 22:15:37 +0200736/* Find the maximum code point and count the number of surrogate pairs so a
737 correct string length can be computed before converting a string to UCS4.
738 This function counts single surrogates as a character and not as a pair.
739
740 Return 0 on success, or -1 on error. */
741static int
742find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
743 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744{
745 const wchar_t *iter;
746
747 if (num_surrogates == NULL || maxchar == NULL) {
748 PyErr_SetString(PyExc_SystemError,
749 "unexpected NULL arguments to "
750 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
751 return -1;
752 }
753
754 *num_surrogates = 0;
755 *maxchar = 0;
756
757 for (iter = begin; iter < end; ) {
758 if (*iter > *maxchar)
759 *maxchar = *iter;
760#if SIZEOF_WCHAR_T == 2
761 if (*iter >= 0xD800 && *iter <= 0xDBFF
762 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
763 {
764 Py_UCS4 surrogate_val;
765 surrogate_val = (((iter[0] & 0x3FF)<<10)
766 | (iter[1] & 0x3FF)) + 0x10000;
767 ++(*num_surrogates);
768 if (surrogate_val > *maxchar)
769 *maxchar = surrogate_val;
770 iter += 2;
771 }
772 else
773 iter++;
774#else
775 iter++;
776#endif
777 }
778 return 0;
779}
780
781#ifdef Py_DEBUG
782int unicode_ready_calls = 0;
783#endif
784
785int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200786_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200788 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789 wchar_t *end;
790 Py_UCS4 maxchar = 0;
791 Py_ssize_t num_surrogates;
792#if SIZEOF_WCHAR_T == 2
793 Py_ssize_t length_wo_surrogates;
794#endif
795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200797 strings were created using _PyObject_New() and where no canonical
798 representation (the str field) has been set yet aka strings
799 which are not yet ready. */
800 assert(PyUnicode_Check(obj));
801 assert(!PyUnicode_IS_READY(obj));
802 assert(!PyUnicode_IS_COMPACT(obj));
803 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200804 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200805 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200807 /* Actually, it should neither be interned nor be anything else: */
808 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
810#ifdef Py_DEBUG
811 ++unicode_ready_calls;
812#endif
813
814 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200815 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200816 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818
819 if (maxchar < 256) {
820 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
821 if (!unicode->data.any) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200825 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826 _PyUnicode_WSTR(unicode), end,
827 PyUnicode_1BYTE_DATA(unicode));
828 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
829 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
830 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
831 if (maxchar < 128) {
832 unicode->_base.utf8 = unicode->data.any;
833 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
834 }
835 else {
836 unicode->_base.utf8 = NULL;
837 unicode->_base.utf8_length = 0;
838 }
839 PyObject_FREE(_PyUnicode_WSTR(unicode));
840 _PyUnicode_WSTR(unicode) = NULL;
841 _PyUnicode_WSTR_LENGTH(unicode) = 0;
842 }
843 /* In this case we might have to convert down from 4-byte native
844 wchar_t to 2-byte unicode. */
845 else if (maxchar < 65536) {
846 assert(num_surrogates == 0 &&
847 "FindMaxCharAndNumSurrogatePairs() messed up");
848
Victor Stinner506f5922011-09-28 22:34:18 +0200849#if SIZEOF_WCHAR_T == 2
850 /* We can share representations and are done. */
851 unicode->data.any = _PyUnicode_WSTR(unicode);
852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
855 unicode->_base.utf8 = NULL;
856 unicode->_base.utf8_length = 0;
857#else
858 /* sizeof(wchar_t) == 4 */
859 unicode->data.any = PyObject_MALLOC(
860 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
861 if (!unicode->data.any) {
862 PyErr_NoMemory();
863 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 }
Victor Stinner506f5922011-09-28 22:34:18 +0200865 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
866 _PyUnicode_WSTR(unicode), end,
867 PyUnicode_2BYTE_DATA(unicode));
868 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
869 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
870 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
871 unicode->_base.utf8 = NULL;
872 unicode->_base.utf8_length = 0;
873 PyObject_FREE(_PyUnicode_WSTR(unicode));
874 _PyUnicode_WSTR(unicode) = NULL;
875 _PyUnicode_WSTR_LENGTH(unicode) = 0;
876#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
878 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
879 else {
880#if SIZEOF_WCHAR_T == 2
881 /* in case the native representation is 2-bytes, we need to allocate a
882 new normalized 4-byte version. */
883 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
884 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
885 if (!unicode->data.any) {
886 PyErr_NoMemory();
887 return -1;
888 }
889 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891 unicode->_base.utf8 = NULL;
892 unicode->_base.utf8_length = 0;
893 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
894 unicode) < 0) {
895 assert(0 && "ConvertWideCharToUCS4 failed");
896 return -1;
897 }
898 PyObject_FREE(_PyUnicode_WSTR(unicode));
899 _PyUnicode_WSTR(unicode) = NULL;
900 _PyUnicode_WSTR_LENGTH(unicode) = 0;
901#else
902 assert(num_surrogates == 0);
903
904 unicode->data.any = _PyUnicode_WSTR(unicode);
905 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
906 unicode->_base.utf8 = NULL;
907 unicode->_base.utf8_length = 0;
908 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
909#endif
910 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
911 }
912 _PyUnicode_STATE(unicode).ready = 1;
913 return 0;
914}
915
Alexander Belopolsky40018472011-02-26 01:02:56 +0000916static void
917unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000918{
Walter Dörwald16807132007-05-25 13:52:07 +0000919 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_NOT_INTERNED:
921 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000922
Benjamin Peterson29060642009-01-31 22:14:21 +0000923 case SSTATE_INTERNED_MORTAL:
924 /* revive dead object temporarily for DelItem */
925 Py_REFCNT(unicode) = 3;
926 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
927 Py_FatalError(
928 "deletion of interned string failed");
929 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 case SSTATE_INTERNED_IMMORTAL:
932 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933
Benjamin Peterson29060642009-01-31 22:14:21 +0000934 default:
935 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000936 }
937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 if (_PyUnicode_WSTR(unicode) &&
939 (!PyUnicode_IS_READY(unicode) ||
940 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
941 PyObject_DEL(_PyUnicode_WSTR(unicode));
942 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
943 PyObject_DEL(unicode->_base.utf8);
944
945 if (PyUnicode_IS_COMPACT(unicode)) {
946 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 }
948 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 if (unicode->data.any)
950 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000951 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 }
953}
954
Alexander Belopolsky40018472011-02-26 01:02:56 +0000955static int
956_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957{
958 register PyUnicodeObject *v;
959
960 /* Argument checks */
961 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000962 PyErr_BadInternalCall();
963 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000964 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000965 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
967 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
971
972 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 possible since these are being shared.
974 The same goes for new-representation unicode objects or objects which
975 have already been readied.
976 For these, we simply return a fresh copy with the same Unicode content.
977 */
978 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
979 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
980 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000981 PyUnicodeObject *w = _PyUnicode_New(length);
982 if (w == NULL)
983 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
985 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 Py_DECREF(*unicode);
987 *unicode = w;
988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000989 }
990
991 /* Note that we don't have to modify *unicode for unshared Unicode
992 objects, since we can modify them in-place. */
993 return unicode_resize(v, length);
994}
995
Alexander Belopolsky40018472011-02-26 01:02:56 +0000996int
997PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000998{
999 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1000}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002static PyObject*
1003get_latin1_char(unsigned char ch)
1004{
1005 PyUnicodeObject *unicode = unicode_latin1[ch];
1006 if (!unicode) {
1007 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1008 if (!unicode)
1009 return NULL;
1010 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1011 unicode_latin1[ch] = unicode;
1012 }
1013 Py_INCREF(unicode);
1014 return (PyObject *)unicode;
1015}
1016
Alexander Belopolsky40018472011-02-26 01:02:56 +00001017PyObject *
1018PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019{
1020 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 Py_UCS4 maxchar = 0;
1022 Py_ssize_t num_surrogates;
1023
1024 if (u == NULL)
1025 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027 /* If the Unicode data is known at construction time, we can apply
1028 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 /* Optimization for empty strings */
1031 if (size == 0 && unicode_empty != NULL) {
1032 Py_INCREF(unicode_empty);
1033 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001034 }
Tim Petersced69f82003-09-16 20:30:58 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Single character Unicode objects in the Latin-1 range are
1037 shared when using this constructor */
1038 if (size == 1 && *u < 256)
1039 return get_latin1_char((unsigned char)*u);
1040
1041 /* If not empty and not single character, copy the Unicode data
1042 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001043 if (find_maxchar_surrogates(u, u + size,
1044 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 return NULL;
1046
1047 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1048 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 if (!unicode)
1050 return NULL;
1051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 switch (PyUnicode_KIND(unicode)) {
1053 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1056 break;
1057 case PyUnicode_2BYTE_KIND:
1058#if Py_UNICODE_SIZE == 2
1059 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1060#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001061 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1063#endif
1064 break;
1065 case PyUnicode_4BYTE_KIND:
1066#if SIZEOF_WCHAR_T == 2
1067 /* This is the only case which has to process surrogates, thus
1068 a simple copy loop is not enough and we need a function. */
1069 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1070 Py_DECREF(unicode);
1071 return NULL;
1072 }
1073#else
1074 assert(num_surrogates == 0);
1075 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1076#endif
1077 break;
1078 default:
1079 assert(0 && "Impossible state");
1080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081
1082 return (PyObject *)unicode;
1083}
1084
Alexander Belopolsky40018472011-02-26 01:02:56 +00001085PyObject *
1086PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001087{
1088 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001089
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 if (size < 0) {
1091 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001093 return NULL;
1094 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001097 some optimizations which share commonly used objects.
1098 Also, this means the input must be UTF-8, so fall back to the
1099 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001100 if (u != NULL) {
1101
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 /* Optimization for empty strings */
1103 if (size == 0 && unicode_empty != NULL) {
1104 Py_INCREF(unicode_empty);
1105 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001107
1108 /* Single characters are shared when using this constructor.
1109 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 if (size == 1 && Py_CHARMASK(*u) < 128)
1111 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112
1113 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 }
1115
Walter Dörwald55507312007-05-18 13:12:10 +00001116 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 if (!unicode)
1118 return NULL;
1119
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 return (PyObject *)unicode;
1121}
1122
Alexander Belopolsky40018472011-02-26 01:02:56 +00001123PyObject *
1124PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001125{
1126 size_t size = strlen(u);
1127 if (size > PY_SSIZE_T_MAX) {
1128 PyErr_SetString(PyExc_OverflowError, "input too long");
1129 return NULL;
1130 }
1131
1132 return PyUnicode_FromStringAndSize(u, size);
1133}
1134
Victor Stinnere57b1c02011-09-28 22:20:48 +02001135static PyObject*
1136_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 PyObject *res;
1139 unsigned char max = 127;
1140 Py_ssize_t i;
1141 for (i = 0; i < size; i++) {
1142 if (u[i] & 0x80) {
1143 max = 255;
1144 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001145 }
1146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 res = PyUnicode_New(size, max);
1148 if (!res)
1149 return NULL;
1150 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1151 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152}
1153
Victor Stinnere57b1c02011-09-28 22:20:48 +02001154static PyObject*
1155_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
1157 PyObject *res;
1158 Py_UCS2 max = 0;
1159 Py_ssize_t i;
1160 for (i = 0; i < size; i++)
1161 if (u[i] > max)
1162 max = u[i];
1163 res = PyUnicode_New(size, max);
1164 if (!res)
1165 return NULL;
1166 if (max >= 256)
1167 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1168 else
1169 for (i = 0; i < size; i++)
1170 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1171 return res;
1172}
1173
Victor Stinnere57b1c02011-09-28 22:20:48 +02001174static PyObject*
1175_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176{
1177 PyObject *res;
1178 Py_UCS4 max = 0;
1179 Py_ssize_t i;
1180 for (i = 0; i < size; i++)
1181 if (u[i] > max)
1182 max = u[i];
1183 res = PyUnicode_New(size, max);
1184 if (!res)
1185 return NULL;
1186 if (max >= 0x10000)
1187 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1188 else {
1189 int kind = PyUnicode_KIND(res);
1190 void *data = PyUnicode_DATA(res);
1191 for (i = 0; i < size; i++)
1192 PyUnicode_WRITE(kind, data, i, u[i]);
1193 }
1194 return res;
1195}
1196
1197PyObject*
1198PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1199{
1200 switch(kind) {
1201 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001204 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001206 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
1208 assert(0);
1209 return NULL;
1210}
1211
1212
1213/* Widen Unicode objects to larger buffers.
1214 Return NULL if the string is too wide already. */
1215
1216void*
1217_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1218{
1219 Py_ssize_t i;
1220 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1221 void *d = PyUnicode_DATA(s);
1222 unsigned int skind = PyUnicode_KIND(s);
1223 if (PyUnicode_KIND(s) >= kind) {
1224 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1225 return NULL;
1226 }
1227 switch(kind) {
1228 case PyUnicode_2BYTE_KIND: {
1229 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1230 if (!result) {
1231 PyErr_NoMemory();
1232 return 0;
1233 }
1234 for (i = 0; i < len; i++)
1235 result[i] = ((Py_UCS1*)d)[i];
1236 return result;
1237 }
1238 case PyUnicode_4BYTE_KIND: {
1239 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1240 if (!result) {
1241 PyErr_NoMemory();
1242 return 0;
1243 }
1244 for (i = 0; i < len; i++)
1245 result[i] = PyUnicode_READ(skind, d, i);
1246 return result;
1247 }
1248 }
1249 Py_FatalError("invalid kind");
1250 return NULL;
1251}
1252
1253static Py_UCS4*
1254as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1255 int copy_null)
1256{
1257 int kind;
1258 void *data;
1259 Py_ssize_t len, targetlen;
1260 if (PyUnicode_READY(string) == -1)
1261 return NULL;
1262 kind = PyUnicode_KIND(string);
1263 data = PyUnicode_DATA(string);
1264 len = PyUnicode_GET_LENGTH(string);
1265 targetlen = len;
1266 if (copy_null)
1267 targetlen++;
1268 if (!target) {
1269 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1270 PyErr_NoMemory();
1271 return NULL;
1272 }
1273 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1274 if (!target) {
1275 PyErr_NoMemory();
1276 return NULL;
1277 }
1278 }
1279 else {
1280 if (targetsize < targetlen) {
1281 PyErr_Format(PyExc_SystemError,
1282 "string is longer than the buffer");
1283 if (copy_null && 0 < targetsize)
1284 target[0] = 0;
1285 return NULL;
1286 }
1287 }
1288 if (kind != PyUnicode_4BYTE_KIND) {
1289 Py_ssize_t i;
1290 for (i = 0; i < len; i++)
1291 target[i] = PyUnicode_READ(kind, data, i);
1292 }
1293 else
1294 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1295 if (copy_null)
1296 target[len] = 0;
1297 return target;
1298}
1299
1300Py_UCS4*
1301PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1302 int copy_null)
1303{
1304 if (target == NULL || targetsize < 1) {
1305 PyErr_BadInternalCall();
1306 return NULL;
1307 }
1308 return as_ucs4(string, target, targetsize, copy_null);
1309}
1310
1311Py_UCS4*
1312PyUnicode_AsUCS4Copy(PyObject *string)
1313{
1314 return as_ucs4(string, NULL, 0, 1);
1315}
1316
#ifdef HAVE_WCHAR_H

/* Create a str object from the wide character buffer `w`.

   A size of -1 means "measure with wcslen()".  A NULL buffer is accepted
   only together with size 0, in which case PyUnicode_New(0, 0) is returned;
   any other size with a NULL buffer is a bad internal call.
   Returns a new reference, or NULL on failure. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001337
Walter Dörwald346737f2007-05-31 10:44:43 +00001338static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001339makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1340 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001341{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001342 *fmt++ = '%';
1343 if (width) {
1344 if (zeropad)
1345 *fmt++ = '0';
1346 fmt += sprintf(fmt, "%d", width);
1347 }
1348 if (precision)
1349 fmt += sprintf(fmt, ".%d", precision);
1350 if (longflag)
1351 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001352 else if (longlongflag) {
1353 /* longlongflag should only ever be nonzero on machines with
1354 HAVE_LONG_LONG defined */
1355#ifdef HAVE_LONG_LONG
1356 char *f = PY_FORMAT_LONG_LONG;
1357 while (*f)
1358 *fmt++ = *f++;
1359#else
1360 /* we shouldn't ever get here */
1361 assert(0);
1362 *fmt++ = 'l';
1363#endif
1364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001365 else if (size_tflag) {
1366 char *f = PY_FORMAT_SIZE_T;
1367 while (*f)
1368 *fmt++ = *f++;
1369 }
1370 *fmt++ = c;
1371 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001372}
1373
/* helper for PyUnicode_FromFormatV() */

/* Parse the part of a format specification that follows '%'.

   `f` points at the '%' character.  The optional width, the optional
   ".precision" and an optional length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z" -- each only when followed by 'd', 'u'
   or 'i') are consumed.  Every out parameter may be NULL, in which case the
   corresponding value is not reported.

   Returns a pointer to the character where parsing stopped, normally the
   conversion character.  For a format that ends right after the digits the
   pointer is moved back by one so that it never rests on the terminating
   null character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1438
/* Upper bound on the characters produced by %ld: 21 is enough for a 64-bit
   integer written in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1446
Walter Dörwaldd2034312007-05-18 16:29:38 +00001447PyObject *
1448PyUnicode_FromFormatV(const char *format, va_list vargs)
1449{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001450 va_list count;
1451 Py_ssize_t callcount = 0;
1452 PyObject **callresults = NULL;
1453 PyObject **callresult = NULL;
1454 Py_ssize_t n = 0;
1455 int width = 0;
1456 int precision = 0;
1457 int zeropad;
1458 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001460 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001461 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1463 Py_UCS4 argmaxchar;
1464 Py_ssize_t numbersize = 0;
1465 char *numberresults = NULL;
1466 char *numberresult = NULL;
1467 Py_ssize_t i;
1468 int kind;
1469 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001470
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001471 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001472 /* step 1: count the number of %S/%R/%A/%s format specifications
1473 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1474 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 * result in an array)
1476 * also esimate a upper bound for all the number formats in the string,
1477 * numbers will be formated in step 3 and be keept in a '\0'-separated
1478 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001479 for (f = format; *f; f++) {
1480 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001481 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1483 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1484 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1485 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001488#ifdef HAVE_LONG_LONG
1489 if (longlongflag) {
1490 if (width < MAX_LONG_LONG_CHARS)
1491 width = MAX_LONG_LONG_CHARS;
1492 }
1493 else
1494#endif
1495 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1496 including sign. Decimal takes the most space. This
1497 isn't enough for octal. If a width is specified we
1498 need more (which we allocate later). */
1499 if (width < MAX_LONG_CHARS)
1500 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501
1502 /* account for the size + '\0' to separate numbers
1503 inside of the numberresults buffer */
1504 numbersize += (width + 1);
1505 }
1506 }
1507 else if ((unsigned char)*f > 127) {
1508 PyErr_Format(PyExc_ValueError,
1509 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1510 "string, got a non-ASCII byte: 0x%02x",
1511 (unsigned char)*f);
1512 return NULL;
1513 }
1514 }
1515 /* step 2: allocate memory for the results of
1516 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1517 if (callcount) {
1518 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1519 if (!callresults) {
1520 PyErr_NoMemory();
1521 return NULL;
1522 }
1523 callresult = callresults;
1524 }
1525 /* step 2.5: allocate memory for the results of formating numbers */
1526 if (numbersize) {
1527 numberresults = PyObject_Malloc(numbersize);
1528 if (!numberresults) {
1529 PyErr_NoMemory();
1530 goto fail;
1531 }
1532 numberresult = numberresults;
1533 }
1534
1535 /* step 3: format numbers and figure out how large a buffer we need */
1536 for (f = format; *f; f++) {
1537 if (*f == '%') {
1538 const char* p;
1539 int longflag;
1540 int longlongflag;
1541 int size_tflag;
1542 int numprinted;
1543
1544 p = f;
1545 zeropad = (f[1] == '0');
1546 f = parse_format_flags(f, &width, &precision,
1547 &longflag, &longlongflag, &size_tflag);
1548 switch (*f) {
1549 case 'c':
1550 {
1551 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001552 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 n++;
1554 break;
1555 }
1556 case '%':
1557 n++;
1558 break;
1559 case 'i':
1560 case 'd':
1561 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1562 width, precision, *f);
1563 if (longflag)
1564 numprinted = sprintf(numberresult, fmt,
1565 va_arg(count, long));
1566#ifdef HAVE_LONG_LONG
1567 else if (longlongflag)
1568 numprinted = sprintf(numberresult, fmt,
1569 va_arg(count, PY_LONG_LONG));
1570#endif
1571 else if (size_tflag)
1572 numprinted = sprintf(numberresult, fmt,
1573 va_arg(count, Py_ssize_t));
1574 else
1575 numprinted = sprintf(numberresult, fmt,
1576 va_arg(count, int));
1577 n += numprinted;
1578 /* advance by +1 to skip over the '\0' */
1579 numberresult += (numprinted + 1);
1580 assert(*(numberresult - 1) == '\0');
1581 assert(*(numberresult - 2) != '\0');
1582 assert(numprinted >= 0);
1583 assert(numberresult <= numberresults + numbersize);
1584 break;
1585 case 'u':
1586 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1587 width, precision, 'u');
1588 if (longflag)
1589 numprinted = sprintf(numberresult, fmt,
1590 va_arg(count, unsigned long));
1591#ifdef HAVE_LONG_LONG
1592 else if (longlongflag)
1593 numprinted = sprintf(numberresult, fmt,
1594 va_arg(count, unsigned PY_LONG_LONG));
1595#endif
1596 else if (size_tflag)
1597 numprinted = sprintf(numberresult, fmt,
1598 va_arg(count, size_t));
1599 else
1600 numprinted = sprintf(numberresult, fmt,
1601 va_arg(count, unsigned int));
1602 n += numprinted;
1603 numberresult += (numprinted + 1);
1604 assert(*(numberresult - 1) == '\0');
1605 assert(*(numberresult - 2) != '\0');
1606 assert(numprinted >= 0);
1607 assert(numberresult <= numberresults + numbersize);
1608 break;
1609 case 'x':
1610 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1611 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1612 n += numprinted;
1613 numberresult += (numprinted + 1);
1614 assert(*(numberresult - 1) == '\0');
1615 assert(*(numberresult - 2) != '\0');
1616 assert(numprinted >= 0);
1617 assert(numberresult <= numberresults + numbersize);
1618 break;
1619 case 'p':
1620 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1621 /* %p is ill-defined: ensure leading 0x. */
1622 if (numberresult[1] == 'X')
1623 numberresult[1] = 'x';
1624 else if (numberresult[1] != 'x') {
1625 memmove(numberresult + 2, numberresult,
1626 strlen(numberresult) + 1);
1627 numberresult[0] = '0';
1628 numberresult[1] = 'x';
1629 numprinted += 2;
1630 }
1631 n += numprinted;
1632 numberresult += (numprinted + 1);
1633 assert(*(numberresult - 1) == '\0');
1634 assert(*(numberresult - 2) != '\0');
1635 assert(numprinted >= 0);
1636 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001637 break;
1638 case 's':
1639 {
1640 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001641 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001642 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1643 if (!str)
1644 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 /* since PyUnicode_DecodeUTF8 returns already flexible
1646 unicode objects, there is no need to call ready on them */
1647 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001648 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001650 /* Remember the str and switch to the next slot */
1651 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 break;
1653 }
1654 case 'U':
1655 {
1656 PyObject *obj = va_arg(count, PyObject *);
1657 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 if (PyUnicode_READY(obj) == -1)
1659 goto fail;
1660 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001661 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001663 break;
1664 }
1665 case 'V':
1666 {
1667 PyObject *obj = va_arg(count, PyObject *);
1668 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001669 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001670 assert(obj || str);
1671 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001672 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 if (PyUnicode_READY(obj) == -1)
1674 goto fail;
1675 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001676 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001677 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001678 *callresult++ = NULL;
1679 }
1680 else {
1681 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1682 if (!str_obj)
1683 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001685 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001687 *callresult++ = str_obj;
1688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001689 break;
1690 }
1691 case 'S':
1692 {
1693 PyObject *obj = va_arg(count, PyObject *);
1694 PyObject *str;
1695 assert(obj);
1696 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001698 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001700 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001702 /* Remember the str and switch to the next slot */
1703 *callresult++ = str;
1704 break;
1705 }
1706 case 'R':
1707 {
1708 PyObject *obj = va_arg(count, PyObject *);
1709 PyObject *repr;
1710 assert(obj);
1711 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001713 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001715 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001717 /* Remember the repr and switch to the next slot */
1718 *callresult++ = repr;
1719 break;
1720 }
1721 case 'A':
1722 {
1723 PyObject *obj = va_arg(count, PyObject *);
1724 PyObject *ascii;
1725 assert(obj);
1726 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001728 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001730 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001732 /* Remember the repr and switch to the next slot */
1733 *callresult++ = ascii;
1734 break;
1735 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 default:
1737 /* if we stumble upon an unknown
1738 formatting code, copy the rest of
1739 the format string to the output
1740 string. (we cannot just skip the
1741 code, since there's no way to know
1742 what's in the argument list) */
1743 n += strlen(p);
1744 goto expand;
1745 }
1746 } else
1747 n++;
1748 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001749 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 we don't have to resize the string.
1753 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001755 if (!string)
1756 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 kind = PyUnicode_KIND(string);
1758 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001759 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001763 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001764 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001765
1766 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1768 /* checking for == because the last argument could be a empty
1769 string, which causes i to point to end, the assert at the end of
1770 the loop */
1771 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001772
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 switch (*f) {
1774 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001775 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 const int ordinal = va_arg(vargs, int);
1777 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001778 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001779 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001780 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001781 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001783 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 case 'p':
1785 /* unused, since we already have the result */
1786 if (*f == 'p')
1787 (void) va_arg(vargs, void *);
1788 else
1789 (void) va_arg(vargs, int);
1790 /* extract the result from numberresults and append. */
1791 for (; *numberresult; ++i, ++numberresult)
1792 PyUnicode_WRITE(kind, data, i, *numberresult);
1793 /* skip over the separating '\0' */
1794 assert(*numberresult == '\0');
1795 numberresult++;
1796 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 break;
1798 case 's':
1799 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001800 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001802 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 size = PyUnicode_GET_LENGTH(*callresult);
1804 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001805 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1806 *callresult, 0,
1807 size) < 0)
1808 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001810 /* We're done with the unicode()/repr() => forget it */
1811 Py_DECREF(*callresult);
1812 /* switch to next unicode()/repr() result */
1813 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 break;
1815 }
1816 case 'U':
1817 {
1818 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 Py_ssize_t size;
1820 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1821 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001822 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1823 obj, 0,
1824 size) < 0)
1825 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001827 break;
1828 }
1829 case 'V':
1830 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001832 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001833 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001834 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 size = PyUnicode_GET_LENGTH(obj);
1836 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001837 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1838 obj, 0,
1839 size) < 0)
1840 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 size = PyUnicode_GET_LENGTH(*callresult);
1844 assert(PyUnicode_KIND(*callresult) <=
1845 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001846 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1847 *callresult,
1848 0, size) < 0)
1849 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001851 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001852 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001853 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001854 break;
1855 }
1856 case 'S':
1857 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001858 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001859 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001860 /* unused, since we already have the result */
1861 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001863 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1864 *callresult, 0,
1865 PyUnicode_GET_LENGTH(*callresult)) < 0)
1866 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 /* We're done with the unicode()/repr() => forget it */
1869 Py_DECREF(*callresult);
1870 /* switch to next unicode()/repr() result */
1871 ++callresult;
1872 break;
1873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001876 break;
1877 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 for (; *p; ++p, ++i)
1879 PyUnicode_WRITE(kind, data, i, *p);
1880 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001881 goto end;
1882 }
Victor Stinner1205f272010-09-11 00:54:47 +00001883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 else {
1885 assert(i < PyUnicode_GET_LENGTH(string));
1886 PyUnicode_WRITE(kind, data, i++, *f);
1887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001890
Benjamin Peterson29060642009-01-31 22:14:21 +00001891 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001892 if (callresults)
1893 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 if (numberresults)
1895 PyObject_Free(numberresults);
1896 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001898 if (callresults) {
1899 PyObject **callresult2 = callresults;
1900 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001901 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 ++callresult2;
1903 }
1904 PyObject_Free(callresults);
1905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 if (numberresults)
1907 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001908 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001909}
1910
Walter Dörwaldd2034312007-05-18 16:29:38 +00001911PyObject *
1912PyUnicode_FromFormat(const char *format, ...)
1913{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001914 PyObject* ret;
1915 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001916
1917#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001918 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001919#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001921#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001922 ret = PyUnicode_FromFormatV(format, vargs);
1923 va_end(vargs);
1924 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925}
1926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927#ifdef HAVE_WCHAR_H
1928
Victor Stinner5593d8a2010-10-02 11:11:27 +00001929/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1930 convert a Unicode object to a wide character string.
1931
Victor Stinnerd88d9832011-09-06 02:00:05 +02001932 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001933 character) required to convert the unicode object. Ignore size argument.
1934
Victor Stinnerd88d9832011-09-06 02:00:05 +02001935 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001936 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001937 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001938static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001939unicode_aswidechar(PyUnicodeObject *unicode,
1940 wchar_t *w,
1941 Py_ssize_t size)
1942{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001943 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 const wchar_t *wstr;
1945
1946 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1947 if (wstr == NULL)
1948 return -1;
1949
Victor Stinner5593d8a2010-10-02 11:11:27 +00001950 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001951 if (size > res)
1952 size = res + 1;
1953 else
1954 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001956 return res;
1957 }
1958 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001960}
1961
1962Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001963PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001964 wchar_t *w,
1965 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966{
1967 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001968 PyErr_BadInternalCall();
1969 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001971 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972}
1973
Victor Stinner137c34c2010-09-29 10:25:54 +00001974wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001975PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001976 Py_ssize_t *size)
1977{
1978 wchar_t* buffer;
1979 Py_ssize_t buflen;
1980
1981 if (unicode == NULL) {
1982 PyErr_BadInternalCall();
1983 return NULL;
1984 }
1985
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001986 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 if (buflen == -1)
1988 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001989 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001990 PyErr_NoMemory();
1991 return NULL;
1992 }
1993
Victor Stinner137c34c2010-09-29 10:25:54 +00001994 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1995 if (buffer == NULL) {
1996 PyErr_NoMemory();
1997 return NULL;
1998 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001999 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (buflen == -1)
2001 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002002 if (size != NULL)
2003 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002004 return buffer;
2005}
2006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008
Alexander Belopolsky40018472011-02-26 01:02:56 +00002009PyObject *
2010PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002013 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002014 PyErr_SetString(PyExc_ValueError,
2015 "chr() arg not in range(0x110000)");
2016 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002017 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 if (ordinal < 256)
2020 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 v = PyUnicode_New(1, ordinal);
2023 if (v == NULL)
2024 return NULL;
2025 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2026 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002027}
2028
Alexander Belopolsky40018472011-02-26 01:02:56 +00002029PyObject *
2030PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002032 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002034 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 Py_INCREF(obj);
2036 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002037 }
2038 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 /* For a Unicode subtype that's not a Unicode object,
2040 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 if (PyUnicode_READY(obj) == -1)
2042 return NULL;
2043 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002044 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002045 PyErr_Format(PyExc_TypeError,
2046 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002047 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002048 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002049}
2050
Alexander Belopolsky40018472011-02-26 01:02:56 +00002051PyObject *
2052PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002053 const char *encoding,
2054 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002055{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002056 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002057 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002058
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002060 PyErr_BadInternalCall();
2061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002063
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002064 /* Decoding bytes objects is the most common case and should be fast */
2065 if (PyBytes_Check(obj)) {
2066 if (PyBytes_GET_SIZE(obj) == 0) {
2067 Py_INCREF(unicode_empty);
2068 v = (PyObject *) unicode_empty;
2069 }
2070 else {
2071 v = PyUnicode_Decode(
2072 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2073 encoding, errors);
2074 }
2075 return v;
2076 }
2077
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002078 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002079 PyErr_SetString(PyExc_TypeError,
2080 "decoding str is not supported");
2081 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002082 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002083
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002084 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2085 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2086 PyErr_Format(PyExc_TypeError,
2087 "coercing to str: need bytes, bytearray "
2088 "or buffer-like object, %.80s found",
2089 Py_TYPE(obj)->tp_name);
2090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002091 }
Tim Petersced69f82003-09-16 20:30:58 +00002092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002093 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002095 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 }
Tim Petersced69f82003-09-16 20:30:58 +00002097 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002099
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002100 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002101 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102}
2103
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.
   `lower` has room for `lower_len` bytes including the terminating null.

   Return 1 on success, 0 if `encoding` is longer than lower_len-1 characters
   (in which case `lower` is left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating null. */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;
        if (*src == '_')
            *dst++ = '-';
        else if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
2136
Alexander Belopolsky40018472011-02-26 01:02:56 +00002137PyObject *
2138PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002139 Py_ssize_t size,
2140 const char *encoding,
2141 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002142{
2143 PyObject *buffer = NULL, *unicode;
2144 Py_buffer info;
2145 char lower[11]; /* Enough for any encoding shortcut */
2146
2147 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002148 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002149
2150 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002151 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002152 if ((strcmp(lower, "utf-8") == 0) ||
2153 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002154 return PyUnicode_DecodeUTF8(s, size, errors);
2155 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002156 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002157 (strcmp(lower, "iso-8859-1") == 0))
2158 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002159#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002160 else if (strcmp(lower, "mbcs") == 0)
2161 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002162#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002163 else if (strcmp(lower, "ascii") == 0)
2164 return PyUnicode_DecodeASCII(s, size, errors);
2165 else if (strcmp(lower, "utf-16") == 0)
2166 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2167 else if (strcmp(lower, "utf-32") == 0)
2168 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170
2171 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002172 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002173 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002174 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002175 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176 if (buffer == NULL)
2177 goto onError;
2178 unicode = PyCodec_Decode(buffer, encoding, errors);
2179 if (unicode == NULL)
2180 goto onError;
2181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002183 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002184 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 Py_DECREF(unicode);
2186 goto onError;
2187 }
2188 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (PyUnicode_READY(unicode)) {
2190 Py_DECREF(unicode);
2191 return NULL;
2192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002194
Benjamin Peterson29060642009-01-31 22:14:21 +00002195 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 Py_XDECREF(buffer);
2197 return NULL;
2198}
2199
Alexander Belopolsky40018472011-02-26 01:02:56 +00002200PyObject *
2201PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002202 const char *encoding,
2203 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002204{
2205 PyObject *v;
2206
2207 if (!PyUnicode_Check(unicode)) {
2208 PyErr_BadArgument();
2209 goto onError;
2210 }
2211
2212 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002214
2215 /* Decode via the codec registry */
2216 v = PyCodec_Decode(unicode, encoding, errors);
2217 if (v == NULL)
2218 goto onError;
2219 return v;
2220
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002222 return NULL;
2223}
2224
Alexander Belopolsky40018472011-02-26 01:02:56 +00002225PyObject *
2226PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002227 const char *encoding,
2228 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002229{
2230 PyObject *v;
2231
2232 if (!PyUnicode_Check(unicode)) {
2233 PyErr_BadArgument();
2234 goto onError;
2235 }
2236
2237 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002238 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002239
2240 /* Decode via the codec registry */
2241 v = PyCodec_Decode(unicode, encoding, errors);
2242 if (v == NULL)
2243 goto onError;
2244 if (!PyUnicode_Check(v)) {
2245 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002246 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002247 Py_TYPE(v)->tp_name);
2248 Py_DECREF(v);
2249 goto onError;
2250 }
2251 return v;
2252
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002254 return NULL;
2255}
2256
Alexander Belopolsky40018472011-02-26 01:02:56 +00002257PyObject *
2258PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002259 Py_ssize_t size,
2260 const char *encoding,
2261 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262{
2263 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002264
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 unicode = PyUnicode_FromUnicode(s, size);
2266 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2269 Py_DECREF(unicode);
2270 return v;
2271}
2272
Alexander Belopolsky40018472011-02-26 01:02:56 +00002273PyObject *
2274PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002275 const char *encoding,
2276 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002277{
2278 PyObject *v;
2279
2280 if (!PyUnicode_Check(unicode)) {
2281 PyErr_BadArgument();
2282 goto onError;
2283 }
2284
2285 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002286 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002287
2288 /* Encode via the codec registry */
2289 v = PyCodec_Encode(unicode, encoding, errors);
2290 if (v == NULL)
2291 goto onError;
2292 return v;
2293
Benjamin Peterson29060642009-01-31 22:14:21 +00002294 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002295 return NULL;
2296}
2297
Victor Stinnerad158722010-10-27 00:25:46 +00002298PyObject *
2299PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002300{
Victor Stinner99b95382011-07-04 14:23:54 +02002301#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002302 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2303 PyUnicode_GET_SIZE(unicode),
2304 NULL);
2305#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002307#else
Victor Stinner793b5312011-04-27 00:24:21 +02002308 PyInterpreterState *interp = PyThreadState_GET()->interp;
2309 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2310 cannot use it to encode and decode filenames before it is loaded. Load
2311 the Python codec requires to encode at least its own filename. Use the C
2312 version of the locale codec until the codec registry is initialized and
2313 the Python codec is loaded.
2314
2315 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2316 cannot only rely on it: check also interp->fscodec_initialized for
2317 subinterpreters. */
2318 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002319 return PyUnicode_AsEncodedString(unicode,
2320 Py_FileSystemDefaultEncoding,
2321 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002322 }
2323 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002324 /* locale encoding with surrogateescape */
2325 wchar_t *wchar;
2326 char *bytes;
2327 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002328 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002329
2330 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2331 if (wchar == NULL)
2332 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002333 bytes = _Py_wchar2char(wchar, &error_pos);
2334 if (bytes == NULL) {
2335 if (error_pos != (size_t)-1) {
2336 char *errmsg = strerror(errno);
2337 PyObject *exc = NULL;
2338 if (errmsg == NULL)
2339 errmsg = "Py_wchar2char() failed";
2340 raise_encode_exception(&exc,
2341 "filesystemencoding",
2342 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2343 error_pos, error_pos+1,
2344 errmsg);
2345 Py_XDECREF(exc);
2346 }
2347 else
2348 PyErr_NoMemory();
2349 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002350 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002351 }
2352 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002353
2354 bytes_obj = PyBytes_FromString(bytes);
2355 PyMem_Free(bytes);
2356 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002357 }
Victor Stinnerad158722010-10-27 00:25:46 +00002358#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002359}
2360
Alexander Belopolsky40018472011-02-26 01:02:56 +00002361PyObject *
2362PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002363 const char *encoding,
2364 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365{
2366 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002367 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002368
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369 if (!PyUnicode_Check(unicode)) {
2370 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 }
Fred Drakee4315f52000-05-09 19:53:39 +00002373
Victor Stinner2f283c22011-03-02 01:21:46 +00002374 if (encoding == NULL) {
2375 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002377 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002379 }
Fred Drakee4315f52000-05-09 19:53:39 +00002380
2381 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002382 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002383 if ((strcmp(lower, "utf-8") == 0) ||
2384 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002385 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002386 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002388 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002390 }
Victor Stinner37296e82010-06-10 13:36:23 +00002391 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002392 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002393 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002395#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002396 else if (strcmp(lower, "mbcs") == 0)
2397 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2398 PyUnicode_GET_SIZE(unicode),
2399 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002400#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002401 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404
2405 /* Encode via the codec registry */
2406 v = PyCodec_Encode(unicode, encoding, errors);
2407 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002408 return NULL;
2409
2410 /* The normal path */
2411 if (PyBytes_Check(v))
2412 return v;
2413
2414 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002415 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002416 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002417 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002418
2419 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2420 "encoder %s returned bytearray instead of bytes",
2421 encoding);
2422 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002423 Py_DECREF(v);
2424 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002425 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002426
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002427 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2428 Py_DECREF(v);
2429 return b;
2430 }
2431
2432 PyErr_Format(PyExc_TypeError,
2433 "encoder did not return a bytes object (type=%.400s)",
2434 Py_TYPE(v)->tp_name);
2435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002436 return NULL;
2437}
2438
Alexander Belopolsky40018472011-02-26 01:02:56 +00002439PyObject *
2440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002441 const char *encoding,
2442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002443{
2444 PyObject *v;
2445
2446 if (!PyUnicode_Check(unicode)) {
2447 PyErr_BadArgument();
2448 goto onError;
2449 }
2450
2451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002453
2454 /* Encode via the codec registry */
2455 v = PyCodec_Encode(unicode, encoding, errors);
2456 if (v == NULL)
2457 goto onError;
2458 if (!PyUnicode_Check(v)) {
2459 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002460 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002461 Py_TYPE(v)->tp_name);
2462 Py_DECREF(v);
2463 goto onError;
2464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002466
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 return NULL;
2469}
2470
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002471PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002472PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002473 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002474 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2475}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002476
Christian Heimes5894ba72007-11-04 11:43:14 +00002477PyObject*
2478PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2479{
Victor Stinner99b95382011-07-04 14:23:54 +02002480#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002481 return PyUnicode_DecodeMBCS(s, size, NULL);
2482#elif defined(__APPLE__)
2483 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2484#else
Victor Stinner793b5312011-04-27 00:24:21 +02002485 PyInterpreterState *interp = PyThreadState_GET()->interp;
2486 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2487 cannot use it to encode and decode filenames before it is loaded. Load
2488 the Python codec requires to encode at least its own filename. Use the C
2489 version of the locale codec until the codec registry is initialized and
2490 the Python codec is loaded.
2491
2492 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2493 cannot only rely on it: check also interp->fscodec_initialized for
2494 subinterpreters. */
2495 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002496 return PyUnicode_Decode(s, size,
2497 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002498 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002499 }
2500 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002501 /* locale encoding with surrogateescape */
2502 wchar_t *wchar;
2503 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002504 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002505
2506 if (s[size] != '\0' || size != strlen(s)) {
2507 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2508 return NULL;
2509 }
2510
Victor Stinner168e1172010-10-16 23:16:16 +00002511 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002512 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002513 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002514
Victor Stinner168e1172010-10-16 23:16:16 +00002515 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002516 PyMem_Free(wchar);
2517 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002518 }
Victor Stinnerad158722010-10-27 00:25:46 +00002519#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002520}
2521
Martin v. Löwis011e8422009-05-05 04:43:17 +00002522
2523int
2524PyUnicode_FSConverter(PyObject* arg, void* addr)
2525{
2526 PyObject *output = NULL;
2527 Py_ssize_t size;
2528 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002529 if (arg == NULL) {
2530 Py_DECREF(*(PyObject**)addr);
2531 return 1;
2532 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002533 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002534 output = arg;
2535 Py_INCREF(output);
2536 }
2537 else {
2538 arg = PyUnicode_FromObject(arg);
2539 if (!arg)
2540 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002541 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002542 Py_DECREF(arg);
2543 if (!output)
2544 return 0;
2545 if (!PyBytes_Check(output)) {
2546 Py_DECREF(output);
2547 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2548 return 0;
2549 }
2550 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002551 size = PyBytes_GET_SIZE(output);
2552 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002553 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002554 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002555 Py_DECREF(output);
2556 return 0;
2557 }
2558 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002559 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002560}
2561
2562
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002563int
2564PyUnicode_FSDecoder(PyObject* arg, void* addr)
2565{
2566 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002567 if (arg == NULL) {
2568 Py_DECREF(*(PyObject**)addr);
2569 return 1;
2570 }
2571 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 if (PyUnicode_READY(arg))
2573 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002574 output = arg;
2575 Py_INCREF(output);
2576 }
2577 else {
2578 arg = PyBytes_FromObject(arg);
2579 if (!arg)
2580 return 0;
2581 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2582 PyBytes_GET_SIZE(arg));
2583 Py_DECREF(arg);
2584 if (!output)
2585 return 0;
2586 if (!PyUnicode_Check(output)) {
2587 Py_DECREF(output);
2588 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2589 return 0;
2590 }
2591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2593 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002594 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2595 Py_DECREF(output);
2596 return 0;
2597 }
2598 *(PyObject**)addr = output;
2599 return Py_CLEANUP_SUPPORTED;
2600}
2601
2602
Martin v. Löwis5b222132007-06-10 09:51:05 +00002603char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002605{
Christian Heimesf3863112007-11-22 07:46:41 +00002606 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2608
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002609 if (!PyUnicode_Check(unicode)) {
2610 PyErr_BadArgument();
2611 return NULL;
2612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002614 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615
2616 if (_PyUnicode_UTF8(unicode) == NULL) {
2617 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2618 if (bytes == NULL)
2619 return NULL;
2620 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2621 if (u->_base.utf8 == NULL) {
2622 Py_DECREF(bytes);
2623 return NULL;
2624 }
2625 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2626 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2627 Py_DECREF(bytes);
2628 }
2629
2630 if (psize)
2631 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2632 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002633}
2634
2635char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2639}
2640
2641#ifdef Py_DEBUG
2642int unicode_as_unicode_calls = 0;
2643#endif
2644
2645
2646Py_UNICODE *
2647PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2648{
2649 PyUnicodeObject *u;
2650 const unsigned char *one_byte;
2651#if SIZEOF_WCHAR_T == 4
2652 const Py_UCS2 *two_bytes;
2653#else
2654 const Py_UCS4 *four_bytes;
2655 const Py_UCS4 *ucs4_end;
2656 Py_ssize_t num_surrogates;
2657#endif
2658 wchar_t *w;
2659 wchar_t *wchar_end;
2660
2661 if (!PyUnicode_Check(unicode)) {
2662 PyErr_BadArgument();
2663 return NULL;
2664 }
2665 u = (PyUnicodeObject*)unicode;
2666 if (_PyUnicode_WSTR(u) == NULL) {
2667 /* Non-ASCII compact unicode object */
2668 assert(_PyUnicode_KIND(u) != 0);
2669 assert(PyUnicode_IS_READY(u));
2670
2671#ifdef Py_DEBUG
2672 ++unicode_as_unicode_calls;
2673#endif
2674
2675 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2676#if SIZEOF_WCHAR_T == 2
2677 four_bytes = PyUnicode_4BYTE_DATA(u);
2678 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2679 num_surrogates = 0;
2680
2681 for (; four_bytes < ucs4_end; ++four_bytes) {
2682 if (*four_bytes > 0xFFFF)
2683 ++num_surrogates;
2684 }
2685
2686 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2687 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2688 if (!_PyUnicode_WSTR(u)) {
2689 PyErr_NoMemory();
2690 return NULL;
2691 }
2692 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2693
2694 w = _PyUnicode_WSTR(u);
2695 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2696 four_bytes = PyUnicode_4BYTE_DATA(u);
2697 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2698 if (*four_bytes > 0xFFFF) {
2699 /* encode surrogate pair in this case */
2700 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2701 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2702 }
2703 else
2704 *w = *four_bytes;
2705
2706 if (w > wchar_end) {
2707 assert(0 && "Miscalculated string end");
2708 }
2709 }
2710 *w = 0;
2711#else
2712 /* sizeof(wchar_t) == 4 */
2713 Py_FatalError("Impossible unicode object state, wstr and str "
2714 "should share memory already.");
2715 return NULL;
2716#endif
2717 }
2718 else {
2719 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2720 (_PyUnicode_LENGTH(u) + 1));
2721 if (!_PyUnicode_WSTR(u)) {
2722 PyErr_NoMemory();
2723 return NULL;
2724 }
2725 if (!PyUnicode_IS_COMPACT_ASCII(u))
2726 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2727 w = _PyUnicode_WSTR(u);
2728 wchar_end = w + _PyUnicode_LENGTH(u);
2729
2730 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2731 one_byte = PyUnicode_1BYTE_DATA(u);
2732 for (; w < wchar_end; ++one_byte, ++w)
2733 *w = *one_byte;
2734 /* null-terminate the wstr */
2735 *w = 0;
2736 }
2737 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2738#if SIZEOF_WCHAR_T == 4
2739 two_bytes = PyUnicode_2BYTE_DATA(u);
2740 for (; w < wchar_end; ++two_bytes, ++w)
2741 *w = *two_bytes;
2742 /* null-terminate the wstr */
2743 *w = 0;
2744#else
2745 /* sizeof(wchar_t) == 2 */
2746 PyObject_FREE(_PyUnicode_WSTR(u));
2747 _PyUnicode_WSTR(u) = NULL;
2748 Py_FatalError("Impossible unicode object state, wstr "
2749 "and str should share memory already.");
2750 return NULL;
2751#endif
2752 }
2753 else {
2754 assert(0 && "This should never happen.");
2755 }
2756 }
2757 }
2758 if (size != NULL)
2759 *size = PyUnicode_WSTR_LENGTH(u);
2760 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002761}
2762
Alexander Belopolsky40018472011-02-26 01:02:56 +00002763Py_UNICODE *
2764PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767}
2768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769
Alexander Belopolsky40018472011-02-26 01:02:56 +00002770Py_ssize_t
2771PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772{
2773 if (!PyUnicode_Check(unicode)) {
2774 PyErr_BadArgument();
2775 goto onError;
2776 }
2777 return PyUnicode_GET_SIZE(unicode);
2778
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 return -1;
2781}
2782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783Py_ssize_t
2784PyUnicode_GetLength(PyObject *unicode)
2785{
2786 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2787 PyErr_BadArgument();
2788 return -1;
2789 }
2790
2791 return PyUnicode_GET_LENGTH(unicode);
2792}
2793
2794Py_UCS4
2795PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2796{
2797 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2798 return PyErr_BadArgument();
2799 return (Py_UCS4)-1;
2800 }
2801 return PyUnicode_READ_CHAR(unicode, index);
2802}
2803
2804int
2805PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2806{
2807 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2808 return PyErr_BadArgument();
2809 return -1;
2810 }
2811
2812 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2813 index, ch);
2814 return 0;
2815}
2816
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2822
Victor Stinner554f3f02010-06-16 23:33:54 +00002823/* create or adjust a UnicodeDecodeError */
2824static void
2825make_decode_exception(PyObject **exceptionObject,
2826 const char *encoding,
2827 const char *input, Py_ssize_t length,
2828 Py_ssize_t startpos, Py_ssize_t endpos,
2829 const char *reason)
2830{
2831 if (*exceptionObject == NULL) {
2832 *exceptionObject = PyUnicodeDecodeError_Create(
2833 encoding, input, length, startpos, endpos, reason);
2834 }
2835 else {
2836 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2837 goto onError;
2838 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2839 goto onError;
2840 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2841 goto onError;
2842 }
2843 return;
2844
2845onError:
2846 Py_DECREF(*exceptionObject);
2847 *exceptionObject = NULL;
2848}
2849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850/* error handling callback helper:
2851 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002852 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 and adjust various state variables.
2854 return 0 on success, -1 on error
2855*/
2856
Alexander Belopolsky40018472011-02-26 01:02:56 +00002857static int
2858unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002859 const char *encoding, const char *reason,
2860 const char **input, const char **inend, Py_ssize_t *startinpos,
2861 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2862 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002864 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865
2866 PyObject *restuple = NULL;
2867 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002868 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002869 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002870 Py_ssize_t requiredsize;
2871 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002873 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002874 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875 int res = -1;
2876
2877 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 *errorHandler = PyCodec_LookupError(errors);
2879 if (*errorHandler == NULL)
2880 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 }
2882
Victor Stinner554f3f02010-06-16 23:33:54 +00002883 make_decode_exception(exceptionObject,
2884 encoding,
2885 *input, *inend - *input,
2886 *startinpos, *endinpos,
2887 reason);
2888 if (*exceptionObject == NULL)
2889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890
2891 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2892 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002895 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002897 }
2898 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002900
2901 /* Copy back the bytes variables, which might have been modified by the
2902 callback */
2903 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2904 if (!inputobj)
2905 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002906 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002908 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002909 *input = PyBytes_AS_STRING(inputobj);
2910 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002911 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002912 /* we can DECREF safely, as the exception has another reference,
2913 so the object won't go away. */
2914 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002916 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002918 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2920 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922
2923 /* need more space? (at least enough for what we
2924 have+the replacement+the rest of the string (starting
2925 at the new input position), so we won't have to check space
2926 when there are no errors in the rest of the string) */
2927 repptr = PyUnicode_AS_UNICODE(repunicode);
2928 repsize = PyUnicode_GET_SIZE(repunicode);
2929 requiredsize = *outpos + repsize + insize-newpos;
2930 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 if (requiredsize<2*outsize)
2932 requiredsize = 2*outsize;
2933 if (_PyUnicode_Resize(output, requiredsize) < 0)
2934 goto onError;
2935 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 }
2937 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002938 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 Py_UNICODE_COPY(*outptr, repptr, repsize);
2940 *outptr += repsize;
2941 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002942
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002943 /* we made it! */
2944 res = 0;
2945
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 Py_XDECREF(restuple);
2948 return res;
2949}
2950
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002951/* --- UTF-7 Codec -------------------------------------------------------- */
2952
Antoine Pitrou244651a2009-05-04 18:56:13 +00002953/* See RFC2152 for details. We encode conservatively and decode liberally. */
2954
/* Three simple macros defining base-64.  Each one evaluates its argument
   more than once, so the argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' ||                      \
     (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
   Any c outside the letter, digit and '+' ranges maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
2977
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every byte up to 127 qualifies except
 * '+', which begins a base64 string.  The argument is evaluated twice. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2985
2986/* The UTF-7 encoder treats ASCII characters differently according to
2987 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
2988 * the above). See RFC2152. This array identifies these different
2989 * sets:
2990 * 0 : "Set D"
2991 * alphanumeric and '(),-./:?
2992 * 1 : "Set O"
2993 * !"#$%&*;<=>@[]^_`{|}
2994 * 2 : "whitespace"
2995 * ht nl cr sp
2996 * 3 : special (must be base64 encoded)
2997 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
2998 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002999
/* Category of each ASCII character for the UTF-7 encoder:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 *   encoded).
 * The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times); values
 * outside 1..127 are never encoded directly and never index the table. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003031
Alexander Belopolsky40018472011-02-26 01:02:56 +00003032PyObject *
3033PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003034 Py_ssize_t size,
3035 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003036{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003037 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3038}
3039
Antoine Pitrou244651a2009-05-04 18:56:13 +00003040/* The decoder. The only state we preserve is our read position,
3041 * i.e. how many characters we have consumed. So if we end in the
3042 * middle of a shift sequence we have to back off the read position
3043 * and the output to the beginning of the sequence, otherwise we lose
3044 * all the shift state (seen bits, number of bits seen, high
3045 * surrogate). */
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *errors,
3051 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003054 Py_ssize_t startinpos;
3055 Py_ssize_t endinpos;
3056 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003057 const char *e;
3058 PyUnicodeObject *unicode;
3059 Py_UNICODE *p;
3060 const char *errmsg = "";
3061 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003062 Py_UNICODE *shiftOutStart;
3063 unsigned int base64bits = 0;
3064 unsigned long base64buffer = 0;
3065 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *errorHandler = NULL;
3067 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003068
3069 unicode = _PyUnicode_New(size);
3070 if (!unicode)
3071 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003072 if (size == 0) {
3073 if (consumed)
3074 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003075 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003076 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003078 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003079 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003080 e = s + size;
3081
3082 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003083 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003085 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003086
Antoine Pitrou244651a2009-05-04 18:56:13 +00003087 if (inShift) { /* in a base-64 section */
3088 if (IS_BASE64(ch)) { /* consume a base-64 character */
3089 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3090 base64bits += 6;
3091 s++;
3092 if (base64bits >= 16) {
3093 /* we have enough bits for a UTF-16 value */
3094 Py_UNICODE outCh = (Py_UNICODE)
3095 (base64buffer >> (base64bits-16));
3096 base64bits -= 16;
3097 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3098 if (surrogate) {
3099 /* expecting a second surrogate */
3100 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3101#ifdef Py_UNICODE_WIDE
3102 *p++ = (((surrogate & 0x3FF)<<10)
3103 | (outCh & 0x3FF)) + 0x10000;
3104#else
3105 *p++ = surrogate;
3106 *p++ = outCh;
3107#endif
3108 surrogate = 0;
3109 }
3110 else {
3111 surrogate = 0;
3112 errmsg = "second surrogate missing";
3113 goto utf7Error;
3114 }
3115 }
3116 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3117 /* first surrogate */
3118 surrogate = outCh;
3119 }
3120 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3121 errmsg = "unexpected second surrogate";
3122 goto utf7Error;
3123 }
3124 else {
3125 *p++ = outCh;
3126 }
3127 }
3128 }
3129 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003130 inShift = 0;
3131 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003132 if (surrogate) {
3133 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003134 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003135 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003136 if (base64bits > 0) { /* left-over bits */
3137 if (base64bits >= 6) {
3138 /* We've seen at least one base-64 character */
3139 errmsg = "partial character in shift sequence";
3140 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003141 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003142 else {
3143 /* Some bits remain; they should be zero */
3144 if (base64buffer != 0) {
3145 errmsg = "non-zero padding bits in shift sequence";
3146 goto utf7Error;
3147 }
3148 }
3149 }
3150 if (ch != '-') {
3151 /* '-' is absorbed; other terminating
3152 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 *p++ = ch;
3154 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003155 }
3156 }
3157 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003159 s++; /* consume '+' */
3160 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003161 s++;
3162 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003163 }
3164 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003165 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003166 shiftOutStart = p;
3167 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003168 }
3169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003170 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003171 *p++ = ch;
3172 s++;
3173 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003174 else {
3175 startinpos = s-starts;
3176 s++;
3177 errmsg = "unexpected special character";
3178 goto utf7Error;
3179 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003180 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003181utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003182 outpos = p-PyUnicode_AS_UNICODE(unicode);
3183 endinpos = s-starts;
3184 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 errors, &errorHandler,
3186 "utf7", errmsg,
3187 &starts, &e, &startinpos, &endinpos, &exc, &s,
3188 &unicode, &outpos, &p))
3189 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003190 }
3191
Antoine Pitrou244651a2009-05-04 18:56:13 +00003192 /* end of string */
3193
3194 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3195 /* if we're in an inconsistent state, that's an error */
3196 if (surrogate ||
3197 (base64bits >= 6) ||
3198 (base64bits > 0 && base64buffer != 0)) {
3199 outpos = p-PyUnicode_AS_UNICODE(unicode);
3200 endinpos = size;
3201 if (unicode_decode_call_errorhandler(
3202 errors, &errorHandler,
3203 "utf7", "unterminated shift sequence",
3204 &starts, &e, &startinpos, &endinpos, &exc, &s,
3205 &unicode, &outpos, &p))
3206 goto onError;
3207 if (s < e)
3208 goto restart;
3209 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003210 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003211
3212 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003213 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003214 if (inShift) {
3215 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003216 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003217 }
3218 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003219 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003220 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003221 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003222
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003223 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003224 goto onError;
3225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 Py_XDECREF(errorHandler);
3227 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003228 if (PyUnicode_READY(unicode) == -1) {
3229 Py_DECREF(unicode);
3230 return NULL;
3231 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003232 return (PyObject *)unicode;
3233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 Py_XDECREF(errorHandler);
3236 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003237 Py_DECREF(unicode);
3238 return NULL;
3239}
3240
3241
Alexander Belopolsky40018472011-02-26 01:02:56 +00003242PyObject *
3243PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003244 Py_ssize_t size,
3245 int base64SetO,
3246 int base64WhiteSpace,
3247 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003248{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003249 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003250 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003251 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003252 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003253 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003254 unsigned int base64bits = 0;
3255 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003256 char * out;
3257 char * start;
3258
3259 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003261
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003262 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003263 return PyErr_NoMemory();
3264
Antoine Pitrou244651a2009-05-04 18:56:13 +00003265 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003266 if (v == NULL)
3267 return NULL;
3268
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003269 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003270 for (;i < size; ++i) {
3271 Py_UNICODE ch = s[i];
3272
Antoine Pitrou244651a2009-05-04 18:56:13 +00003273 if (inShift) {
3274 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3275 /* shifting out */
3276 if (base64bits) { /* output remaining bits */
3277 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3278 base64buffer = 0;
3279 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003280 }
3281 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003282 /* Characters not in the BASE64 set implicitly unshift the sequence
3283 so no '-' is required, except if the character is itself a '-' */
3284 if (IS_BASE64(ch) || ch == '-') {
3285 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003287 *out++ = (char) ch;
3288 }
3289 else {
3290 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003293 else { /* not in a shift sequence */
3294 if (ch == '+') {
3295 *out++ = '+';
3296 *out++ = '-';
3297 }
3298 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3299 *out++ = (char) ch;
3300 }
3301 else {
3302 *out++ = '+';
3303 inShift = 1;
3304 goto encode_char;
3305 }
3306 }
3307 continue;
3308encode_char:
3309#ifdef Py_UNICODE_WIDE
3310 if (ch >= 0x10000) {
3311 /* code first surrogate */
3312 base64bits += 16;
3313 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3314 while (base64bits >= 6) {
3315 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3316 base64bits -= 6;
3317 }
3318 /* prepare second surrogate */
3319 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3320 }
3321#endif
3322 base64bits += 16;
3323 base64buffer = (base64buffer << 16) | ch;
3324 while (base64bits >= 6) {
3325 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3326 base64bits -= 6;
3327 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003328 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003329 if (base64bits)
3330 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3331 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003332 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003333 if (_PyBytes_Resize(&v, out - start) < 0)
3334 return NULL;
3335 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003336}
3337
Antoine Pitrou244651a2009-05-04 18:56:13 +00003338#undef IS_BASE64
3339#undef FROM_BASE64
3340#undef TO_BASE64
3341#undef DECODE_DIRECT
3342#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003343
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344/* --- UTF-8 Codec -------------------------------------------------------- */
3345
/* Map a UTF-8 encoded prefix (lead) byte to the total length of the
   sequence it starts.  Zero means the byte is an illegal prefix.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */
    /* 80-BF: never valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */
    /* C0-C1 are illegal, C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,  /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */
    /* F0-F4 start four-byte sequences, F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0   /* F0-FF */
};
3367
Alexander Belopolsky40018472011-02-26 01:02:56 +00003368PyObject *
3369PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003370 Py_ssize_t size,
3371 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372{
Walter Dörwald69652032004-09-07 20:24:22 +00003373 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3374}
3375
Antoine Pitrouab868312009-01-10 15:40:25 +00003376/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3377#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3378
3379/* Mask to quickly check whether a C 'long' contains a
3380 non-ASCII, UTF8-encoded char. */
3381#if (SIZEOF_LONG == 8)
3382# define ASCII_CHAR_MASK 0x8080808080808080L
3383#elif (SIZEOF_LONG == 4)
3384# define ASCII_CHAR_MASK 0x80808080L
3385#else
3386# error C 'long' size should be either 4 or 8!
3387#endif
3388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389/* Scans a UTF-8 string and returns the maximum character to be expected,
3390 the size of the decoded unicode string and if any major errors were
3391 encountered.
3392
3393 This function does check basic UTF-8 sanity, it does however NOT CHECK
3394 if the string contains surrogates, and if all continuation bytes are
3395 within the correct ranges, these checks are performed in
3396 PyUnicode_DecodeUTF8Stateful.
3397
3398 If it sets has_errors to 1, it means the value of unicode_size and max_char
3399 will be bogus and you should not rely on useful information in them.
3400 */
3401static Py_UCS4
3402utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3403 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3404 int *has_errors)
3405{
3406 Py_ssize_t n;
3407 Py_ssize_t char_count = 0;
3408 Py_UCS4 max_char = 127, new_max;
3409 Py_UCS4 upper_bound;
3410 const unsigned char *p = (const unsigned char *)s;
3411 const unsigned char *end = p + string_size;
3412 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3413 int err = 0;
3414
3415 for (; p < end && !err; ++p, ++char_count) {
3416 /* Only check value if it's not a ASCII char... */
3417 if (*p < 0x80) {
3418 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3419 an explanation. */
3420 if (!((size_t) p & LONG_PTR_MASK)) {
3421 /* Help register allocation */
3422 register const unsigned char *_p = p;
3423 while (_p < aligned_end) {
3424 unsigned long value = *(unsigned long *) _p;
3425 if (value & ASCII_CHAR_MASK)
3426 break;
3427 _p += SIZEOF_LONG;
3428 char_count += SIZEOF_LONG;
3429 }
3430 p = _p;
3431 if (p == end)
3432 break;
3433 }
3434 }
3435 if (*p >= 0x80) {
3436 n = utf8_code_length[*p];
3437 new_max = max_char;
3438 switch (n) {
3439 /* invalid start byte */
3440 case 0:
3441 err = 1;
3442 break;
3443 case 2:
3444 /* Code points between 0x00FF and 0x07FF inclusive.
3445 Approximate the upper bound of the code point,
3446 if this flips over 255 we can be sure it will be more
3447 than 255 and the string will need 2 bytes per code coint,
3448 if it stays under or equal to 255, we can be sure 1 byte
3449 is enough.
3450 ((*p & 0b00011111) << 6) | 0b00111111 */
3451 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3452 if (max_char < upper_bound)
3453 new_max = upper_bound;
3454 /* Ensure we track at least that we left ASCII space. */
3455 if (new_max < 128)
3456 new_max = 128;
3457 break;
3458 case 3:
3459 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3460 always > 255 and <= 65535 and will always need 2 bytes. */
3461 if (max_char < 65535)
3462 new_max = 65535;
3463 break;
3464 case 4:
3465 /* Code point will be above 0xFFFF for sure in this case. */
3466 new_max = 65537;
3467 break;
3468 /* Internal error, this should be caught by the first if */
3469 case 1:
3470 default:
3471 assert(0 && "Impossible case in utf8_max_char_and_size");
3472 err = 1;
3473 }
3474 /* Instead of number of overall bytes for this code point,
3475 n containts the number of following bytes: */
3476 --n;
3477 /* Check if the follow up chars are all valid continuation bytes */
3478 if (n >= 1) {
3479 const unsigned char *cont;
3480 if ((p + n) >= end) {
3481 if (consumed == 0)
3482 /* incomplete data, non-incremental decoding */
3483 err = 1;
3484 break;
3485 }
3486 for (cont = p + 1; cont < (p + n); ++cont) {
3487 if ((*cont & 0xc0) != 0x80) {
3488 err = 1;
3489 break;
3490 }
3491 }
3492 p += n;
3493 }
3494 else
3495 err = 1;
3496 max_char = new_max;
3497 }
3498 }
3499
3500 if (unicode_size)
3501 *unicode_size = char_count;
3502 if (has_errors)
3503 *has_errors = err;
3504 return max_char;
3505}
3506
/* Store `value` at position `index` of the character buffer `buf`,
   using the element type selected by `kind`: Py_UNICODE for
   PyUnicode_WCHAR_KIND (the wstr field of the legacy representation),
   unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.  Similar to
   PyUnicode_WRITE, which does not cover the wstr case.
   `kind`, `buf`, `index` and `value` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        switch (k_) {                                                   \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
3521
Alexander Belopolsky40018472011-02-26 01:02:56 +00003522PyObject *
3523PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524 Py_ssize_t size,
3525 const char *errors,
3526 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003527{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003530 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003531 Py_ssize_t startinpos;
3532 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003533 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003535 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 PyObject *errorHandler = NULL;
3537 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538 Py_UCS4 maxchar = 0;
3539 Py_ssize_t unicode_size;
3540 Py_ssize_t i;
3541 int kind;
3542 void *data;
3543 int has_errors;
3544 Py_UNICODE *error_outptr;
3545#if SIZEOF_WCHAR_T == 2
3546 Py_ssize_t wchar_offset = 0;
3547#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548
Walter Dörwald69652032004-09-07 20:24:22 +00003549 if (size == 0) {
3550 if (consumed)
3551 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003554 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3555 consumed, &has_errors);
3556 if (has_errors) {
3557 unicode = _PyUnicode_New(size);
3558 if (!unicode)
3559 return NULL;
3560 kind = PyUnicode_WCHAR_KIND;
3561 data = PyUnicode_AS_UNICODE(unicode);
3562 assert(data != NULL);
3563 }
3564 else {
3565 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3566 if (!unicode)
3567 return NULL;
3568 /* When the string is ASCII only, just use memcpy and return.
3569 unicode_size may be != size if there is an incomplete UTF-8
3570 sequence at the end of the ASCII block. */
3571 if (maxchar < 128 && size == unicode_size) {
3572 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3573 return (PyObject *)unicode;
3574 }
3575 kind = PyUnicode_KIND(unicode);
3576 data = PyUnicode_DATA(unicode);
3577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003581 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582
3583 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003584 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585
3586 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003587 /* Fast path for runs of ASCII characters. Given that common UTF-8
3588 input will consist of an overwhelming majority of ASCII
3589 characters, we try to optimize for this case by checking
3590 as many characters as a C 'long' can contain.
3591 First, check if we can do an aligned read, as most CPUs have
3592 a penalty for unaligned reads.
3593 */
3594 if (!((size_t) s & LONG_PTR_MASK)) {
3595 /* Help register allocation */
3596 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003597 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003598 while (_s < aligned_end) {
3599 /* Read a whole long at a time (either 4 or 8 bytes),
3600 and do a fast unrolled copy if it only contains ASCII
3601 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 unsigned long value = *(unsigned long *) _s;
3603 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003604 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003605 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3606 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3607 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3608 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003609#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003610 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3611 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3612 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3613 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003614#endif
3615 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003617 }
3618 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003619 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003620 if (s == e)
3621 break;
3622 ch = (unsigned char)*s;
3623 }
3624 }
3625
3626 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003627 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 s++;
3629 continue;
3630 }
3631
3632 n = utf8_code_length[ch];
3633
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003634 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 if (consumed)
3636 break;
3637 else {
3638 errmsg = "unexpected end of data";
3639 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003640 endinpos = startinpos+1;
3641 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3642 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 goto utf8Error;
3644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646
3647 switch (n) {
3648
3649 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003650 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 startinpos = s-starts;
3652 endinpos = startinpos+1;
3653 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654
3655 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003656 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 startinpos = s-starts;
3658 endinpos = startinpos+1;
3659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660
3661 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003662 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003663 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003665 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 goto utf8Error;
3667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003669 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003670 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 break;
3672
3673 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003674 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3675 will result in surrogates in range d800-dfff. Surrogates are
3676 not valid UTF-8 so they are rejected.
3677 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3678 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003679 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003680 (s[2] & 0xc0) != 0x80 ||
3681 ((unsigned char)s[0] == 0xE0 &&
3682 (unsigned char)s[1] < 0xA0) ||
3683 ((unsigned char)s[0] == 0xED &&
3684 (unsigned char)s[1] > 0x9F)) {
3685 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003687 endinpos = startinpos + 1;
3688
3689 /* if s[1] first two bits are 1 and 0, then the invalid
3690 continuation byte is s[2], so increment endinpos by 1,
3691 if not, s[1] is invalid and endinpos doesn't need to
3692 be incremented. */
3693 if ((s[1] & 0xC0) == 0x80)
3694 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 goto utf8Error;
3696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003698 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003700 break;
3701
3702 case 4:
3703 if ((s[1] & 0xc0) != 0x80 ||
3704 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003705 (s[3] & 0xc0) != 0x80 ||
3706 ((unsigned char)s[0] == 0xF0 &&
3707 (unsigned char)s[1] < 0x90) ||
3708 ((unsigned char)s[0] == 0xF4 &&
3709 (unsigned char)s[1] > 0x8F)) {
3710 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003712 endinpos = startinpos + 1;
3713 if ((s[1] & 0xC0) == 0x80) {
3714 endinpos++;
3715 if ((s[2] & 0xC0) == 0x80)
3716 endinpos++;
3717 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 goto utf8Error;
3719 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003720 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003721 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3722 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003724 /* If the string is flexible or we have native UCS-4, write
3725 directly.. */
3726 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3727 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 else {
3730 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 /* translate from 10000..10FFFF to 0..FFFF */
3733 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 /* high surrogate = top 10 bits added to D800 */
3736 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3737 (Py_UNICODE)(0xD800 + (ch >> 10)));
3738
3739 /* low surrogate = bottom 10 bits added to DC00 */
3740 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3741 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3742 }
3743#if SIZEOF_WCHAR_T == 2
3744 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003745#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 }
3748 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752 /* If this is not yet a resizable string, make it one.. */
3753 if (kind != PyUnicode_WCHAR_KIND) {
3754 const Py_UNICODE *u;
3755 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3756 if (!new_unicode)
3757 goto onError;
3758 u = PyUnicode_AsUnicode((PyObject *)unicode);
3759 if (!u)
3760 goto onError;
3761#if SIZEOF_WCHAR_T == 2
3762 i += wchar_offset;
3763#endif
3764 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3765 Py_DECREF(unicode);
3766 unicode = new_unicode;
3767 kind = 0;
3768 data = PyUnicode_AS_UNICODE(new_unicode);
3769 assert(data != NULL);
3770 }
3771 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 if (unicode_decode_call_errorhandler(
3773 errors, &errorHandler,
3774 "utf8", errmsg,
3775 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 /* Update data because unicode_decode_call_errorhandler might have
3779 re-created or resized the unicode object. */
3780 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 /* Ensure the unicode_size calculation above was correct: */
3784 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3785
Walter Dörwald69652032004-09-07 20:24:22 +00003786 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 /* Adjust length and ready string when it contained errors and
3790 is of the old resizable kind. */
3791 if (kind == PyUnicode_WCHAR_KIND) {
3792 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3793 PyUnicode_READY(unicode) == -1)
3794 goto onError;
3795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 Py_XDECREF(errorHandler);
3798 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799 if (PyUnicode_READY(unicode) == -1) {
3800 Py_DECREF(unicode);
3801 return NULL;
3802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 return (PyObject *)unicode;
3804
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_XDECREF(errorHandler);
3807 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 Py_DECREF(unicode);
3809 return NULL;
3810}
3811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003813
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003814#ifdef __APPLE__
3815
3816/* Simplified UTF-8 decoder using surrogateescape error handler,
3817 used to decode the command line arguments on Mac OS X. */
3818
3819wchar_t*
3820_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3821{
3822 int n;
3823 const char *e;
3824 wchar_t *unicode, *p;
3825
3826 /* Note: size will always be longer than the resulting Unicode
3827 character count */
3828 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3829 PyErr_NoMemory();
3830 return NULL;
3831 }
3832 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3833 if (!unicode)
3834 return NULL;
3835
3836 /* Unpack UTF-8 encoded data */
3837 p = unicode;
3838 e = s + size;
3839 while (s < e) {
3840 Py_UCS4 ch = (unsigned char)*s;
3841
3842 if (ch < 0x80) {
3843 *p++ = (wchar_t)ch;
3844 s++;
3845 continue;
3846 }
3847
3848 n = utf8_code_length[ch];
3849 if (s + n > e) {
3850 goto surrogateescape;
3851 }
3852
3853 switch (n) {
3854 case 0:
3855 case 1:
3856 goto surrogateescape;
3857
3858 case 2:
3859 if ((s[1] & 0xc0) != 0x80)
3860 goto surrogateescape;
3861 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3862 assert ((ch > 0x007F) && (ch <= 0x07FF));
3863 *p++ = (wchar_t)ch;
3864 break;
3865
3866 case 3:
3867 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3868 will result in surrogates in range d800-dfff. Surrogates are
3869 not valid UTF-8 so they are rejected.
3870 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3871 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3872 if ((s[1] & 0xc0) != 0x80 ||
3873 (s[2] & 0xc0) != 0x80 ||
3874 ((unsigned char)s[0] == 0xE0 &&
3875 (unsigned char)s[1] < 0xA0) ||
3876 ((unsigned char)s[0] == 0xED &&
3877 (unsigned char)s[1] > 0x9F)) {
3878
3879 goto surrogateescape;
3880 }
3881 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3882 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003884 break;
3885
3886 case 4:
3887 if ((s[1] & 0xc0) != 0x80 ||
3888 (s[2] & 0xc0) != 0x80 ||
3889 (s[3] & 0xc0) != 0x80 ||
3890 ((unsigned char)s[0] == 0xF0 &&
3891 (unsigned char)s[1] < 0x90) ||
3892 ((unsigned char)s[0] == 0xF4 &&
3893 (unsigned char)s[1] > 0x8F)) {
3894 goto surrogateescape;
3895 }
3896 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3897 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3898 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3899
3900#if SIZEOF_WCHAR_T == 4
3901 *p++ = (wchar_t)ch;
3902#else
3903 /* compute and append the two surrogates: */
3904
3905 /* translate from 10000..10FFFF to 0..FFFF */
3906 ch -= 0x10000;
3907
3908 /* high surrogate = top 10 bits added to D800 */
3909 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3910
3911 /* low surrogate = bottom 10 bits added to DC00 */
3912 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3913#endif
3914 break;
3915 }
3916 s += n;
3917 continue;
3918
3919 surrogateescape:
3920 *p++ = 0xDC00 + ch;
3921 s++;
3922 }
3923 *p = L'\0';
3924 return unicode;
3925}
3926
3927#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929/* Primary internal function which creates utf8 encoded bytes objects.
3930
3931 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003932 and allocate exactly as much space needed at the end. Else allocate the
3933 maximum possible needed (4 result bytes per Unicode character), and return
3934 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003935*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003936PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938{
Tim Peters602f7402002-04-27 18:03:26 +00003939#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003940
Guido van Rossum98297ee2007-11-06 21:34:58 +00003941 Py_ssize_t i; /* index into s of next input byte */
3942 PyObject *result; /* result string object */
3943 char *p; /* next free byte in output buffer */
3944 Py_ssize_t nallocated; /* number of result bytes allocated */
3945 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003946 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003947 PyObject *errorHandler = NULL;
3948 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 int kind;
3950 void *data;
3951 Py_ssize_t size;
3952 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3953#if SIZEOF_WCHAR_T == 2
3954 Py_ssize_t wchar_offset = 0;
3955#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 if (!PyUnicode_Check(unicode)) {
3958 PyErr_BadArgument();
3959 return NULL;
3960 }
3961
3962 if (PyUnicode_READY(unicode) == -1)
3963 return NULL;
3964
3965 if (_PyUnicode_UTF8(unicode))
3966 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3967 _PyUnicode_UTF8_LENGTH(unicode));
3968
3969 kind = PyUnicode_KIND(unicode);
3970 data = PyUnicode_DATA(unicode);
3971 size = PyUnicode_GET_LENGTH(unicode);
3972
Tim Peters602f7402002-04-27 18:03:26 +00003973 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974
Tim Peters602f7402002-04-27 18:03:26 +00003975 if (size <= MAX_SHORT_UNICHARS) {
3976 /* Write into the stack buffer; nallocated can't overflow.
3977 * At the end, we'll allocate exactly as much heap space as it
3978 * turns out we need.
3979 */
3980 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003981 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003982 p = stackbuf;
3983 }
3984 else {
3985 /* Overallocate on the heap, and give the excess back at the end. */
3986 nallocated = size * 4;
3987 if (nallocated / 4 != size) /* overflow! */
3988 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003989 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003990 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003991 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003992 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003993 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003994
Tim Peters602f7402002-04-27 18:03:26 +00003995 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00003997
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003998 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00003999 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004001
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004003 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004004 *p++ = (char)(0xc0 | (ch >> 6));
4005 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004006 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 Py_ssize_t newpos;
4008 PyObject *rep;
4009 Py_ssize_t repsize, k, startpos;
4010 startpos = i-1;
4011#if SIZEOF_WCHAR_T == 2
4012 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004013#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 rep = unicode_encode_call_errorhandler(
4015 errors, &errorHandler, "utf-8", "surrogates not allowed",
4016 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4017 &exc, startpos, startpos+1, &newpos);
4018 if (!rep)
4019 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 if (PyBytes_Check(rep))
4022 repsize = PyBytes_GET_SIZE(rep);
4023 else
4024 repsize = PyUnicode_GET_SIZE(rep);
4025
4026 if (repsize > 4) {
4027 Py_ssize_t offset;
4028
4029 if (result == NULL)
4030 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004031 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4035 /* integer overflow */
4036 PyErr_NoMemory();
4037 goto error;
4038 }
4039 nallocated += repsize - 4;
4040 if (result != NULL) {
4041 if (_PyBytes_Resize(&result, nallocated) < 0)
4042 goto error;
4043 } else {
4044 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004045 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 goto error;
4047 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4048 }
4049 p = PyBytes_AS_STRING(result) + offset;
4050 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 if (PyBytes_Check(rep)) {
4053 char *prep = PyBytes_AS_STRING(rep);
4054 for(k = repsize; k > 0; k--)
4055 *p++ = *prep++;
4056 } else /* rep is unicode */ {
4057 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4058 Py_UNICODE c;
4059
4060 for(k=0; k<repsize; k++) {
4061 c = prep[k];
4062 if (0x80 <= c) {
4063 raise_encode_exception(&exc, "utf-8",
4064 PyUnicode_AS_UNICODE(unicode),
4065 size, i-1, i,
4066 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004067 goto error;
4068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004070 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004073 } else if (ch < 0x10000) {
4074 *p++ = (char)(0xe0 | (ch >> 12));
4075 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4076 *p++ = (char)(0x80 | (ch & 0x3f));
4077 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004078 /* Encode UCS4 Unicode ordinals */
4079 *p++ = (char)(0xf0 | (ch >> 18));
4080 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4081 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4082 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083#if SIZEOF_WCHAR_T == 2
4084 wchar_offset++;
4085#endif
Tim Peters602f7402002-04-27 18:03:26 +00004086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004088
Guido van Rossum98297ee2007-11-06 21:34:58 +00004089 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004090 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004091 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004092 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004093 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004094 }
4095 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004096 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004097 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004098 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004099 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004102 Py_XDECREF(errorHandler);
4103 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004104 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004105 error:
4106 Py_XDECREF(errorHandler);
4107 Py_XDECREF(exc);
4108 Py_XDECREF(result);
4109 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004110
Tim Peters602f7402002-04-27 18:03:26 +00004111#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112}
4113
Alexander Belopolsky40018472011-02-26 01:02:56 +00004114PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4116 Py_ssize_t size,
4117 const char *errors)
4118{
4119 PyObject *v, *unicode;
4120
4121 unicode = PyUnicode_FromUnicode(s, size);
4122 if (unicode == NULL)
4123 return NULL;
4124 v = _PyUnicode_AsUTF8String(unicode, errors);
4125 Py_DECREF(unicode);
4126 return v;
4127}
4128
4129PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004130PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133}
4134
Walter Dörwald41980ca2007-08-16 21:55:45 +00004135/* --- UTF-32 Codec ------------------------------------------------------- */
4136
4137PyObject *
4138PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 Py_ssize_t size,
4140 const char *errors,
4141 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004142{
4143 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4144}
4145
4146PyObject *
4147PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 Py_ssize_t size,
4149 const char *errors,
4150 int *byteorder,
4151 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004152{
4153 const char *starts = s;
4154 Py_ssize_t startinpos;
4155 Py_ssize_t endinpos;
4156 Py_ssize_t outpos;
4157 PyUnicodeObject *unicode;
4158 Py_UNICODE *p;
4159#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004160 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004161 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004162#else
4163 const int pairs = 0;
4164#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004165 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004166 int bo = 0; /* assume native ordering by default */
4167 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004168 /* Offsets from q for retrieving bytes in the right order. */
4169#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4170 int iorder[] = {0, 1, 2, 3};
4171#else
4172 int iorder[] = {3, 2, 1, 0};
4173#endif
4174 PyObject *errorHandler = NULL;
4175 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004176
Walter Dörwald41980ca2007-08-16 21:55:45 +00004177 q = (unsigned char *)s;
4178 e = q + size;
4179
4180 if (byteorder)
4181 bo = *byteorder;
4182
4183 /* Check for BOM marks (U+FEFF) in the input and adjust current
4184 byte order setting accordingly. In native mode, the leading BOM
4185 mark is skipped, in all other modes, it is copied to the output
4186 stream as-is (giving a ZWNBSP character). */
4187 if (bo == 0) {
4188 if (size >= 4) {
4189 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004191#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 if (bom == 0x0000FEFF) {
4193 q += 4;
4194 bo = -1;
4195 }
4196 else if (bom == 0xFFFE0000) {
4197 q += 4;
4198 bo = 1;
4199 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004200#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 if (bom == 0x0000FEFF) {
4202 q += 4;
4203 bo = 1;
4204 }
4205 else if (bom == 0xFFFE0000) {
4206 q += 4;
4207 bo = -1;
4208 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004209#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004211 }
4212
4213 if (bo == -1) {
4214 /* force LE */
4215 iorder[0] = 0;
4216 iorder[1] = 1;
4217 iorder[2] = 2;
4218 iorder[3] = 3;
4219 }
4220 else if (bo == 1) {
4221 /* force BE */
4222 iorder[0] = 3;
4223 iorder[1] = 2;
4224 iorder[2] = 1;
4225 iorder[3] = 0;
4226 }
4227
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004228 /* On narrow builds we split characters outside the BMP into two
4229 codepoints => count how much extra space we need. */
4230#ifndef Py_UNICODE_WIDE
4231 for (qq = q; qq < e; qq += 4)
4232 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4233 pairs++;
4234#endif
4235
4236 /* This might be one to much, because of a BOM */
4237 unicode = _PyUnicode_New((size+3)/4+pairs);
4238 if (!unicode)
4239 return NULL;
4240 if (size == 0)
4241 return (PyObject *)unicode;
4242
4243 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004245
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 Py_UCS4 ch;
4248 /* remaining bytes at the end? (size should be divisible by 4) */
4249 if (e-q<4) {
4250 if (consumed)
4251 break;
4252 errmsg = "truncated data";
4253 startinpos = ((const char *)q)-starts;
4254 endinpos = ((const char *)e)-starts;
4255 goto utf32Error;
4256 /* The remaining input chars are ignored if the callback
4257 chooses to skip the input */
4258 }
4259 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4260 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004261
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 if (ch >= 0x110000)
4263 {
4264 errmsg = "codepoint not in range(0x110000)";
4265 startinpos = ((const char *)q)-starts;
4266 endinpos = startinpos+4;
4267 goto utf32Error;
4268 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004269#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 if (ch >= 0x10000)
4271 {
4272 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4273 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4274 }
4275 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004276#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 *p++ = ch;
4278 q += 4;
4279 continue;
4280 utf32Error:
4281 outpos = p-PyUnicode_AS_UNICODE(unicode);
4282 if (unicode_decode_call_errorhandler(
4283 errors, &errorHandler,
4284 "utf32", errmsg,
4285 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4286 &unicode, &outpos, &p))
4287 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004288 }
4289
4290 if (byteorder)
4291 *byteorder = bo;
4292
4293 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004295
4296 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004298 goto onError;
4299
4300 Py_XDECREF(errorHandler);
4301 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004302 if (PyUnicode_READY(unicode) == -1) {
4303 Py_DECREF(unicode);
4304 return NULL;
4305 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306 return (PyObject *)unicode;
4307
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004309 Py_DECREF(unicode);
4310 Py_XDECREF(errorHandler);
4311 Py_XDECREF(exc);
4312 return NULL;
4313}
4314
4315PyObject *
4316PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 Py_ssize_t size,
4318 const char *errors,
4319 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004320{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004321 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004322 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004323 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004324#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004325 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004326#else
4327 const int pairs = 0;
4328#endif
4329 /* Offsets from p for storing byte pairs in the right order. */
4330#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4331 int iorder[] = {0, 1, 2, 3};
4332#else
4333 int iorder[] = {3, 2, 1, 0};
4334#endif
4335
Benjamin Peterson29060642009-01-31 22:14:21 +00004336#define STORECHAR(CH) \
4337 do { \
4338 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4339 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4340 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4341 p[iorder[0]] = (CH) & 0xff; \
4342 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343 } while(0)
4344
4345 /* In narrow builds we can output surrogate pairs as one codepoint,
4346 so we need less space. */
4347#ifndef Py_UNICODE_WIDE
4348 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4350 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4351 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004352#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004353 nsize = (size - pairs + (byteorder == 0));
4354 bytesize = nsize * 4;
4355 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004357 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358 if (v == NULL)
4359 return NULL;
4360
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004361 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004362 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004364 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004365 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366
4367 if (byteorder == -1) {
4368 /* force LE */
4369 iorder[0] = 0;
4370 iorder[1] = 1;
4371 iorder[2] = 2;
4372 iorder[3] = 3;
4373 }
4374 else if (byteorder == 1) {
4375 /* force BE */
4376 iorder[0] = 3;
4377 iorder[1] = 2;
4378 iorder[2] = 1;
4379 iorder[3] = 0;
4380 }
4381
4382 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004384#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4386 Py_UCS4 ch2 = *s;
4387 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4388 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4389 s++;
4390 size--;
4391 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004392 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004393#endif
4394 STORECHAR(ch);
4395 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004396
4397 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004398 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004399#undef STORECHAR
4400}
4401
Alexander Belopolsky40018472011-02-26 01:02:56 +00004402PyObject *
4403PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004404{
4405 if (!PyUnicode_Check(unicode)) {
4406 PyErr_BadArgument();
4407 return NULL;
4408 }
4409 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 PyUnicode_GET_SIZE(unicode),
4411 NULL,
4412 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004413}
4414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415/* --- UTF-16 Codec ------------------------------------------------------- */
4416
Tim Peters772747b2001-08-09 22:21:55 +00004417PyObject *
4418PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_ssize_t size,
4420 const char *errors,
4421 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422{
Walter Dörwald69652032004-09-07 20:24:22 +00004423 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4424}
4425
Antoine Pitrouab868312009-01-10 15:40:25 +00004426/* Two masks for fast checking of whether a C 'long' may contain
4427 UTF16-encoded surrogate characters. This is an efficient heuristic,
4428 assuming that non-surrogate characters with a code point >= 0x8000 are
4429 rare in most input.
4430 FAST_CHAR_MASK is used when the input is in native byte ordering,
4431 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004432*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004433#if (SIZEOF_LONG == 8)
4434# define FAST_CHAR_MASK 0x8000800080008000L
4435# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4436#elif (SIZEOF_LONG == 4)
4437# define FAST_CHAR_MASK 0x80008000L
4438# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4439#else
4440# error C 'long' size should be either 4 or 8!
4441#endif
4442
Walter Dörwald69652032004-09-07 20:24:22 +00004443PyObject *
4444PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 Py_ssize_t size,
4446 const char *errors,
4447 int *byteorder,
4448 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004451 Py_ssize_t startinpos;
4452 Py_ssize_t endinpos;
4453 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 PyUnicodeObject *unicode;
4455 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004456 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004457 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004458 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004459 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004460 /* Offsets from q for retrieving byte pairs in the right order. */
4461#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4462 int ihi = 1, ilo = 0;
4463#else
4464 int ihi = 0, ilo = 1;
4465#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 PyObject *errorHandler = NULL;
4467 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468
4469 /* Note: size will always be longer than the resulting Unicode
4470 character count */
4471 unicode = _PyUnicode_New(size);
4472 if (!unicode)
4473 return NULL;
4474 if (size == 0)
4475 return (PyObject *)unicode;
4476
4477 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004479 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004480 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481
4482 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004483 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004485 /* Check for BOM marks (U+FEFF) in the input and adjust current
4486 byte order setting accordingly. In native mode, the leading BOM
4487 mark is skipped, in all other modes, it is copied to the output
4488 stream as-is (giving a ZWNBSP character). */
4489 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004490 if (size >= 2) {
4491 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004492#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 if (bom == 0xFEFF) {
4494 q += 2;
4495 bo = -1;
4496 }
4497 else if (bom == 0xFFFE) {
4498 q += 2;
4499 bo = 1;
4500 }
Tim Petersced69f82003-09-16 20:30:58 +00004501#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 if (bom == 0xFEFF) {
4503 q += 2;
4504 bo = 1;
4505 }
4506 else if (bom == 0xFFFE) {
4507 q += 2;
4508 bo = -1;
4509 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004510#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
Tim Peters772747b2001-08-09 22:21:55 +00004514 if (bo == -1) {
4515 /* force LE */
4516 ihi = 1;
4517 ilo = 0;
4518 }
4519 else if (bo == 1) {
4520 /* force BE */
4521 ihi = 0;
4522 ilo = 1;
4523 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004524#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4525 native_ordering = ilo < ihi;
4526#else
4527 native_ordering = ilo > ihi;
4528#endif
Tim Peters772747b2001-08-09 22:21:55 +00004529
Antoine Pitrouab868312009-01-10 15:40:25 +00004530 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004531 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004533 /* First check for possible aligned read of a C 'long'. Unaligned
4534 reads are more expensive, better to defer to another iteration. */
4535 if (!((size_t) q & LONG_PTR_MASK)) {
4536 /* Fast path for runs of non-surrogate chars. */
4537 register const unsigned char *_q = q;
4538 Py_UNICODE *_p = p;
4539 if (native_ordering) {
4540 /* Native ordering is simple: as long as the input cannot
4541 possibly contain a surrogate char, do an unrolled copy
4542 of several 16-bit code points to the target object.
4543 The non-surrogate check is done on several input bytes
4544 at a time (as many as a C 'long' can contain). */
4545 while (_q < aligned_end) {
4546 unsigned long data = * (unsigned long *) _q;
4547 if (data & FAST_CHAR_MASK)
4548 break;
4549 _p[0] = ((unsigned short *) _q)[0];
4550 _p[1] = ((unsigned short *) _q)[1];
4551#if (SIZEOF_LONG == 8)
4552 _p[2] = ((unsigned short *) _q)[2];
4553 _p[3] = ((unsigned short *) _q)[3];
4554#endif
4555 _q += SIZEOF_LONG;
4556 _p += SIZEOF_LONG / 2;
4557 }
4558 }
4559 else {
4560 /* Byteswapped ordering is similar, but we must decompose
4561 the copy bytewise, and take care of zero'ing out the
4562 upper bytes if the target object is in 32-bit units
4563 (that is, in UCS-4 builds). */
4564 while (_q < aligned_end) {
4565 unsigned long data = * (unsigned long *) _q;
4566 if (data & SWAPPED_FAST_CHAR_MASK)
4567 break;
4568 /* Zero upper bytes in UCS-4 builds */
4569#if (Py_UNICODE_SIZE > 2)
4570 _p[0] = 0;
4571 _p[1] = 0;
4572#if (SIZEOF_LONG == 8)
4573 _p[2] = 0;
4574 _p[3] = 0;
4575#endif
4576#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004577 /* Issue #4916; UCS-4 builds on big endian machines must
4578 fill the two last bytes of each 4-byte unit. */
4579#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4580# define OFF 2
4581#else
4582# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004583#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004584 ((unsigned char *) _p)[OFF + 1] = _q[0];
4585 ((unsigned char *) _p)[OFF + 0] = _q[1];
4586 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4587 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4588#if (SIZEOF_LONG == 8)
4589 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4590 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4591 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4592 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4593#endif
4594#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004595 _q += SIZEOF_LONG;
4596 _p += SIZEOF_LONG / 2;
4597 }
4598 }
4599 p = _p;
4600 q = _q;
4601 if (q >= e)
4602 break;
4603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605
Benjamin Peterson14339b62009-01-31 16:36:08 +00004606 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004607
4608 if (ch < 0xD800 || ch > 0xDFFF) {
4609 *p++ = ch;
4610 continue;
4611 }
4612
4613 /* UTF-16 code pair: */
4614 if (q > e) {
4615 errmsg = "unexpected end of data";
4616 startinpos = (((const char *)q) - 2) - starts;
4617 endinpos = ((const char *)e) + 1 - starts;
4618 goto utf16Error;
4619 }
4620 if (0xD800 <= ch && ch <= 0xDBFF) {
4621 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4622 q += 2;
4623 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004624#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 *p++ = ch;
4626 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004627#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004629#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 continue;
4631 }
4632 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004633 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 startinpos = (((const char *)q)-4)-starts;
4635 endinpos = startinpos+2;
4636 goto utf16Error;
4637 }
4638
Benjamin Peterson14339b62009-01-31 16:36:08 +00004639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 errmsg = "illegal encoding";
4641 startinpos = (((const char *)q)-2)-starts;
4642 endinpos = startinpos+2;
4643 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004644
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 utf16Error:
4646 outpos = p - PyUnicode_AS_UNICODE(unicode);
4647 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004648 errors,
4649 &errorHandler,
4650 "utf16", errmsg,
4651 &starts,
4652 (const char **)&e,
4653 &startinpos,
4654 &endinpos,
4655 &exc,
4656 (const char **)&q,
4657 &unicode,
4658 &outpos,
4659 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004662 /* remaining byte at the end? (size should be even) */
4663 if (e == q) {
4664 if (!consumed) {
4665 errmsg = "truncated data";
4666 startinpos = ((const char *)q) - starts;
4667 endinpos = ((const char *)e) + 1 - starts;
4668 outpos = p - PyUnicode_AS_UNICODE(unicode);
4669 if (unicode_decode_call_errorhandler(
4670 errors,
4671 &errorHandler,
4672 "utf16", errmsg,
4673 &starts,
4674 (const char **)&e,
4675 &startinpos,
4676 &endinpos,
4677 &exc,
4678 (const char **)&q,
4679 &unicode,
4680 &outpos,
4681 &p))
4682 goto onError;
4683 /* The remaining input chars are ignored if the callback
4684 chooses to skip the input */
4685 }
4686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687
4688 if (byteorder)
4689 *byteorder = bo;
4690
Walter Dörwald69652032004-09-07 20:24:22 +00004691 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004693
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004695 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 goto onError;
4697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 Py_XDECREF(errorHandler);
4699 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004700 if (PyUnicode_READY(unicode) == -1) {
4701 Py_DECREF(unicode);
4702 return NULL;
4703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 return (PyObject *)unicode;
4705
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 Py_XDECREF(errorHandler);
4709 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 return NULL;
4711}
4712
Antoine Pitrouab868312009-01-10 15:40:25 +00004713#undef FAST_CHAR_MASK
4714#undef SWAPPED_FAST_CHAR_MASK
4715
Tim Peters772747b2001-08-09 22:21:55 +00004716PyObject *
4717PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 Py_ssize_t size,
4719 const char *errors,
4720 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004722 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004723 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004724 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004725#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004726 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004727#else
4728 const int pairs = 0;
4729#endif
Tim Peters772747b2001-08-09 22:21:55 +00004730 /* Offsets from p for storing byte pairs in the right order. */
4731#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4732 int ihi = 1, ilo = 0;
4733#else
4734 int ihi = 0, ilo = 1;
4735#endif
4736
Benjamin Peterson29060642009-01-31 22:14:21 +00004737#define STORECHAR(CH) \
4738 do { \
4739 p[ihi] = ((CH) >> 8) & 0xff; \
4740 p[ilo] = (CH) & 0xff; \
4741 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004742 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004744#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004745 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 if (s[i] >= 0x10000)
4747 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004748#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004749 /* 2 * (size + pairs + (byteorder == 0)) */
4750 if (size > PY_SSIZE_T_MAX ||
4751 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004753 nsize = size + pairs + (byteorder == 0);
4754 bytesize = nsize * 2;
4755 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004757 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 if (v == NULL)
4759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004761 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004764 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004765 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004766
4767 if (byteorder == -1) {
4768 /* force LE */
4769 ihi = 1;
4770 ilo = 0;
4771 }
4772 else if (byteorder == 1) {
4773 /* force BE */
4774 ihi = 0;
4775 ilo = 1;
4776 }
4777
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004778 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 Py_UNICODE ch = *s++;
4780 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004781#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 if (ch >= 0x10000) {
4783 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4784 ch = 0xD800 | ((ch-0x10000) >> 10);
4785 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004786#endif
Tim Peters772747b2001-08-09 22:21:55 +00004787 STORECHAR(ch);
4788 if (ch2)
4789 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004790 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791
4792 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004794#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795}
4796
Alexander Belopolsky40018472011-02-26 01:02:56 +00004797PyObject *
4798PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799{
4800 if (!PyUnicode_Check(unicode)) {
4801 PyErr_BadArgument();
4802 return NULL;
4803 }
4804 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 PyUnicode_GET_SIZE(unicode),
4806 NULL,
4807 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808}
4809
4810/* --- Unicode Escape Codec ----------------------------------------------- */
4811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4813 if all the escapes in the string make it still a valid ASCII string.
4814 Returns -1 if any escapes were found which cause the string to
4815 pop out of ASCII range. Otherwise returns the length of the
4816 required buffer to hold the string.
4817 */
4818Py_ssize_t
4819length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4820{
4821 const unsigned char *p = (const unsigned char *)s;
4822 const unsigned char *end = p + size;
4823 Py_ssize_t length = 0;
4824
4825 if (size < 0)
4826 return -1;
4827
4828 for (; p < end; ++p) {
4829 if (*p > 127) {
4830 /* Non-ASCII */
4831 return -1;
4832 }
4833 else if (*p != '\\') {
4834 /* Normal character */
4835 ++length;
4836 }
4837 else {
4838 /* Backslash-escape, check next char */
4839 ++p;
4840 /* Escape sequence reaches till end of string or
4841 non-ASCII follow-up. */
4842 if (p >= end || *p > 127)
4843 return -1;
4844 switch (*p) {
4845 case '\n':
4846 /* backslash + \n result in zero characters */
4847 break;
4848 case '\\': case '\'': case '\"':
4849 case 'b': case 'f': case 't':
4850 case 'n': case 'r': case 'v': case 'a':
4851 ++length;
4852 break;
4853 case '0': case '1': case '2': case '3':
4854 case '4': case '5': case '6': case '7':
4855 case 'x': case 'u': case 'U': case 'N':
4856 /* these do not guarantee ASCII characters */
4857 return -1;
4858 default:
4859 /* count the backslash + the other character */
4860 length += 2;
4861 }
4862 }
4863 }
4864 return length;
4865}
4866
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is treated as an array of
   Py_UNICODE; for any other kind it is treated as an array of bytes and
   `value` is truncated to unsigned char.  Only one branch runs, so
   `index` and `value` are each evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` into the Py_UNICODE array `buf` at `index`.

   NOTE: unlike WRITE_ASCII_OR_WSTR this macro takes no `kind` argument;
   it asserts on a variable named `kind` that must be in scope at the
   point of use.  The expansion is fully parenthesized so that the comma
   expression stays a single unit wherever the macro is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4880
4881
Fredrik Lundh06d12682001-01-24 07:59:11 +00004882static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004883
Alexander Belopolsky40018472011-02-26 01:02:56 +00004884PyObject *
4885PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004886 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t startinpos;
4891 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004896 char* message;
4897 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004898 PyObject *errorHandler = NULL;
4899 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 Py_ssize_t ascii_length;
4901 Py_ssize_t i;
4902 int kind;
4903 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905 ascii_length = length_of_escaped_ascii_string(s, size);
4906
4907 /* After length_of_escaped_ascii_string() there are two alternatives,
4908 either the string is pure ASCII with named escapes like \n, etc.
4909 and we determined it's exact size (common case)
4910 or it contains \x, \u, ... escape sequences. then we create a
4911 legacy wchar string and resize it at the end of this function. */
4912 if (ascii_length >= 0) {
4913 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4914 if (!v)
4915 goto onError;
4916 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4917 kind = PyUnicode_1BYTE_KIND;
4918 data = PyUnicode_DATA(v);
4919 }
4920 else {
4921 /* Escaped strings will always be longer than the resulting
4922 Unicode string, so we start with size here and then reduce the
4923 length after conversion to the true value.
4924 (but if the error callback returns a long replacement string
4925 we'll have to allocate more space) */
4926 v = _PyUnicode_New(size);
4927 if (!v)
4928 goto onError;
4929 kind = PyUnicode_WCHAR_KIND;
4930 data = PyUnicode_AS_UNICODE(v);
4931 }
4932
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 if (size == 0)
4934 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004935 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 while (s < end) {
4939 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004940 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004943 if (kind == PyUnicode_WCHAR_KIND) {
4944 assert(i < _PyUnicode_WSTR_LENGTH(v));
4945 }
4946 else {
4947 /* The only case in which i == ascii_length is a backslash
4948 followed by a newline. */
4949 assert(i <= ascii_length);
4950 }
4951
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 /* Non-escape characters are interpreted as Unicode ordinals */
4953 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004954 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 continue;
4956 }
4957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 /* \ - Escapes */
4960 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004961 c = *s++;
4962 if (s > end)
4963 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004964
4965 if (kind == PyUnicode_WCHAR_KIND) {
4966 assert(i < _PyUnicode_WSTR_LENGTH(v));
4967 }
4968 else {
4969 /* The only case in which i == ascii_length is a backslash
4970 followed by a newline. */
4971 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4972 }
4973
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004974 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004978 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4979 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4980 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4981 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4982 /* FF */
4983 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4984 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4985 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4986 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4987 /* VT */
4988 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4989 /* BEL, not classic C */
4990 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 case '0': case '1': case '2': case '3':
4994 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004995 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004996 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004997 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004998 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00004999 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005001 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 break;
5003
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 /* hex escapes */
5005 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005007 digits = 2;
5008 message = "truncated \\xXX escape";
5009 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005013 digits = 4;
5014 message = "truncated \\uXXXX escape";
5015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005018 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005019 digits = 8;
5020 message = "truncated \\UXXXXXXXX escape";
5021 hexescape:
5022 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005023 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 if (s+digits>end) {
5025 endinpos = size;
5026 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 errors, &errorHandler,
5028 "unicodeescape", "end of string in escape sequence",
5029 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005031 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 goto nextByte;
5034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005035 for (j = 0; j < digits; ++j) {
5036 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005037 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005038 endinpos = (s+j+1)-starts;
5039 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 errors, &errorHandler,
5042 "unicodeescape", message,
5043 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005045 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005046 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005048 }
5049 chr = (chr<<4) & ~0xF;
5050 if (c >= '0' && c <= '9')
5051 chr += c - '0';
5052 else if (c >= 'a' && c <= 'f')
5053 chr += 10 + c - 'a';
5054 else
5055 chr += 10 + c - 'A';
5056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005057 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005058 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 /* _decoding_error will have already written into the
5060 target buffer. */
5061 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005062 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005063 /* when we get here, chr is a 32-bit unicode character */
5064 if (chr <= 0xffff)
5065 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005067 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005068 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005069 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005070#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005072#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005073 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005074 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5075 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005076#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005077 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 errors, &errorHandler,
5082 "unicodeescape", "illegal Unicode character",
5083 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005084 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005085 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005086 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005087 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005088 break;
5089
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005091 case 'N':
5092 message = "malformed \\N character escape";
5093 if (ucnhash_CAPI == NULL) {
5094 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005095 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5096 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005097 if (ucnhash_CAPI == NULL)
5098 goto ucnhashError;
5099 }
5100 if (*s == '{') {
5101 const char *start = s+1;
5102 /* look for the closing brace */
5103 while (*s != '}' && s < end)
5104 s++;
5105 if (s > start && s < end && *s == '}') {
5106 /* found a name. look it up in the unicode database */
5107 message = "unknown Unicode character name";
5108 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5110 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005111 goto store;
5112 }
5113 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 errors, &errorHandler,
5118 "unicodeescape", message,
5119 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005120 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005121 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005122 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005123 break;
5124
5125 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005126 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005127 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 message = "\\ at end of string";
5129 s--;
5130 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005131 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 errors, &errorHandler,
5134 "unicodeescape", message,
5135 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005137 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005138 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005139 }
5140 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5142 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005143 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005144 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005149 /* Ensure the length prediction worked in case of ASCII strings */
5150 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5151
5152 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5153 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005154 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005155 Py_XDECREF(errorHandler);
5156 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005158
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005160 PyErr_SetString(
5161 PyExc_UnicodeError,
5162 "\\N escapes not supported (can't load unicodedata module)"
5163 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 Py_XDECREF(errorHandler);
5166 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005167 return NULL;
5168
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 Py_XDECREF(errorHandler);
5172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 return NULL;
5174}
5175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005176#undef WRITE_ASCII_OR_WSTR
5177#undef WRITE_WSTR
5178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179/* Return a Unicode-Escape string version of the Unicode object.
5180
5181 If quotes is true, the string is enclosed in u"" or u'' quotes as
5182 appropriate.
5183
5184*/
5185
Walter Dörwald79e913e2007-05-12 11:08:06 +00005186static const char *hexdigits = "0123456789abcdef";
5187
Alexander Belopolsky40018472011-02-26 01:02:56 +00005188PyObject *
5189PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005192 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005195#ifdef Py_UNICODE_WIDE
5196 const Py_ssize_t expandsize = 10;
5197#else
5198 const Py_ssize_t expandsize = 6;
5199#endif
5200
Thomas Wouters89f507f2006-12-13 04:49:30 +00005201 /* XXX(nnorwitz): rather than over-allocating, it would be
5202 better to choose a different scheme. Perhaps scan the
5203 first N-chars of the string and allocate based on that size.
5204 */
5205 /* Initial allocation is based on the longest-possible unichr
5206 escape.
5207
5208 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5209 unichr, so in this case it's the longest unichr escape. In
5210 narrow (UTF-16) builds this is five chars per source unichr
5211 since there are two unichrs in the surrogate pair, so in narrow
5212 (UTF-16) builds it's not the longest unichr escape.
5213
5214 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5215 so in the narrow (UTF-16) build case it's the longest unichr
5216 escape.
5217 */
5218
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005219 if (size == 0)
5220 return PyBytes_FromStringAndSize(NULL, 0);
5221
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005222 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005224
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005225 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 2
5227 + expandsize*size
5228 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 if (repr == NULL)
5230 return NULL;
5231
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005232 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 while (size-- > 0) {
5235 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005236
Walter Dörwald79e913e2007-05-12 11:08:06 +00005237 /* Escape backslashes */
5238 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 *p++ = '\\';
5240 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005241 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005242 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005243
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005244#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005245 /* Map 21-bit characters to '\U00xxxxxx' */
5246 else if (ch >= 0x10000) {
5247 *p++ = '\\';
5248 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005249 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5250 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5251 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5252 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5253 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5254 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5255 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5256 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005258 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005259#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5261 else if (ch >= 0xD800 && ch < 0xDC00) {
5262 Py_UNICODE ch2;
5263 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005264
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 ch2 = *s++;
5266 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005267 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5269 *p++ = '\\';
5270 *p++ = 'U';
5271 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5272 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5273 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5274 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5275 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5276 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5277 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5278 *p++ = hexdigits[ucs & 0x0000000F];
5279 continue;
5280 }
5281 /* Fall through: isolated surrogates are copied as-is */
5282 s--;
5283 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005284 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005285#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005286
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005288 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 *p++ = '\\';
5290 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005291 *p++ = hexdigits[(ch >> 12) & 0x000F];
5292 *p++ = hexdigits[(ch >> 8) & 0x000F];
5293 *p++ = hexdigits[(ch >> 4) & 0x000F];
5294 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005296
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005297 /* Map special whitespace to '\t', \n', '\r' */
5298 else if (ch == '\t') {
5299 *p++ = '\\';
5300 *p++ = 't';
5301 }
5302 else if (ch == '\n') {
5303 *p++ = '\\';
5304 *p++ = 'n';
5305 }
5306 else if (ch == '\r') {
5307 *p++ = '\\';
5308 *p++ = 'r';
5309 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005310
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005311 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005312 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005314 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005315 *p++ = hexdigits[(ch >> 4) & 0x000F];
5316 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005317 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005318
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 /* Copy everything else as-is */
5320 else
5321 *p++ = (char) ch;
5322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005324 assert(p - PyBytes_AS_STRING(repr) > 0);
5325 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5326 return NULL;
5327 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328}
5329
Alexander Belopolsky40018472011-02-26 01:02:56 +00005330PyObject *
5331PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005333 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 if (!PyUnicode_Check(unicode)) {
5335 PyErr_BadArgument();
5336 return NULL;
5337 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005338 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5339 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005340 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341}
5342
5343/* --- Raw Unicode Escape Codec ------------------------------------------- */
5344
Alexander Belopolsky40018472011-02-26 01:02:56 +00005345PyObject *
5346PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005347 Py_ssize_t size,
5348 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005351 Py_ssize_t startinpos;
5352 Py_ssize_t endinpos;
5353 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 const char *end;
5357 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 PyObject *errorHandler = NULL;
5359 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 /* Escaped strings will always be longer than the resulting
5362 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 length after conversion to the true value. (But decoding error
5364 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 v = _PyUnicode_New(size);
5366 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 end = s + size;
5372 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 unsigned char c;
5374 Py_UCS4 x;
5375 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005376 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 /* Non-escape characters are interpreted as Unicode ordinals */
5379 if (*s != '\\') {
5380 *p++ = (unsigned char)*s++;
5381 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 startinpos = s-starts;
5384
5385 /* \u-escapes are only interpreted iff the number of leading
5386 backslashes if odd */
5387 bs = s;
5388 for (;s < end;) {
5389 if (*s != '\\')
5390 break;
5391 *p++ = (unsigned char)*s++;
5392 }
5393 if (((s - bs) & 1) == 0 ||
5394 s >= end ||
5395 (*s != 'u' && *s != 'U')) {
5396 continue;
5397 }
5398 p--;
5399 count = *s=='u' ? 4 : 8;
5400 s++;
5401
5402 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5403 outpos = p-PyUnicode_AS_UNICODE(v);
5404 for (x = 0, i = 0; i < count; ++i, ++s) {
5405 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005406 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 endinpos = s-starts;
5408 if (unicode_decode_call_errorhandler(
5409 errors, &errorHandler,
5410 "rawunicodeescape", "truncated \\uXXXX",
5411 &starts, &end, &startinpos, &endinpos, &exc, &s,
5412 &v, &outpos, &p))
5413 goto onError;
5414 goto nextByte;
5415 }
5416 x = (x<<4) & ~0xF;
5417 if (c >= '0' && c <= '9')
5418 x += c - '0';
5419 else if (c >= 'a' && c <= 'f')
5420 x += 10 + c - 'a';
5421 else
5422 x += 10 + c - 'A';
5423 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005424 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 /* UCS-2 character */
5426 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005427 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 /* UCS-4 character. Either store directly, or as
5429 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005430#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005432#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 x -= 0x10000L;
5434 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5435 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005436#endif
5437 } else {
5438 endinpos = s-starts;
5439 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005440 if (unicode_decode_call_errorhandler(
5441 errors, &errorHandler,
5442 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 &starts, &end, &startinpos, &endinpos, &exc, &s,
5444 &v, &outpos, &p))
5445 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005446 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 nextByte:
5448 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005450 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 Py_XDECREF(errorHandler);
5453 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005454 if (PyUnicode_READY(v) == -1) {
5455 Py_DECREF(v);
5456 return NULL;
5457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 Py_XDECREF(errorHandler);
5463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 return NULL;
5465}
5466
Alexander Belopolsky40018472011-02-26 01:02:56 +00005467PyObject *
5468PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005469 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005471 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 char *p;
5473 char *q;
5474
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005475#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005477#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005479#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005480
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005481 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005483
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005484 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 if (repr == NULL)
5486 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005487 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005490 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 while (size-- > 0) {
5492 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005493#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 /* Map 32-bit characters to '\Uxxxxxxxx' */
5495 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005496 *p++ = '\\';
5497 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005498 *p++ = hexdigits[(ch >> 28) & 0xf];
5499 *p++ = hexdigits[(ch >> 24) & 0xf];
5500 *p++ = hexdigits[(ch >> 20) & 0xf];
5501 *p++ = hexdigits[(ch >> 16) & 0xf];
5502 *p++ = hexdigits[(ch >> 12) & 0xf];
5503 *p++ = hexdigits[(ch >> 8) & 0xf];
5504 *p++ = hexdigits[(ch >> 4) & 0xf];
5505 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005506 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005507 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005508#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5510 if (ch >= 0xD800 && ch < 0xDC00) {
5511 Py_UNICODE ch2;
5512 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005513
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 ch2 = *s++;
5515 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005516 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5518 *p++ = '\\';
5519 *p++ = 'U';
5520 *p++ = hexdigits[(ucs >> 28) & 0xf];
5521 *p++ = hexdigits[(ucs >> 24) & 0xf];
5522 *p++ = hexdigits[(ucs >> 20) & 0xf];
5523 *p++ = hexdigits[(ucs >> 16) & 0xf];
5524 *p++ = hexdigits[(ucs >> 12) & 0xf];
5525 *p++ = hexdigits[(ucs >> 8) & 0xf];
5526 *p++ = hexdigits[(ucs >> 4) & 0xf];
5527 *p++ = hexdigits[ucs & 0xf];
5528 continue;
5529 }
5530 /* Fall through: isolated surrogates are copied as-is */
5531 s--;
5532 size++;
5533 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005534#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 /* Map 16-bit characters to '\uxxxx' */
5536 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 *p++ = '\\';
5538 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005539 *p++ = hexdigits[(ch >> 12) & 0xf];
5540 *p++ = hexdigits[(ch >> 8) & 0xf];
5541 *p++ = hexdigits[(ch >> 4) & 0xf];
5542 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 /* Copy everything else as-is */
5545 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 *p++ = (char) ch;
5547 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005548 size = p - q;
5549
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005550 assert(size > 0);
5551 if (_PyBytes_Resize(&repr, size) < 0)
5552 return NULL;
5553 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554}
5555
Alexander Belopolsky40018472011-02-26 01:02:56 +00005556PyObject *
5557PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005559 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005561 PyErr_BadArgument();
5562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005564 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5565 PyUnicode_GET_SIZE(unicode));
5566
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005567 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568}
5569
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005570/* --- Unicode Internal Codec ------------------------------------------- */
5571
Alexander Belopolsky40018472011-02-26 01:02:56 +00005572PyObject *
5573_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005574 Py_ssize_t size,
5575 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005576{
5577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005578 Py_ssize_t startinpos;
5579 Py_ssize_t endinpos;
5580 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005581 PyUnicodeObject *v;
5582 Py_UNICODE *p;
5583 const char *end;
5584 const char *reason;
5585 PyObject *errorHandler = NULL;
5586 PyObject *exc = NULL;
5587
Neal Norwitzd43069c2006-01-08 01:12:10 +00005588#ifdef Py_UNICODE_WIDE
5589 Py_UNICODE unimax = PyUnicode_GetMax();
5590#endif
5591
Thomas Wouters89f507f2006-12-13 04:49:30 +00005592 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5594 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5597 as string was created with the old API. */
5598 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005600 p = PyUnicode_AS_UNICODE(v);
5601 end = s + size;
5602
5603 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005604 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005605 /* We have to sanity check the raw data, otherwise doom looms for
5606 some malformed UCS-4 data. */
5607 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005608#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005609 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005610#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611 end-s < Py_UNICODE_SIZE
5612 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005614 startinpos = s - starts;
5615 if (end-s < Py_UNICODE_SIZE) {
5616 endinpos = end-starts;
5617 reason = "truncated input";
5618 }
5619 else {
5620 endinpos = s - starts + Py_UNICODE_SIZE;
5621 reason = "illegal code point (> 0x10FFFF)";
5622 }
5623 outpos = p - PyUnicode_AS_UNICODE(v);
5624 if (unicode_decode_call_errorhandler(
5625 errors, &errorHandler,
5626 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005627 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005628 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005629 goto onError;
5630 }
5631 }
5632 else {
5633 p++;
5634 s += Py_UNICODE_SIZE;
5635 }
5636 }
5637
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005638 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005639 goto onError;
5640 Py_XDECREF(errorHandler);
5641 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 if (PyUnicode_READY(v) == -1) {
5643 Py_DECREF(v);
5644 return NULL;
5645 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005646 return (PyObject *)v;
5647
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005649 Py_XDECREF(v);
5650 Py_XDECREF(errorHandler);
5651 Py_XDECREF(exc);
5652 return NULL;
5653}
5654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655/* --- Latin-1 Codec ------------------------------------------------------ */
5656
Alexander Belopolsky40018472011-02-26 01:02:56 +00005657PyObject *
5658PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005659 Py_ssize_t size,
5660 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005663 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664}
5665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005667static void
5668make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005669 const char *encoding,
5670 const Py_UNICODE *unicode, Py_ssize_t size,
5671 Py_ssize_t startpos, Py_ssize_t endpos,
5672 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 *exceptionObject = PyUnicodeEncodeError_Create(
5676 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
5678 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5680 goto onError;
5681 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5682 goto onError;
5683 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5684 goto onError;
5685 return;
5686 onError:
5687 Py_DECREF(*exceptionObject);
5688 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 }
5690}
5691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005693static void
5694raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005695 const char *encoding,
5696 const Py_UNICODE *unicode, Py_ssize_t size,
5697 Py_ssize_t startpos, Py_ssize_t endpos,
5698 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699{
5700 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704}
5705
5706/* error handling callback helper:
5707 build arguments, call the callback and check the arguments,
5708 put the result into newpos and return the replacement string, which
5709 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005710static PyObject *
5711unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005712 PyObject **errorHandler,
5713 const char *encoding, const char *reason,
5714 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5715 Py_ssize_t startpos, Py_ssize_t endpos,
5716 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005717{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005718 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719
5720 PyObject *restuple;
5721 PyObject *resunicode;
5722
5723 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 }
5728
5729 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733
5734 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005739 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 Py_DECREF(restuple);
5741 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005743 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 &resunicode, newpos)) {
5745 Py_DECREF(restuple);
5746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005748 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5749 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5750 Py_DECREF(restuple);
5751 return NULL;
5752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005755 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5757 Py_DECREF(restuple);
5758 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 Py_INCREF(resunicode);
5761 Py_DECREF(restuple);
5762 return resunicode;
5763}
5764
Alexander Belopolsky40018472011-02-26 01:02:56 +00005765static PyObject *
5766unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005767 Py_ssize_t size,
5768 const char *errors,
5769 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770{
5771 /* output object */
5772 PyObject *res;
5773 /* pointers to the beginning and end+1 of input */
5774 const Py_UNICODE *startp = p;
5775 const Py_UNICODE *endp = p + size;
5776 /* pointer to the beginning of the unencodable characters */
5777 /* const Py_UNICODE *badp = NULL; */
5778 /* pointer into the output */
5779 char *str;
5780 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005781 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005782 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5783 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 PyObject *errorHandler = NULL;
5785 PyObject *exc = NULL;
5786 /* the following variable is used for caching string comparisons
5787 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5788 int known_errorHandler = -1;
5789
5790 /* allocate enough for a simple encoding without
5791 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005792 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005793 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005796 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005797 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 ressize = size;
5799
5800 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 /* can we encode this? */
5804 if (c<limit) {
5805 /* no overflow check, because we know that the space is enough */
5806 *str++ = (char)c;
5807 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005808 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 else {
5810 Py_ssize_t unicodepos = p-startp;
5811 Py_ssize_t requiredsize;
5812 PyObject *repunicode;
5813 Py_ssize_t repsize;
5814 Py_ssize_t newpos;
5815 Py_ssize_t respos;
5816 Py_UNICODE *uni2;
5817 /* startpos for collecting unencodable chars */
5818 const Py_UNICODE *collstart = p;
5819 const Py_UNICODE *collend = p;
5820 /* find all unecodable characters */
5821 while ((collend < endp) && ((*collend)>=limit))
5822 ++collend;
5823 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5824 if (known_errorHandler==-1) {
5825 if ((errors==NULL) || (!strcmp(errors, "strict")))
5826 known_errorHandler = 1;
5827 else if (!strcmp(errors, "replace"))
5828 known_errorHandler = 2;
5829 else if (!strcmp(errors, "ignore"))
5830 known_errorHandler = 3;
5831 else if (!strcmp(errors, "xmlcharrefreplace"))
5832 known_errorHandler = 4;
5833 else
5834 known_errorHandler = 0;
5835 }
5836 switch (known_errorHandler) {
5837 case 1: /* strict */
5838 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5839 goto onError;
5840 case 2: /* replace */
5841 while (collstart++<collend)
5842 *str++ = '?'; /* fall through */
5843 case 3: /* ignore */
5844 p = collend;
5845 break;
5846 case 4: /* xmlcharrefreplace */
5847 respos = str - PyBytes_AS_STRING(res);
5848 /* determine replacement size (temporarily (mis)uses p) */
5849 for (p = collstart, repsize = 0; p < collend; ++p) {
5850 if (*p<10)
5851 repsize += 2+1+1;
5852 else if (*p<100)
5853 repsize += 2+2+1;
5854 else if (*p<1000)
5855 repsize += 2+3+1;
5856 else if (*p<10000)
5857 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005858#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 else
5860 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005861#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 else if (*p<100000)
5863 repsize += 2+5+1;
5864 else if (*p<1000000)
5865 repsize += 2+6+1;
5866 else
5867 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005868#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 }
5870 requiredsize = respos+repsize+(endp-collend);
5871 if (requiredsize > ressize) {
5872 if (requiredsize<2*ressize)
5873 requiredsize = 2*ressize;
5874 if (_PyBytes_Resize(&res, requiredsize))
5875 goto onError;
5876 str = PyBytes_AS_STRING(res) + respos;
5877 ressize = requiredsize;
5878 }
5879 /* generate replacement (temporarily (mis)uses p) */
5880 for (p = collstart; p < collend; ++p) {
5881 str += sprintf(str, "&#%d;", (int)*p);
5882 }
5883 p = collend;
5884 break;
5885 default:
5886 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5887 encoding, reason, startp, size, &exc,
5888 collstart-startp, collend-startp, &newpos);
5889 if (repunicode == NULL)
5890 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005891 if (PyBytes_Check(repunicode)) {
5892 /* Directly copy bytes result to output. */
5893 repsize = PyBytes_Size(repunicode);
5894 if (repsize > 1) {
5895 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005896 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005897 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5898 Py_DECREF(repunicode);
5899 goto onError;
5900 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005901 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005902 ressize += repsize-1;
5903 }
5904 memcpy(str, PyBytes_AsString(repunicode), repsize);
5905 str += repsize;
5906 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005907 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005908 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 /* need more space? (at least enough for what we
5911 have+the replacement+the rest of the string, so
5912 we won't have to check space for encodable characters) */
5913 respos = str - PyBytes_AS_STRING(res);
5914 repsize = PyUnicode_GET_SIZE(repunicode);
5915 requiredsize = respos+repsize+(endp-collend);
5916 if (requiredsize > ressize) {
5917 if (requiredsize<2*ressize)
5918 requiredsize = 2*ressize;
5919 if (_PyBytes_Resize(&res, requiredsize)) {
5920 Py_DECREF(repunicode);
5921 goto onError;
5922 }
5923 str = PyBytes_AS_STRING(res) + respos;
5924 ressize = requiredsize;
5925 }
5926 /* check if there is anything unencodable in the replacement
5927 and copy it to the output */
5928 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5929 c = *uni2;
5930 if (c >= limit) {
5931 raise_encode_exception(&exc, encoding, startp, size,
5932 unicodepos, unicodepos+1, reason);
5933 Py_DECREF(repunicode);
5934 goto onError;
5935 }
5936 *str = (char)c;
5937 }
5938 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005939 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005941 }
5942 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005943 /* Resize if we allocated to much */
5944 size = str - PyBytes_AS_STRING(res);
5945 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005946 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 if (_PyBytes_Resize(&res, size) < 0)
5948 goto onError;
5949 }
5950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 Py_XDECREF(errorHandler);
5952 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 return res;
5954
5955 onError:
5956 Py_XDECREF(res);
5957 Py_XDECREF(errorHandler);
5958 Py_XDECREF(exc);
5959 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960}
5961
Alexander Belopolsky40018472011-02-26 01:02:56 +00005962PyObject *
5963PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005964 Py_ssize_t size,
5965 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968}
5969
Alexander Belopolsky40018472011-02-26 01:02:56 +00005970PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005971_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972{
5973 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 PyErr_BadArgument();
5975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977 if (PyUnicode_READY(unicode) == -1)
5978 return NULL;
5979 /* Fast path: if it is a one-byte string, construct
5980 bytes object directly. */
5981 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5982 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5983 PyUnicode_GET_LENGTH(unicode));
5984 /* Non-Latin-1 characters present. Defer to above function to
5985 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 errors);
5989}
5990
5991PyObject*
5992PyUnicode_AsLatin1String(PyObject *unicode)
5993{
5994 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995}
5996
5997/* --- 7-bit ASCII Codec -------------------------------------------------- */
5998
Alexander Belopolsky40018472011-02-26 01:02:56 +00005999PyObject *
6000PyUnicode_DecodeASCII(const char *s,
6001 Py_ssize_t size,
6002 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 PyUnicodeObject *v;
6006 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 Py_ssize_t startinpos;
6008 Py_ssize_t endinpos;
6009 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 PyObject *errorHandler = NULL;
6013 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006015
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006017 if (size == 1 && *(unsigned char*)s < 128)
6018 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6019
6020 /* Fast path. Assume the input actually *is* ASCII, and allocate
6021 a single-block Unicode object with that assumption. If there is
6022 an error, drop the object and start over. */
6023 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6024 if (v == NULL)
6025 goto onError;
6026 d = PyUnicode_1BYTE_DATA(v);
6027 for (i = 0; i < size; i++) {
6028 unsigned char ch = ((unsigned char*)s)[i];
6029 if (ch < 128)
6030 d[i] = ch;
6031 else
6032 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006034 if (i == size)
6035 return (PyObject*)v;
6036 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 v = _PyUnicode_New(size);
6039 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 e = s + size;
6045 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 register unsigned char c = (unsigned char)*s;
6047 if (c < 128) {
6048 *p++ = c;
6049 ++s;
6050 }
6051 else {
6052 startinpos = s-starts;
6053 endinpos = startinpos + 1;
6054 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6055 if (unicode_decode_call_errorhandler(
6056 errors, &errorHandler,
6057 "ascii", "ordinal not in range(128)",
6058 &starts, &e, &startinpos, &endinpos, &exc, &s,
6059 &v, &outpos, &p))
6060 goto onError;
6061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006063 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6065 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 Py_XDECREF(errorHandler);
6067 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006068 if (PyUnicode_READY(v) == -1) {
6069 Py_DECREF(v);
6070 return NULL;
6071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006073
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 Py_XDECREF(errorHandler);
6077 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return NULL;
6079}
6080
Alexander Belopolsky40018472011-02-26 01:02:56 +00006081PyObject *
6082PyUnicode_EncodeASCII(const Py_UNICODE *p,
6083 Py_ssize_t size,
6084 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087}
6088
Alexander Belopolsky40018472011-02-26 01:02:56 +00006089PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
6092 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 PyErr_BadArgument();
6094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096 if (PyUnicode_READY(unicode) == -1)
6097 return NULL;
6098 /* Fast path: if it is an ASCII-only string, construct bytes object
6099 directly. Else defer to above function to raise the exception. */
6100 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6101 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6102 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006105 errors);
6106}
6107
6108PyObject *
6109PyUnicode_AsASCIIString(PyObject *unicode)
6110{
6111 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112}
6113
Victor Stinner99b95382011-07-04 14:23:54 +02006114#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006115
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006116/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006117
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006118#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006119#define NEED_RETRY
6120#endif
6121
6122/* XXX This code is limited to "true" double-byte encodings, as
6123 a) it assumes an incomplete character consists of a single byte, and
6124 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006126
/* Test the byte at s[offset].

   Returns 0 when IsDBCSLeadByte() rejects that byte.  Otherwise steps back
   one character with CharPrev() and returns 1 when any of these holds:
     - CharPrev() did not move (returned the same position),
     - the byte it stopped on is not itself a lead byte,
     - it moved back by exactly two bytes;
   and 0 in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *cur = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*cur))
        return 0;

    before = CharPrev(s, cur);
    if (before == cur)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (cur - before == 2);
}
6138
6139/*
6140 * Decode MBCS string into unicode object. If 'final' is set, converts
6141 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6142 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006143static int
6144decode_mbcs(PyUnicodeObject **v,
6145 const char *s, /* MBCS string */
6146 int size, /* sizeof MBCS string */
6147 int final,
6148 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006149{
6150 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006151 Py_ssize_t n;
6152 DWORD usize;
6153 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006154
6155 assert(size >= 0);
6156
Victor Stinner554f3f02010-06-16 23:33:54 +00006157 /* check and handle 'errors' arg */
6158 if (errors==NULL || strcmp(errors, "strict")==0)
6159 flags = MB_ERR_INVALID_CHARS;
6160 else if (strcmp(errors, "ignore")==0)
6161 flags = 0;
6162 else {
6163 PyErr_Format(PyExc_ValueError,
6164 "mbcs encoding does not support errors='%s'",
6165 errors);
6166 return -1;
6167 }
6168
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006169 /* Skip trailing lead-byte unless 'final' is set */
6170 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006172
6173 /* First get the size of the result */
6174 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006175 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6176 if (usize==0)
6177 goto mbcs_decode_error;
6178 } else
6179 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006180
6181 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* Create unicode object */
6183 *v = _PyUnicode_New(usize);
6184 if (*v == NULL)
6185 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006186 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006187 }
6188 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Extend unicode object */
6190 n = PyUnicode_GET_SIZE(*v);
6191 if (_PyUnicode_Resize(v, n + usize) < 0)
6192 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006193 }
6194
6195 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006196 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006198 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6199 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006201 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006202 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006203
6204mbcs_decode_error:
6205 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6206 we raise a UnicodeDecodeError - else it is a 'generic'
6207 windows error
6208 */
6209 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6210 /* Ideally, we should get reason from FormatMessage - this
6211 is the Windows 2000 English version of the message
6212 */
6213 PyObject *exc = NULL;
6214 const char *reason = "No mapping for the Unicode character exists "
6215 "in the target multi-byte code page.";
6216 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6217 if (exc != NULL) {
6218 PyCodec_StrictErrors(exc);
6219 Py_DECREF(exc);
6220 }
6221 } else {
6222 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6223 }
6224 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006225}
6226
Alexander Belopolsky40018472011-02-26 01:02:56 +00006227PyObject *
6228PyUnicode_DecodeMBCSStateful(const char *s,
6229 Py_ssize_t size,
6230 const char *errors,
6231 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006232{
6233 PyUnicodeObject *v = NULL;
6234 int done;
6235
6236 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006238
6239#ifdef NEED_RETRY
6240 retry:
6241 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006242 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006243 else
6244#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006245 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006246
6247 if (done < 0) {
6248 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006250 }
6251
6252 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006254
6255#ifdef NEED_RETRY
6256 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 s += done;
6258 size -= done;
6259 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006260 }
6261#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006262 if (PyUnicode_READY(v) == -1) {
6263 Py_DECREF(v);
6264 return NULL;
6265 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006266 return (PyObject *)v;
6267}
6268
Alexander Belopolsky40018472011-02-26 01:02:56 +00006269PyObject *
6270PyUnicode_DecodeMBCS(const char *s,
6271 Py_ssize_t size,
6272 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006273{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6275}
6276
6277/*
6278 * Convert unicode into string object (MBCS).
6279 * Returns 0 if succeed, -1 otherwise.
6280 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006281static int
6282encode_mbcs(PyObject **repr,
6283 const Py_UNICODE *p, /* unicode */
6284 int size, /* size of unicode */
6285 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006286{
Victor Stinner554f3f02010-06-16 23:33:54 +00006287 BOOL usedDefaultChar = FALSE;
6288 BOOL *pusedDefaultChar;
6289 int mbcssize;
6290 Py_ssize_t n;
6291 PyObject *exc = NULL;
6292 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006293
6294 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006295
Victor Stinner554f3f02010-06-16 23:33:54 +00006296 /* check and handle 'errors' arg */
6297 if (errors==NULL || strcmp(errors, "strict")==0) {
6298 flags = WC_NO_BEST_FIT_CHARS;
6299 pusedDefaultChar = &usedDefaultChar;
6300 } else if (strcmp(errors, "replace")==0) {
6301 flags = 0;
6302 pusedDefaultChar = NULL;
6303 } else {
6304 PyErr_Format(PyExc_ValueError,
6305 "mbcs encoding does not support errors='%s'",
6306 errors);
6307 return -1;
6308 }
6309
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006310 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006311 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006312 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6313 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 if (mbcssize == 0) {
6315 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6316 return -1;
6317 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006318 /* If we used a default char, then we failed! */
6319 if (pusedDefaultChar && *pusedDefaultChar)
6320 goto mbcs_encode_error;
6321 } else {
6322 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006323 }
6324
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006325 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Create string object */
6327 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6328 if (*repr == NULL)
6329 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006330 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006331 }
6332 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 /* Extend string object */
6334 n = PyBytes_Size(*repr);
6335 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6336 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006337 }
6338
6339 /* Do the conversion */
6340 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006342 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6343 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6345 return -1;
6346 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006347 if (pusedDefaultChar && *pusedDefaultChar)
6348 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006349 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006350 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006351
6352mbcs_encode_error:
6353 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6354 Py_XDECREF(exc);
6355 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006356}
6357
Alexander Belopolsky40018472011-02-26 01:02:56 +00006358PyObject *
6359PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6360 Py_ssize_t size,
6361 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006362{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363 PyObject *repr = NULL;
6364 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006365
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006366#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006369 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006370 else
6371#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006372 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006373
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006374 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 Py_XDECREF(repr);
6376 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006377 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006378
6379#ifdef NEED_RETRY
6380 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 p += INT_MAX;
6382 size -= INT_MAX;
6383 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384 }
6385#endif
6386
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006387 return repr;
6388}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006389
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390PyObject *
6391PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006392{
6393 if (!PyUnicode_Check(unicode)) {
6394 PyErr_BadArgument();
6395 return NULL;
6396 }
6397 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 PyUnicode_GET_SIZE(unicode),
6399 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006400}
6401
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006402#undef NEED_RETRY
6403
Victor Stinner99b95382011-07-04 14:23:54 +02006404#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006405
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406/* --- Character Mapping Codec -------------------------------------------- */
6407
Alexander Belopolsky40018472011-02-26 01:02:56 +00006408PyObject *
6409PyUnicode_DecodeCharmap(const char *s,
6410 Py_ssize_t size,
6411 PyObject *mapping,
6412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006415 Py_ssize_t startinpos;
6416 Py_ssize_t endinpos;
6417 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 PyUnicodeObject *v;
6420 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006421 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 PyObject *errorHandler = NULL;
6423 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006424 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006425 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006426
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 /* Default to Latin-1 */
6428 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
6431 v = _PyUnicode_New(size);
6432 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006438 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 mapstring = PyUnicode_AS_UNICODE(mapping);
6440 maplen = PyUnicode_GET_SIZE(mapping);
6441 while (s < e) {
6442 unsigned char ch = *s;
6443 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 if (ch < maplen)
6446 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 if (x == 0xfffe) {
6449 /* undefined mapping */
6450 outpos = p-PyUnicode_AS_UNICODE(v);
6451 startinpos = s-starts;
6452 endinpos = startinpos+1;
6453 if (unicode_decode_call_errorhandler(
6454 errors, &errorHandler,
6455 "charmap", "character maps to <undefined>",
6456 &starts, &e, &startinpos, &endinpos, &exc, &s,
6457 &v, &outpos, &p)) {
6458 goto onError;
6459 }
6460 continue;
6461 }
6462 *p++ = x;
6463 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006464 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006465 }
6466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 while (s < e) {
6468 unsigned char ch = *s;
6469 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006470
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6472 w = PyLong_FromLong((long)ch);
6473 if (w == NULL)
6474 goto onError;
6475 x = PyObject_GetItem(mapping, w);
6476 Py_DECREF(w);
6477 if (x == NULL) {
6478 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6479 /* No mapping found means: mapping is undefined. */
6480 PyErr_Clear();
6481 x = Py_None;
6482 Py_INCREF(x);
6483 } else
6484 goto onError;
6485 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* Apply mapping */
6488 if (PyLong_Check(x)) {
6489 long value = PyLong_AS_LONG(x);
6490 if (value < 0 || value > 65535) {
6491 PyErr_SetString(PyExc_TypeError,
6492 "character mapping must be in range(65536)");
6493 Py_DECREF(x);
6494 goto onError;
6495 }
6496 *p++ = (Py_UNICODE)value;
6497 }
6498 else if (x == Py_None) {
6499 /* undefined mapping */
6500 outpos = p-PyUnicode_AS_UNICODE(v);
6501 startinpos = s-starts;
6502 endinpos = startinpos+1;
6503 if (unicode_decode_call_errorhandler(
6504 errors, &errorHandler,
6505 "charmap", "character maps to <undefined>",
6506 &starts, &e, &startinpos, &endinpos, &exc, &s,
6507 &v, &outpos, &p)) {
6508 Py_DECREF(x);
6509 goto onError;
6510 }
6511 Py_DECREF(x);
6512 continue;
6513 }
6514 else if (PyUnicode_Check(x)) {
6515 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006516
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 if (targetsize == 1)
6518 /* 1-1 mapping */
6519 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006520
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 else if (targetsize > 1) {
6522 /* 1-n mapping */
6523 if (targetsize > extrachars) {
6524 /* resize first */
6525 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6526 Py_ssize_t needed = (targetsize - extrachars) + \
6527 (targetsize << 2);
6528 extrachars += needed;
6529 /* XXX overflow detection missing */
6530 if (_PyUnicode_Resize(&v,
6531 PyUnicode_GET_SIZE(v) + needed) < 0) {
6532 Py_DECREF(x);
6533 goto onError;
6534 }
6535 p = PyUnicode_AS_UNICODE(v) + oldpos;
6536 }
6537 Py_UNICODE_COPY(p,
6538 PyUnicode_AS_UNICODE(x),
6539 targetsize);
6540 p += targetsize;
6541 extrachars -= targetsize;
6542 }
6543 /* 1-0 mapping: skip the character */
6544 }
6545 else {
6546 /* wrong return value */
6547 PyErr_SetString(PyExc_TypeError,
6548 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006549 Py_DECREF(x);
6550 goto onError;
6551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 Py_DECREF(x);
6553 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 }
6556 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6558 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559 Py_XDECREF(errorHandler);
6560 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006561 if (PyUnicode_READY(v) == -1) {
6562 Py_DECREF(v);
6563 return NULL;
6564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568 Py_XDECREF(errorHandler);
6569 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 Py_XDECREF(v);
6571 return NULL;
6572}
6573
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006574/* Charmap encoding: the lookup table */
6575
Alexander Belopolsky40018472011-02-26 01:02:56 +00006576struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 PyObject_HEAD
6578 unsigned char level1[32];
6579 int count2, count3;
6580 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006581};
6582
6583static PyObject*
6584encoding_map_size(PyObject *obj, PyObject* args)
6585{
6586 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006587 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006589}
6590
6591static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006592 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 PyDoc_STR("Return the size (in bytes) of this object") },
6594 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006595};
6596
6597static void
6598encoding_map_dealloc(PyObject* o)
6599{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006600 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006601}
6602
6603static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 "EncodingMap", /*tp_name*/
6606 sizeof(struct encoding_map), /*tp_basicsize*/
6607 0, /*tp_itemsize*/
6608 /* methods */
6609 encoding_map_dealloc, /*tp_dealloc*/
6610 0, /*tp_print*/
6611 0, /*tp_getattr*/
6612 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006613 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 0, /*tp_repr*/
6615 0, /*tp_as_number*/
6616 0, /*tp_as_sequence*/
6617 0, /*tp_as_mapping*/
6618 0, /*tp_hash*/
6619 0, /*tp_call*/
6620 0, /*tp_str*/
6621 0, /*tp_getattro*/
6622 0, /*tp_setattro*/
6623 0, /*tp_as_buffer*/
6624 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6625 0, /*tp_doc*/
6626 0, /*tp_traverse*/
6627 0, /*tp_clear*/
6628 0, /*tp_richcompare*/
6629 0, /*tp_weaklistoffset*/
6630 0, /*tp_iter*/
6631 0, /*tp_iternext*/
6632 encoding_map_methods, /*tp_methods*/
6633 0, /*tp_members*/
6634 0, /*tp_getset*/
6635 0, /*tp_base*/
6636 0, /*tp_dict*/
6637 0, /*tp_descr_get*/
6638 0, /*tp_descr_set*/
6639 0, /*tp_dictoffset*/
6640 0, /*tp_init*/
6641 0, /*tp_alloc*/
6642 0, /*tp_new*/
6643 0, /*tp_free*/
6644 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006645};
6646
6647PyObject*
6648PyUnicode_BuildEncodingMap(PyObject* string)
6649{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006650 PyObject *result;
6651 struct encoding_map *mresult;
6652 int i;
6653 int need_dict = 0;
6654 unsigned char level1[32];
6655 unsigned char level2[512];
6656 unsigned char *mlevel1, *mlevel2, *mlevel3;
6657 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006658 int kind;
6659 void *data;
6660 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006662 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006663 PyErr_BadArgument();
6664 return NULL;
6665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006666 kind = PyUnicode_KIND(string);
6667 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006668 memset(level1, 0xFF, sizeof level1);
6669 memset(level2, 0xFF, sizeof level2);
6670
6671 /* If there isn't a one-to-one mapping of NULL to \0,
6672 or if there are non-BMP characters, we need to use
6673 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006674 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006675 need_dict = 1;
6676 for (i = 1; i < 256; i++) {
6677 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678 ch = PyUnicode_READ(kind, data, i);
6679 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006680 need_dict = 1;
6681 break;
6682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006683 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006684 /* unmapped character */
6685 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 l1 = ch >> 11;
6687 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006688 if (level1[l1] == 0xFF)
6689 level1[l1] = count2++;
6690 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006691 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006692 }
6693
6694 if (count2 >= 0xFF || count3 >= 0xFF)
6695 need_dict = 1;
6696
6697 if (need_dict) {
6698 PyObject *result = PyDict_New();
6699 PyObject *key, *value;
6700 if (!result)
6701 return NULL;
6702 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006703 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006704 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006705 if (!key || !value)
6706 goto failed1;
6707 if (PyDict_SetItem(result, key, value) == -1)
6708 goto failed1;
6709 Py_DECREF(key);
6710 Py_DECREF(value);
6711 }
6712 return result;
6713 failed1:
6714 Py_XDECREF(key);
6715 Py_XDECREF(value);
6716 Py_DECREF(result);
6717 return NULL;
6718 }
6719
6720 /* Create a three-level trie */
6721 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6722 16*count2 + 128*count3 - 1);
6723 if (!result)
6724 return PyErr_NoMemory();
6725 PyObject_Init(result, &EncodingMapType);
6726 mresult = (struct encoding_map*)result;
6727 mresult->count2 = count2;
6728 mresult->count3 = count3;
6729 mlevel1 = mresult->level1;
6730 mlevel2 = mresult->level23;
6731 mlevel3 = mresult->level23 + 16*count2;
6732 memcpy(mlevel1, level1, 32);
6733 memset(mlevel2, 0xFF, 16*count2);
6734 memset(mlevel3, 0, 128*count3);
6735 count3 = 0;
6736 for (i = 1; i < 256; i++) {
6737 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006739 /* unmapped character */
6740 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006741 o1 = PyUnicode_READ(kind, data, i)>>11;
6742 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006743 i2 = 16*mlevel1[o1] + o2;
6744 if (mlevel2[i2] == 0xFF)
6745 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006746 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006747 i3 = 128*mlevel2[i2] + o3;
6748 mlevel3[i3] = i;
6749 }
6750 return result;
6751}
6752
6753static int
6754encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6755{
6756 struct encoding_map *map = (struct encoding_map*)mapping;
6757 int l1 = c>>11;
6758 int l2 = (c>>7) & 0xF;
6759 int l3 = c & 0x7F;
6760 int i;
6761
6762#ifdef Py_UNICODE_WIDE
6763 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006765 }
6766#endif
6767 if (c == 0)
6768 return 0;
6769 /* level 1*/
6770 i = map->level1[l1];
6771 if (i == 0xFF) {
6772 return -1;
6773 }
6774 /* level 2*/
6775 i = map->level23[16*i+l2];
6776 if (i == 0xFF) {
6777 return -1;
6778 }
6779 /* level 3 */
6780 i = map->level23[16*map->count2 + 128*i + l3];
6781 if (i == 0) {
6782 return -1;
6783 }
6784 return i;
6785}
6786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006787/* Lookup the character ch in the mapping. If the character
6788 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006789 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006790static PyObject *
6791charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792{
Christian Heimes217cfd12007-12-02 14:31:20 +00006793 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006794 PyObject *x;
6795
6796 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006798 x = PyObject_GetItem(mapping, w);
6799 Py_DECREF(w);
6800 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6802 /* No mapping found means: mapping is undefined. */
6803 PyErr_Clear();
6804 x = Py_None;
6805 Py_INCREF(x);
6806 return x;
6807 } else
6808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006810 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006812 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 long value = PyLong_AS_LONG(x);
6814 if (value < 0 || value > 255) {
6815 PyErr_SetString(PyExc_TypeError,
6816 "character mapping must be in range(256)");
6817 Py_DECREF(x);
6818 return NULL;
6819 }
6820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006822 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 /* wrong return value */
6826 PyErr_Format(PyExc_TypeError,
6827 "character mapping must return integer, bytes or None, not %.400s",
6828 x->ob_type->tp_name);
6829 Py_DECREF(x);
6830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 }
6832}
6833
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006834static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006835charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006836{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006837 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6838 /* exponentially overallocate to minimize reallocations */
6839 if (requiredsize < 2*outsize)
6840 requiredsize = 2*outsize;
6841 if (_PyBytes_Resize(outobj, requiredsize))
6842 return -1;
6843 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006844}
6845
/* Outcome of an attempt to encode one character via a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006850 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851 space is available. Return a new reference to the object that
6852 was put in the output buffer, or Py_None, if the mapping was undefined
6853 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006854 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855static charmapencode_result
6856charmapencode_output(Py_UNICODE c, PyObject *mapping,
6857 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006859 PyObject *rep;
6860 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006861 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862
Christian Heimes90aa7642007-12-19 02:45:37 +00006863 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006864 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006866 if (res == -1)
6867 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 if (outsize<requiredsize)
6869 if (charmapencode_resize(outobj, outpos, requiredsize))
6870 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006871 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 outstart[(*outpos)++] = (char)res;
6873 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006874 }
6875
6876 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006879 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 Py_DECREF(rep);
6881 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006882 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 if (PyLong_Check(rep)) {
6884 Py_ssize_t requiredsize = *outpos+1;
6885 if (outsize<requiredsize)
6886 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6887 Py_DECREF(rep);
6888 return enc_EXCEPTION;
6889 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006890 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006892 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 else {
6894 const char *repchars = PyBytes_AS_STRING(rep);
6895 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6896 Py_ssize_t requiredsize = *outpos+repsize;
6897 if (outsize<requiredsize)
6898 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6899 Py_DECREF(rep);
6900 return enc_EXCEPTION;
6901 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006902 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 memcpy(outstart + *outpos, repchars, repsize);
6904 *outpos += repsize;
6905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006906 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006907 Py_DECREF(rep);
6908 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909}
6910
6911/* handle an error in PyUnicode_EncodeCharmap
6912 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006913static int
6914charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006915 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006917 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006918 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919{
6920 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 Py_ssize_t repsize;
6922 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 Py_UNICODE *uni2;
6924 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006925 Py_ssize_t collstartpos = *inpos;
6926 Py_ssize_t collendpos = *inpos+1;
6927 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 char *encoding = "charmap";
6929 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006930 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 /* find all unencodable characters */
6933 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006934 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006935 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 int res = encoding_map_lookup(p[collendpos], mapping);
6937 if (res != -1)
6938 break;
6939 ++collendpos;
6940 continue;
6941 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006942
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 rep = charmapencode_lookup(p[collendpos], mapping);
6944 if (rep==NULL)
6945 return -1;
6946 else if (rep!=Py_None) {
6947 Py_DECREF(rep);
6948 break;
6949 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006950 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952 }
6953 /* cache callback name lookup
6954 * (if not done yet, i.e. it's the first error) */
6955 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 if ((errors==NULL) || (!strcmp(errors, "strict")))
6957 *known_errorHandler = 1;
6958 else if (!strcmp(errors, "replace"))
6959 *known_errorHandler = 2;
6960 else if (!strcmp(errors, "ignore"))
6961 *known_errorHandler = 3;
6962 else if (!strcmp(errors, "xmlcharrefreplace"))
6963 *known_errorHandler = 4;
6964 else
6965 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 }
6967 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968 case 1: /* strict */
6969 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6970 return -1;
6971 case 2: /* replace */
6972 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 x = charmapencode_output('?', mapping, res, respos);
6974 if (x==enc_EXCEPTION) {
6975 return -1;
6976 }
6977 else if (x==enc_FAILED) {
6978 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6979 return -1;
6980 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006981 }
6982 /* fall through */
6983 case 3: /* ignore */
6984 *inpos = collendpos;
6985 break;
6986 case 4: /* xmlcharrefreplace */
6987 /* generate replacement (temporarily (mis)uses p) */
6988 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 char buffer[2+29+1+1];
6990 char *cp;
6991 sprintf(buffer, "&#%d;", (int)p[collpos]);
6992 for (cp = buffer; *cp; ++cp) {
6993 x = charmapencode_output(*cp, mapping, res, respos);
6994 if (x==enc_EXCEPTION)
6995 return -1;
6996 else if (x==enc_FAILED) {
6997 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6998 return -1;
6999 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000 }
7001 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007002 *inpos = collendpos;
7003 break;
7004 default:
7005 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 encoding, reason, p, size, exceptionObject,
7007 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007008 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007010 if (PyBytes_Check(repunicode)) {
7011 /* Directly copy bytes result to output. */
7012 Py_ssize_t outsize = PyBytes_Size(*res);
7013 Py_ssize_t requiredsize;
7014 repsize = PyBytes_Size(repunicode);
7015 requiredsize = *respos + repsize;
7016 if (requiredsize > outsize)
7017 /* Make room for all additional bytes. */
7018 if (charmapencode_resize(res, respos, requiredsize)) {
7019 Py_DECREF(repunicode);
7020 return -1;
7021 }
7022 memcpy(PyBytes_AsString(*res) + *respos,
7023 PyBytes_AsString(repunicode), repsize);
7024 *respos += repsize;
7025 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007026 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007027 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007028 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007029 /* generate replacement */
7030 repsize = PyUnicode_GET_SIZE(repunicode);
7031 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 x = charmapencode_output(*uni2, mapping, res, respos);
7033 if (x==enc_EXCEPTION) {
7034 return -1;
7035 }
7036 else if (x==enc_FAILED) {
7037 Py_DECREF(repunicode);
7038 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7039 return -1;
7040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 }
7042 *inpos = newpos;
7043 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 }
7045 return 0;
7046}
7047
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048PyObject *
7049PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7050 Py_ssize_t size,
7051 PyObject *mapping,
7052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 /* output object */
7055 PyObject *res = NULL;
7056 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007057 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 PyObject *errorHandler = NULL;
7061 PyObject *exc = NULL;
7062 /* the following variable is used for caching string comparisons
7063 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7064 * 3=ignore, 4=xmlcharrefreplace */
7065 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066
7067 /* Default to Latin-1 */
7068 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 /* allocate enough for a simple encoding without
7072 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007073 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 if (res == NULL)
7075 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007076 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 /* try to encode it */
7081 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7082 if (x==enc_EXCEPTION) /* error */
7083 goto onError;
7084 if (x==enc_FAILED) { /* unencodable character */
7085 if (charmap_encoding_error(p, size, &inpos, mapping,
7086 &exc,
7087 &known_errorHandler, &errorHandler, errors,
7088 &res, &respos)) {
7089 goto onError;
7090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 else
7093 /* done with this character => adjust input position */
7094 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007098 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007099 if (_PyBytes_Resize(&res, respos) < 0)
7100 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007102 Py_XDECREF(exc);
7103 Py_XDECREF(errorHandler);
7104 return res;
7105
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007107 Py_XDECREF(res);
7108 Py_XDECREF(exc);
7109 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 return NULL;
7111}
7112
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113PyObject *
7114PyUnicode_AsCharmapString(PyObject *unicode,
7115 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116{
7117 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 PyErr_BadArgument();
7119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 }
7121 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 PyUnicode_GET_SIZE(unicode),
7123 mapping,
7124 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125}
7126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007127/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007128static void
7129make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007130 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007131 Py_ssize_t startpos, Py_ssize_t endpos,
7132 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135 *exceptionObject = _PyUnicodeTranslateError_Create(
7136 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 }
7138 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7140 goto onError;
7141 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7142 goto onError;
7143 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7144 goto onError;
7145 return;
7146 onError:
7147 Py_DECREF(*exceptionObject);
7148 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 }
7150}
7151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007152/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007153static void
7154raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007155 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156 Py_ssize_t startpos, Py_ssize_t endpos,
7157 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007158{
7159 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007160 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007163}
7164
7165/* error handling callback helper:
7166 build arguments, call the callback and check the arguments,
7167 put the result into newpos and return the replacement string, which
7168 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007169static PyObject *
7170unicode_translate_call_errorhandler(const char *errors,
7171 PyObject **errorHandler,
7172 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007173 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007174 Py_ssize_t startpos, Py_ssize_t endpos,
7175 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007177 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007178
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007179 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 PyObject *restuple;
7181 PyObject *resunicode;
7182
7183 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187 }
7188
7189 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007190 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193
7194 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007196 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007199 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 Py_DECREF(restuple);
7201 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 }
7203 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 &resunicode, &i_newpos)) {
7205 Py_DECREF(restuple);
7206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007207 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007208 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007210 else
7211 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7214 Py_DECREF(restuple);
7215 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007216 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 Py_INCREF(resunicode);
7218 Py_DECREF(restuple);
7219 return resunicode;
7220}
7221
7222/* Lookup the character ch in the mapping and put the result in result,
7223 which must be decrefed by the caller.
7224 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007225static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007227{
Christian Heimes217cfd12007-12-02 14:31:20 +00007228 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 PyObject *x;
7230
7231 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 x = PyObject_GetItem(mapping, w);
7234 Py_DECREF(w);
7235 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7237 /* No mapping found means: use 1:1 mapping. */
7238 PyErr_Clear();
7239 *result = NULL;
7240 return 0;
7241 } else
7242 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 }
7244 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 *result = x;
7246 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007247 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007248 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 long value = PyLong_AS_LONG(x);
7250 long max = PyUnicode_GetMax();
7251 if (value < 0 || value > max) {
7252 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007253 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 Py_DECREF(x);
7255 return -1;
7256 }
7257 *result = x;
7258 return 0;
7259 }
7260 else if (PyUnicode_Check(x)) {
7261 *result = x;
7262 return 0;
7263 }
7264 else {
7265 /* wrong return value */
7266 PyErr_SetString(PyExc_TypeError,
7267 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007268 Py_DECREF(x);
7269 return -1;
7270 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271}
7272/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 if not reallocate and adjust various state variables.
7274 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007279 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007280 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 /* exponentially overallocate to minimize reallocations */
7282 if (requiredsize < 2 * oldsize)
7283 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007284 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7285 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007287 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 }
7289 return 0;
7290}
7291/* lookup the character, put the result in the output string and adjust
7292 various state variables. Return a new reference to the object that
7293 was put in the output buffer in *result, or Py_None, if the mapping was
7294 undefined (in which case no character was written).
7295 The called must decref result.
7296 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007297static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007298charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7299 PyObject *mapping, Py_UCS4 **output,
7300 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007301 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007303 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7304 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309 }
7310 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007312 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007315 }
7316 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 Py_ssize_t repsize;
7318 if (PyUnicode_READY(*res) == -1)
7319 return -1;
7320 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 if (repsize==1) {
7322 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007323 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 }
7325 else if (repsize!=0) {
7326 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007327 Py_ssize_t requiredsize = *opos +
7328 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007330 Py_ssize_t i;
7331 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 for(i = 0; i < repsize; i++)
7334 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007336 }
7337 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 return 0;
7340}
7341
Alexander Belopolsky40018472011-02-26 01:02:56 +00007342PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007343_PyUnicode_TranslateCharmap(PyObject *input,
7344 PyObject *mapping,
7345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007347 /* input object */
7348 char *idata;
7349 Py_ssize_t size, i;
7350 int kind;
7351 /* output buffer */
7352 Py_UCS4 *output = NULL;
7353 Py_ssize_t osize;
7354 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007356 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 char *reason = "character maps to <undefined>";
7358 PyObject *errorHandler = NULL;
7359 PyObject *exc = NULL;
7360 /* the following variable is used for caching string comparisons
7361 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7362 * 3=ignore, 4=xmlcharrefreplace */
7363 int known_errorHandler = -1;
7364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 PyErr_BadArgument();
7367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007370 if (PyUnicode_READY(input) == -1)
7371 return NULL;
7372 idata = (char*)PyUnicode_DATA(input);
7373 kind = PyUnicode_KIND(input);
7374 size = PyUnicode_GET_LENGTH(input);
7375 i = 0;
7376
7377 if (size == 0) {
7378 Py_INCREF(input);
7379 return input;
7380 }
7381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 /* allocate enough for a simple 1:1 translation without
7383 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 osize = size;
7385 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7386 opos = 0;
7387 if (output == NULL) {
7388 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007392 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 /* try to encode it */
7394 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007395 if (charmaptranslate_output(input, i, mapping,
7396 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 Py_XDECREF(x);
7398 goto onError;
7399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007400 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007402 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 else { /* untranslatable character */
7404 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7405 Py_ssize_t repsize;
7406 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 Py_ssize_t collstart = i;
7410 Py_ssize_t collend = i+1;
7411 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007414 while (collend < size) {
7415 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 goto onError;
7417 Py_XDECREF(x);
7418 if (x!=Py_None)
7419 break;
7420 ++collend;
7421 }
7422 /* cache callback name lookup
7423 * (if not done yet, i.e. it's the first error) */
7424 if (known_errorHandler==-1) {
7425 if ((errors==NULL) || (!strcmp(errors, "strict")))
7426 known_errorHandler = 1;
7427 else if (!strcmp(errors, "replace"))
7428 known_errorHandler = 2;
7429 else if (!strcmp(errors, "ignore"))
7430 known_errorHandler = 3;
7431 else if (!strcmp(errors, "xmlcharrefreplace"))
7432 known_errorHandler = 4;
7433 else
7434 known_errorHandler = 0;
7435 }
7436 switch (known_errorHandler) {
7437 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007438 raise_translate_exception(&exc, input, collstart,
7439 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007440 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 case 2: /* replace */
7442 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007443 for (coll = collstart; coll<collend; coll++)
7444 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 /* fall through */
7446 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007447 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 break;
7449 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007450 /* generate replacement (temporarily (mis)uses i) */
7451 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 char buffer[2+29+1+1];
7453 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007454 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7455 if (charmaptranslate_makespace(&output, &osize,
7456 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 goto onError;
7458 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007461 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 break;
7463 default:
7464 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 reason, input, &exc,
7466 collstart, collend, &newpos);
7467 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 goto onError;
7469 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007470 repsize = PyUnicode_GET_LENGTH(repunicode);
7471 if (charmaptranslate_makespace(&output, &osize,
7472 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 Py_DECREF(repunicode);
7474 goto onError;
7475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 for (uni2 = 0; repsize-->0; ++uni2)
7477 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7478 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007481 }
7482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7484 if (!res)
7485 goto onError;
7486 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007487 Py_XDECREF(exc);
7488 Py_XDECREF(errorHandler);
7489 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007492 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493 Py_XDECREF(exc);
7494 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 return NULL;
7496}
7497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498/* Deprecated. Use PyUnicode_Translate instead. */
7499PyObject *
7500PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7501 Py_ssize_t size,
7502 PyObject *mapping,
7503 const char *errors)
7504{
7505 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7506 if (!unicode)
7507 return NULL;
7508 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7509}
7510
Alexander Belopolsky40018472011-02-26 01:02:56 +00007511PyObject *
7512PyUnicode_Translate(PyObject *str,
7513 PyObject *mapping,
7514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515{
7516 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007517
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 str = PyUnicode_FromObject(str);
7519 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007521 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 Py_DECREF(str);
7523 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007524
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 Py_XDECREF(str);
7527 return NULL;
7528}
Tim Petersced69f82003-09-16 20:30:58 +00007529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007530static Py_UCS4
7531fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7532{
7533 /* No need to call PyUnicode_READY(self) because this function is only
7534 called as a callback from fixup() which does it already. */
7535 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7536 const int kind = PyUnicode_KIND(self);
7537 void *data = PyUnicode_DATA(self);
7538 Py_UCS4 maxchar = 0, ch, fixed;
7539 Py_ssize_t i;
7540
7541 for (i = 0; i < len; ++i) {
7542 ch = PyUnicode_READ(kind, data, i);
7543 fixed = 0;
7544 if (ch > 127) {
7545 if (Py_UNICODE_ISSPACE(ch))
7546 fixed = ' ';
7547 else {
7548 const int decimal = Py_UNICODE_TODECIMAL(ch);
7549 if (decimal >= 0)
7550 fixed = '0' + decimal;
7551 }
7552 if (fixed != 0) {
7553 if (fixed > maxchar)
7554 maxchar = fixed;
7555 PyUnicode_WRITE(kind, data, i, fixed);
7556 }
7557 else if (ch > maxchar)
7558 maxchar = ch;
7559 }
7560 else if (ch > maxchar)
7561 maxchar = ch;
7562 }
7563
7564 return maxchar;
7565}
7566
7567PyObject *
7568_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7569{
7570 if (!PyUnicode_Check(unicode)) {
7571 PyErr_BadInternalCall();
7572 return NULL;
7573 }
7574 if (PyUnicode_READY(unicode) == -1)
7575 return NULL;
7576 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7577 /* If the string is already ASCII, just return the same string */
7578 Py_INCREF(unicode);
7579 return unicode;
7580 }
7581 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7582}
7583
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007584PyObject *
7585PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7586 Py_ssize_t length)
7587{
7588 PyObject *result;
7589 Py_UNICODE *p; /* write pointer into result */
7590 Py_ssize_t i;
7591 /* Copy to a new string */
7592 result = (PyObject *)_PyUnicode_New(length);
7593 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7594 if (result == NULL)
7595 return result;
7596 p = PyUnicode_AS_UNICODE(result);
7597 /* Iterate over code points */
7598 for (i = 0; i < length; i++) {
7599 Py_UNICODE ch =s[i];
7600 if (ch > 127) {
7601 int decimal = Py_UNICODE_TODECIMAL(ch);
7602 if (decimal >= 0)
7603 p[i] = '0' + decimal;
7604 }
7605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007606 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7607 Py_DECREF(result);
7608 return NULL;
7609 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007610 return result;
7611}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007612/* --- Decimal Encoder ---------------------------------------------------- */
7613
Alexander Belopolsky40018472011-02-26 01:02:56 +00007614int
7615PyUnicode_EncodeDecimal(Py_UNICODE *s,
7616 Py_ssize_t length,
7617 char *output,
7618 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007619{
7620 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621 PyObject *errorHandler = NULL;
7622 PyObject *exc = NULL;
7623 const char *encoding = "decimal";
7624 const char *reason = "invalid decimal Unicode string";
7625 /* the following variable is used for caching string comparisons
7626 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7627 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007628
7629 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 PyErr_BadArgument();
7631 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007632 }
7633
7634 p = s;
7635 end = s + length;
7636 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 register Py_UNICODE ch = *p;
7638 int decimal;
7639 PyObject *repunicode;
7640 Py_ssize_t repsize;
7641 Py_ssize_t newpos;
7642 Py_UNICODE *uni2;
7643 Py_UNICODE *collstart;
7644 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007645
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 ++p;
7649 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 decimal = Py_UNICODE_TODECIMAL(ch);
7652 if (decimal >= 0) {
7653 *output++ = '0' + decimal;
7654 ++p;
7655 continue;
7656 }
7657 if (0 < ch && ch < 256) {
7658 *output++ = (char)ch;
7659 ++p;
7660 continue;
7661 }
7662 /* All other characters are considered unencodable */
7663 collstart = p;
7664 collend = p+1;
7665 while (collend < end) {
7666 if ((0 < *collend && *collend < 256) ||
7667 !Py_UNICODE_ISSPACE(*collend) ||
7668 Py_UNICODE_TODECIMAL(*collend))
7669 break;
7670 }
7671 /* cache callback name lookup
7672 * (if not done yet, i.e. it's the first error) */
7673 if (known_errorHandler==-1) {
7674 if ((errors==NULL) || (!strcmp(errors, "strict")))
7675 known_errorHandler = 1;
7676 else if (!strcmp(errors, "replace"))
7677 known_errorHandler = 2;
7678 else if (!strcmp(errors, "ignore"))
7679 known_errorHandler = 3;
7680 else if (!strcmp(errors, "xmlcharrefreplace"))
7681 known_errorHandler = 4;
7682 else
7683 known_errorHandler = 0;
7684 }
7685 switch (known_errorHandler) {
7686 case 1: /* strict */
7687 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7688 goto onError;
7689 case 2: /* replace */
7690 for (p = collstart; p < collend; ++p)
7691 *output++ = '?';
7692 /* fall through */
7693 case 3: /* ignore */
7694 p = collend;
7695 break;
7696 case 4: /* xmlcharrefreplace */
7697 /* generate replacement (temporarily (mis)uses p) */
7698 for (p = collstart; p < collend; ++p)
7699 output += sprintf(output, "&#%d;", (int)*p);
7700 p = collend;
7701 break;
7702 default:
7703 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7704 encoding, reason, s, length, &exc,
7705 collstart-s, collend-s, &newpos);
7706 if (repunicode == NULL)
7707 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007708 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007709 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007710 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7711 Py_DECREF(repunicode);
7712 goto onError;
7713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* generate replacement */
7715 repsize = PyUnicode_GET_SIZE(repunicode);
7716 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7717 Py_UNICODE ch = *uni2;
7718 if (Py_UNICODE_ISSPACE(ch))
7719 *output++ = ' ';
7720 else {
7721 decimal = Py_UNICODE_TODECIMAL(ch);
7722 if (decimal >= 0)
7723 *output++ = '0' + decimal;
7724 else if (0 < ch && ch < 256)
7725 *output++ = (char)ch;
7726 else {
7727 Py_DECREF(repunicode);
7728 raise_encode_exception(&exc, encoding,
7729 s, length, collstart-s, collend-s, reason);
7730 goto onError;
7731 }
7732 }
7733 }
7734 p = s + newpos;
7735 Py_DECREF(repunicode);
7736 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007737 }
7738 /* 0-terminate the output string */
7739 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 Py_XDECREF(exc);
7741 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007742 return 0;
7743
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745 Py_XDECREF(exc);
7746 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007747 return -1;
7748}
7749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750/* --- Helpers ------------------------------------------------------------ */
7751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752#include "stringlib/ucs1lib.h"
7753#include "stringlib/fastsearch.h"
7754#include "stringlib/partition.h"
7755#include "stringlib/split.h"
7756#include "stringlib/count.h"
7757#include "stringlib/find.h"
7758#include "stringlib/localeutil.h"
7759#include "stringlib/undef.h"
7760
7761#include "stringlib/ucs2lib.h"
7762#include "stringlib/fastsearch.h"
7763#include "stringlib/partition.h"
7764#include "stringlib/split.h"
7765#include "stringlib/count.h"
7766#include "stringlib/find.h"
7767#include "stringlib/localeutil.h"
7768#include "stringlib/undef.h"
7769
7770#include "stringlib/ucs4lib.h"
7771#include "stringlib/fastsearch.h"
7772#include "stringlib/partition.h"
7773#include "stringlib/split.h"
7774#include "stringlib/count.h"
7775#include "stringlib/find.h"
7776#include "stringlib/localeutil.h"
7777#include "stringlib/undef.h"
7778
7779static Py_ssize_t
7780any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7781 const Py_UCS1*, Py_ssize_t,
7782 Py_ssize_t, Py_ssize_t),
7783 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7784 const Py_UCS2*, Py_ssize_t,
7785 Py_ssize_t, Py_ssize_t),
7786 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7787 const Py_UCS4*, Py_ssize_t,
7788 Py_ssize_t, Py_ssize_t),
7789 PyObject* s1, PyObject* s2,
7790 Py_ssize_t start,
7791 Py_ssize_t end)
7792{
7793 int kind1, kind2, kind;
7794 void *buf1, *buf2;
7795 Py_ssize_t len1, len2, result;
7796
7797 kind1 = PyUnicode_KIND(s1);
7798 kind2 = PyUnicode_KIND(s2);
7799 kind = kind1 > kind2 ? kind1 : kind2;
7800 buf1 = PyUnicode_DATA(s1);
7801 buf2 = PyUnicode_DATA(s2);
7802 if (kind1 != kind)
7803 buf1 = _PyUnicode_AsKind(s1, kind);
7804 if (!buf1)
7805 return -2;
7806 if (kind2 != kind)
7807 buf2 = _PyUnicode_AsKind(s2, kind);
7808 if (!buf2) {
7809 if (kind1 != kind) PyMem_Free(buf1);
7810 return -2;
7811 }
7812 len1 = PyUnicode_GET_LENGTH(s1);
7813 len2 = PyUnicode_GET_LENGTH(s2);
7814
7815 switch(kind) {
7816 case PyUnicode_1BYTE_KIND:
7817 result = ucs1(buf1, len1, buf2, len2, start, end);
7818 break;
7819 case PyUnicode_2BYTE_KIND:
7820 result = ucs2(buf1, len1, buf2, len2, start, end);
7821 break;
7822 case PyUnicode_4BYTE_KIND:
7823 result = ucs4(buf1, len1, buf2, len2, start, end);
7824 break;
7825 default:
7826 assert(0); result = -2;
7827 }
7828
7829 if (kind1 != kind)
7830 PyMem_Free(buf1);
7831 if (kind2 != kind)
7832 PyMem_Free(buf2);
7833
7834 return result;
7835}
7836
7837Py_ssize_t
7838_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7839 Py_ssize_t n_buffer,
7840 void *digits, Py_ssize_t n_digits,
7841 Py_ssize_t min_width,
7842 const char *grouping,
7843 const char *thousands_sep)
7844{
7845 switch(kind) {
7846 case PyUnicode_1BYTE_KIND:
7847 return _PyUnicode_ucs1_InsertThousandsGrouping(
7848 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7849 min_width, grouping, thousands_sep);
7850 case PyUnicode_2BYTE_KIND:
7851 return _PyUnicode_ucs2_InsertThousandsGrouping(
7852 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7853 min_width, grouping, thousands_sep);
7854 case PyUnicode_4BYTE_KIND:
7855 return _PyUnicode_ucs4_InsertThousandsGrouping(
7856 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7857 min_width, grouping, thousands_sep);
7858 }
7859 assert(0);
7860 return -1;
7861}
7862
7863
Eric Smith8c663262007-08-25 02:26:07 +00007864#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007865#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007866
Thomas Wouters477c8d52006-05-27 19:21:47 +00007867#include "stringlib/count.h"
7868#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007869
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, maps negative `start`/`end` to offsets from the
   end of the sequence (adding `len`), and floors both at 0.  `start` and
   `end` must be modifiable lvalues; they are updated in place.

   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe under if/else), and every argument is parenthesized so
   that compound expressions can be passed.  Invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007884
Alexander Belopolsky40018472011-02-26 01:02:56 +00007885Py_ssize_t
7886PyUnicode_Count(PyObject *str,
7887 PyObject *substr,
7888 Py_ssize_t start,
7889 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007891 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007892 PyUnicodeObject* str_obj;
7893 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007894 int kind1, kind2, kind;
7895 void *buf1 = NULL, *buf2 = NULL;
7896 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007897
Thomas Wouters477c8d52006-05-27 19:21:47 +00007898 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007901 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 Py_DECREF(str_obj);
7904 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 }
Tim Petersced69f82003-09-16 20:30:58 +00007906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 kind1 = PyUnicode_KIND(str_obj);
7908 kind2 = PyUnicode_KIND(sub_obj);
7909 kind = kind1 > kind2 ? kind1 : kind2;
7910 buf1 = PyUnicode_DATA(str_obj);
7911 if (kind1 != kind)
7912 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7913 if (!buf1)
7914 goto onError;
7915 buf2 = PyUnicode_DATA(sub_obj);
7916 if (kind2 != kind)
7917 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7918 if (!buf2)
7919 goto onError;
7920 len1 = PyUnicode_GET_LENGTH(str_obj);
7921 len2 = PyUnicode_GET_LENGTH(sub_obj);
7922
7923 ADJUST_INDICES(start, end, len1);
7924 switch(kind) {
7925 case PyUnicode_1BYTE_KIND:
7926 result = ucs1lib_count(
7927 ((Py_UCS1*)buf1) + start, end - start,
7928 buf2, len2, PY_SSIZE_T_MAX
7929 );
7930 break;
7931 case PyUnicode_2BYTE_KIND:
7932 result = ucs2lib_count(
7933 ((Py_UCS2*)buf1) + start, end - start,
7934 buf2, len2, PY_SSIZE_T_MAX
7935 );
7936 break;
7937 case PyUnicode_4BYTE_KIND:
7938 result = ucs4lib_count(
7939 ((Py_UCS4*)buf1) + start, end - start,
7940 buf2, len2, PY_SSIZE_T_MAX
7941 );
7942 break;
7943 default:
7944 assert(0); result = 0;
7945 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007946
7947 Py_DECREF(sub_obj);
7948 Py_DECREF(str_obj);
7949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007950 if (kind1 != kind)
7951 PyMem_Free(buf1);
7952 if (kind2 != kind)
7953 PyMem_Free(buf2);
7954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 onError:
7957 Py_DECREF(sub_obj);
7958 Py_DECREF(str_obj);
7959 if (kind1 != kind && buf1)
7960 PyMem_Free(buf1);
7961 if (kind2 != kind && buf2)
7962 PyMem_Free(buf2);
7963 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964}
7965
Alexander Belopolsky40018472011-02-26 01:02:56 +00007966Py_ssize_t
7967PyUnicode_Find(PyObject *str,
7968 PyObject *sub,
7969 Py_ssize_t start,
7970 Py_ssize_t end,
7971 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007973 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007974
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007978 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 Py_DECREF(str);
7981 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 }
Tim Petersced69f82003-09-16 20:30:58 +00007983
Thomas Wouters477c8d52006-05-27 19:21:47 +00007984 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 result = any_find_slice(
7986 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7987 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007988 );
7989 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 result = any_find_slice(
7991 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7992 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007993 );
7994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007996 Py_DECREF(sub);
7997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 return result;
7999}
8000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001Py_ssize_t
8002PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8003 Py_ssize_t start, Py_ssize_t end,
8004 int direction)
8005{
8006 char *result;
8007 int kind;
8008 if (PyUnicode_READY(str) == -1)
8009 return -2;
8010 if (end > PyUnicode_GET_LENGTH(str))
8011 end = PyUnicode_GET_LENGTH(str);
8012 kind = PyUnicode_KIND(str);
8013 result = findchar(PyUnicode_1BYTE_DATA(str)
8014 + PyUnicode_KIND_SIZE(kind, start),
8015 kind,
8016 end-start, ch, direction);
8017 if (!result)
8018 return -1;
8019 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8020}
8021
Alexander Belopolsky40018472011-02-26 01:02:56 +00008022static int
8023tailmatch(PyUnicodeObject *self,
8024 PyUnicodeObject *substring,
8025 Py_ssize_t start,
8026 Py_ssize_t end,
8027 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 int kind_self;
8030 int kind_sub;
8031 void *data_self;
8032 void *data_sub;
8033 Py_ssize_t offset;
8034 Py_ssize_t i;
8035 Py_ssize_t end_sub;
8036
8037 if (PyUnicode_READY(self) == -1 ||
8038 PyUnicode_READY(substring) == -1)
8039 return 0;
8040
8041 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 return 1;
8043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8045 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 kind_self = PyUnicode_KIND(self);
8050 data_self = PyUnicode_DATA(self);
8051 kind_sub = PyUnicode_KIND(substring);
8052 data_sub = PyUnicode_DATA(substring);
8053 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8054
8055 if (direction > 0)
8056 offset = end;
8057 else
8058 offset = start;
8059
8060 if (PyUnicode_READ(kind_self, data_self, offset) ==
8061 PyUnicode_READ(kind_sub, data_sub, 0) &&
8062 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8063 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8064 /* If both are of the same kind, memcmp is sufficient */
8065 if (kind_self == kind_sub) {
8066 return ! memcmp((char *)data_self +
8067 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8068 data_sub,
8069 PyUnicode_GET_LENGTH(substring) *
8070 PyUnicode_CHARACTER_SIZE(substring));
8071 }
8072 /* otherwise we have to compare each character by first accesing it */
8073 else {
8074 /* We do not need to compare 0 and len(substring)-1 because
8075 the if statement above ensured already that they are equal
8076 when we end up here. */
8077 // TODO: honor direction and do a forward or backwards search
8078 for (i = 1; i < end_sub; ++i) {
8079 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8080 PyUnicode_READ(kind_sub, data_sub, i))
8081 return 0;
8082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
8086
8087 return 0;
8088}
8089
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090Py_ssize_t
8091PyUnicode_Tailmatch(PyObject *str,
8092 PyObject *substr,
8093 Py_ssize_t start,
8094 Py_ssize_t end,
8095 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008098
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 str = PyUnicode_FromObject(str);
8100 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 substr = PyUnicode_FromObject(substr);
8103 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 Py_DECREF(str);
8105 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 }
Tim Petersced69f82003-09-16 20:30:58 +00008107
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 (PyUnicodeObject *)substr,
8110 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 Py_DECREF(str);
8112 Py_DECREF(substr);
8113 return result;
8114}
8115
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116/* Apply fixfct filter to the Unicode object self and return a
8117 reference to the modified object */
8118
Alexander Belopolsky40018472011-02-26 01:02:56 +00008119static PyObject *
8120fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 PyObject *u;
8124 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126 if (PyUnicode_READY(self) == -1)
8127 return NULL;
8128 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8129 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8130 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8135 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 /* fix functions return the new maximum character in a string,
8138 if the kind of the resulting unicode object does not change,
8139 everything is fine. Otherwise we need to change the string kind
8140 and re-run the fix function. */
8141 maxchar_new = fixfct((PyUnicodeObject*)u);
8142 if (maxchar_new == 0)
8143 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8144 else if (maxchar_new <= 127)
8145 maxchar_new = 127;
8146 else if (maxchar_new <= 255)
8147 maxchar_new = 255;
8148 else if (maxchar_new <= 65535)
8149 maxchar_new = 65535;
8150 else
8151 maxchar_new = 1114111; /* 0x10ffff */
8152
8153 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 /* fixfct should return TRUE if it modified the buffer. If
8155 FALSE, return a reference to the original buffer instead
8156 (to save space, not time) */
8157 Py_INCREF(self);
8158 Py_DECREF(u);
8159 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 else if (maxchar_new == maxchar_old) {
8162 return u;
8163 }
8164 else {
8165 /* In case the maximum character changed, we need to
8166 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008167 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 if (v == NULL) {
8169 Py_DECREF(u);
8170 return NULL;
8171 }
8172 if (maxchar_new > maxchar_old) {
8173 /* If the maxchar increased so that the kind changed, not all
8174 characters are representable anymore and we need to fix the
8175 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008176 if (PyUnicode_CopyCharacters(v, 0,
8177 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008178 PyUnicode_GET_LENGTH(self)) < 0)
8179 {
8180 Py_DECREF(u);
8181 return NULL;
8182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 maxchar_old = fixfct((PyUnicodeObject*)v);
8184 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8185 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008186 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008187 if (PyUnicode_CopyCharacters(v, 0,
8188 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008189 PyUnicode_GET_LENGTH(self)) < 0)
8190 {
8191 Py_DECREF(u);
8192 return NULL;
8193 }
8194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195
8196 Py_DECREF(u);
8197 return v;
8198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199}
8200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 /* No need to call PyUnicode_READY(self) because this function is only
8205 called as a callback from fixup() which does it already. */
8206 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8207 const int kind = PyUnicode_KIND(self);
8208 void *data = PyUnicode_DATA(self);
8209 int touched = 0;
8210 Py_UCS4 maxchar = 0;
8211 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 for (i = 0; i < len; ++i) {
8214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8215 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8216 if (up != ch) {
8217 if (up > maxchar)
8218 maxchar = up;
8219 PyUnicode_WRITE(kind, data, i, up);
8220 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 else if (ch > maxchar)
8223 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
8225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 if (touched)
8227 return maxchar;
8228 else
8229 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8236 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8237 const int kind = PyUnicode_KIND(self);
8238 void *data = PyUnicode_DATA(self);
8239 int touched = 0;
8240 Py_UCS4 maxchar = 0;
8241 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 for(i = 0; i < len; ++i) {
8244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8245 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8246 if (lo != ch) {
8247 if (lo > maxchar)
8248 maxchar = lo;
8249 PyUnicode_WRITE(kind, data, i, lo);
8250 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 else if (ch > maxchar)
8253 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 }
8255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 if (touched)
8257 return maxchar;
8258 else
8259 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260}
8261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008263fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8266 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8267 const int kind = PyUnicode_KIND(self);
8268 void *data = PyUnicode_DATA(self);
8269 int touched = 0;
8270 Py_UCS4 maxchar = 0;
8271 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 for(i = 0; i < len; ++i) {
8274 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8275 Py_UCS4 nu = 0;
8276
8277 if (Py_UNICODE_ISUPPER(ch))
8278 nu = Py_UNICODE_TOLOWER(ch);
8279 else if (Py_UNICODE_ISLOWER(ch))
8280 nu = Py_UNICODE_TOUPPER(ch);
8281
8282 if (nu != 0) {
8283 if (nu > maxchar)
8284 maxchar = nu;
8285 PyUnicode_WRITE(kind, data, i, nu);
8286 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 else if (ch > maxchar)
8289 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
8291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292 if (touched)
8293 return maxchar;
8294 else
8295 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296}
8297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008299fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8302 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8303 const int kind = PyUnicode_KIND(self);
8304 void *data = PyUnicode_DATA(self);
8305 int touched = 0;
8306 Py_UCS4 maxchar = 0;
8307 Py_ssize_t i = 0;
8308 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008309
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008310 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312
8313 ch = PyUnicode_READ(kind, data, i);
8314 if (!Py_UNICODE_ISUPPER(ch)) {
8315 maxchar = Py_UNICODE_TOUPPER(ch);
8316 PyUnicode_WRITE(kind, data, i, maxchar);
8317 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 ++i;
8320 for(; i < len; ++i) {
8321 ch = PyUnicode_READ(kind, data, i);
8322 if (!Py_UNICODE_ISLOWER(ch)) {
8323 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8324 if (lo > maxchar)
8325 maxchar = lo;
8326 PyUnicode_WRITE(kind, data, i, lo);
8327 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 else if (ch > maxchar)
8330 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332
8333 if (touched)
8334 return maxchar;
8335 else
8336 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337}
8338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008340fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8343 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8344 const int kind = PyUnicode_KIND(self);
8345 void *data = PyUnicode_DATA(self);
8346 Py_UCS4 maxchar = 0;
8347 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 int previous_is_cased;
8349
8350 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 if (len == 1) {
8352 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8353 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8354 if (ti != ch) {
8355 PyUnicode_WRITE(kind, data, i, ti);
8356 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 }
8358 else
8359 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 for(; i < len; ++i) {
8363 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8364 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008365
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 nu = Py_UNICODE_TOTITLE(ch);
8370
8371 if (nu > maxchar)
8372 maxchar = nu;
8373 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008374
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 if (Py_UNICODE_ISLOWER(ch) ||
8376 Py_UNICODE_ISUPPER(ch) ||
8377 Py_UNICODE_ISTITLE(ch))
8378 previous_is_cased = 1;
8379 else
8380 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383}
8384
Tim Peters8ce9f162004-08-27 01:49:32 +00008385PyObject *
8386PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008391 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008392 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8393 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008394 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 Py_ssize_t sz, i, res_offset;
8396 Py_UCS4 maxchar = 0;
8397 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398
Tim Peters05eba1f2004-08-27 21:32:02 +00008399 fseq = PySequence_Fast(seq, "");
8400 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008402 }
8403
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008404 /* NOTE: the following code can't call back into Python code,
8405 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008406 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008407
Tim Peters05eba1f2004-08-27 21:32:02 +00008408 seqlen = PySequence_Fast_GET_SIZE(fseq);
8409 /* If empty sequence, return u"". */
8410 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008413 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008414 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008415 /* If singleton sequence with an exact Unicode, return that. */
8416 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 item = items[0];
8418 if (PyUnicode_CheckExact(item)) {
8419 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 goto Done;
8422 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008423 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008424 else {
8425 /* Set up sep and seplen */
8426 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 /* fall back to a blank space separator */
8428 sep = PyUnicode_FromOrdinal(' ');
8429 if (!sep || PyUnicode_READY(sep) == -1)
8430 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008431 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008432 else {
8433 if (!PyUnicode_Check(separator)) {
8434 PyErr_Format(PyExc_TypeError,
8435 "separator: expected str instance,"
8436 " %.80s found",
8437 Py_TYPE(separator)->tp_name);
8438 goto onError;
8439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 if (PyUnicode_READY(separator) == -1)
8441 goto onError;
8442 sep = separator;
8443 seplen = PyUnicode_GET_LENGTH(separator);
8444 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8445 /* inc refcount to keep this code path symetric with the
8446 above case of a blank separator */
8447 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008448 }
8449 }
8450
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008451 /* There are at least two things to join, or else we have a subclass
8452 * of str in the sequence.
8453 * Do a pre-pass to figure out the total amount of space we'll
8454 * need (sz), and see whether all argument are strings.
8455 */
8456 sz = 0;
8457 for (i = 0; i < seqlen; i++) {
8458 const Py_ssize_t old_sz = sz;
8459 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 if (!PyUnicode_Check(item)) {
8461 PyErr_Format(PyExc_TypeError,
8462 "sequence item %zd: expected str instance,"
8463 " %.80s found",
8464 i, Py_TYPE(item)->tp_name);
8465 goto onError;
8466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 if (PyUnicode_READY(item) == -1)
8468 goto onError;
8469 sz += PyUnicode_GET_LENGTH(item);
8470 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8471 if (item_maxchar > maxchar)
8472 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008473 if (i != 0)
8474 sz += seplen;
8475 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8476 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008478 goto onError;
8479 }
8480 }
Tim Petersced69f82003-09-16 20:30:58 +00008481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008483 if (res == NULL)
8484 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008486 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008488 Py_ssize_t itemlen;
8489 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* Copy item, and maybe the separator. */
8492 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008493 if (PyUnicode_CopyCharacters(res, res_offset,
8494 sep, 0, seplen) < 0)
8495 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008498 if (PyUnicode_CopyCharacters(res, res_offset,
8499 item, 0, itemlen) < 0)
8500 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008504
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008506 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 Py_XDECREF(sep);
8508 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008511 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008513 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 return NULL;
8515}
8516
/* Store `value` into `length` consecutive characters of the character buffer
   `data`, starting at character index `start`.  `kind` selects the element
   type of the buffer (1-byte, 2-byte, or otherwise Py_UCS4) and must not be
   PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that expression
   arguments bind as expected.  Note that arguments may be evaluated more
   than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8539
Alexander Belopolsky40018472011-02-26 01:02:56 +00008540static PyUnicodeObject *
8541pad(PyUnicodeObject *self,
8542 Py_ssize_t left,
8543 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 PyObject *u;
8547 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008548 int kind;
8549 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
8551 if (left < 0)
8552 left = 0;
8553 if (right < 0)
8554 right = 0;
8555
Tim Peters7a29bd52001-09-12 03:03:31 +00008556 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 Py_INCREF(self);
8558 return self;
8559 }
8560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8562 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008563 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8564 return NULL;
8565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8567 if (fill > maxchar)
8568 maxchar = fill;
8569 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008570 if (!u)
8571 return NULL;
8572
8573 kind = PyUnicode_KIND(u);
8574 data = PyUnicode_DATA(u);
8575 if (left)
8576 FILL(kind, data, fill, 0, left);
8577 if (right)
8578 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008579 if (PyUnicode_CopyCharacters(u, left,
8580 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008581 _PyUnicode_LENGTH(self)) < 0)
8582 {
8583 Py_DECREF(u);
8584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
8586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Alexander Belopolsky40018472011-02-26 01:02:56 +00008591PyObject *
8592PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
8596 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 switch(PyUnicode_KIND(string)) {
8601 case PyUnicode_1BYTE_KIND:
8602 list = ucs1lib_splitlines(
8603 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8604 PyUnicode_GET_LENGTH(string), keepends);
8605 break;
8606 case PyUnicode_2BYTE_KIND:
8607 list = ucs2lib_splitlines(
8608 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8609 PyUnicode_GET_LENGTH(string), keepends);
8610 break;
8611 case PyUnicode_4BYTE_KIND:
8612 list = ucs4lib_splitlines(
8613 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8614 PyUnicode_GET_LENGTH(string), keepends);
8615 break;
8616 default:
8617 assert(0);
8618 list = 0;
8619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 Py_DECREF(string);
8621 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622}
8623
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624static PyObject *
8625split(PyUnicodeObject *self,
8626 PyUnicodeObject *substring,
8627 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 int kind1, kind2, kind;
8630 void *buf1, *buf2;
8631 Py_ssize_t len1, len2;
8632 PyObject* out;
8633
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008635 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (PyUnicode_READY(self) == -1)
8638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 if (substring == NULL)
8641 switch(PyUnicode_KIND(self)) {
8642 case PyUnicode_1BYTE_KIND:
8643 return ucs1lib_split_whitespace(
8644 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8645 PyUnicode_GET_LENGTH(self), maxcount
8646 );
8647 case PyUnicode_2BYTE_KIND:
8648 return ucs2lib_split_whitespace(
8649 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8650 PyUnicode_GET_LENGTH(self), maxcount
8651 );
8652 case PyUnicode_4BYTE_KIND:
8653 return ucs4lib_split_whitespace(
8654 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8655 PyUnicode_GET_LENGTH(self), maxcount
8656 );
8657 default:
8658 assert(0);
8659 return NULL;
8660 }
8661
8662 if (PyUnicode_READY(substring) == -1)
8663 return NULL;
8664
8665 kind1 = PyUnicode_KIND(self);
8666 kind2 = PyUnicode_KIND(substring);
8667 kind = kind1 > kind2 ? kind1 : kind2;
8668 buf1 = PyUnicode_DATA(self);
8669 buf2 = PyUnicode_DATA(substring);
8670 if (kind1 != kind)
8671 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8672 if (!buf1)
8673 return NULL;
8674 if (kind2 != kind)
8675 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8676 if (!buf2) {
8677 if (kind1 != kind) PyMem_Free(buf1);
8678 return NULL;
8679 }
8680 len1 = PyUnicode_GET_LENGTH(self);
8681 len2 = PyUnicode_GET_LENGTH(substring);
8682
8683 switch(kind) {
8684 case PyUnicode_1BYTE_KIND:
8685 out = ucs1lib_split(
8686 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8687 break;
8688 case PyUnicode_2BYTE_KIND:
8689 out = ucs2lib_split(
8690 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8691 break;
8692 case PyUnicode_4BYTE_KIND:
8693 out = ucs4lib_split(
8694 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8695 break;
8696 default:
8697 out = NULL;
8698 }
8699 if (kind1 != kind)
8700 PyMem_Free(buf1);
8701 if (kind2 != kind)
8702 PyMem_Free(buf2);
8703 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704}
8705
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706static PyObject *
8707rsplit(PyUnicodeObject *self,
8708 PyUnicodeObject *substring,
8709 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 int kind1, kind2, kind;
8712 void *buf1, *buf2;
8713 Py_ssize_t len1, len2;
8714 PyObject* out;
8715
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008716 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008717 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 if (PyUnicode_READY(self) == -1)
8720 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 if (substring == NULL)
8723 switch(PyUnicode_KIND(self)) {
8724 case PyUnicode_1BYTE_KIND:
8725 return ucs1lib_rsplit_whitespace(
8726 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8727 PyUnicode_GET_LENGTH(self), maxcount
8728 );
8729 case PyUnicode_2BYTE_KIND:
8730 return ucs2lib_rsplit_whitespace(
8731 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8732 PyUnicode_GET_LENGTH(self), maxcount
8733 );
8734 case PyUnicode_4BYTE_KIND:
8735 return ucs4lib_rsplit_whitespace(
8736 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8737 PyUnicode_GET_LENGTH(self), maxcount
8738 );
8739 default:
8740 assert(0);
8741 return NULL;
8742 }
8743
8744 if (PyUnicode_READY(substring) == -1)
8745 return NULL;
8746
8747 kind1 = PyUnicode_KIND(self);
8748 kind2 = PyUnicode_KIND(substring);
8749 kind = kind1 > kind2 ? kind1 : kind2;
8750 buf1 = PyUnicode_DATA(self);
8751 buf2 = PyUnicode_DATA(substring);
8752 if (kind1 != kind)
8753 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8754 if (!buf1)
8755 return NULL;
8756 if (kind2 != kind)
8757 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8758 if (!buf2) {
8759 if (kind1 != kind) PyMem_Free(buf1);
8760 return NULL;
8761 }
8762 len1 = PyUnicode_GET_LENGTH(self);
8763 len2 = PyUnicode_GET_LENGTH(substring);
8764
8765 switch(kind) {
8766 case PyUnicode_1BYTE_KIND:
8767 out = ucs1lib_rsplit(
8768 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8769 break;
8770 case PyUnicode_2BYTE_KIND:
8771 out = ucs2lib_rsplit(
8772 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8773 break;
8774 case PyUnicode_4BYTE_KIND:
8775 out = ucs4lib_rsplit(
8776 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8777 break;
8778 default:
8779 out = NULL;
8780 }
8781 if (kind1 != kind)
8782 PyMem_Free(buf1);
8783 if (kind2 != kind)
8784 PyMem_Free(buf2);
8785 return out;
8786}
8787
8788static Py_ssize_t
8789anylib_find(int kind, void *buf1, Py_ssize_t len1,
8790 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8791{
8792 switch(kind) {
8793 case PyUnicode_1BYTE_KIND:
8794 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8795 case PyUnicode_2BYTE_KIND:
8796 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8797 case PyUnicode_4BYTE_KIND:
8798 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8799 }
8800 assert(0);
8801 return -1;
8802}
8803
8804static Py_ssize_t
8805anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8806 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8807{
8808 switch(kind) {
8809 case PyUnicode_1BYTE_KIND:
8810 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8811 case PyUnicode_2BYTE_KIND:
8812 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8813 case PyUnicode_4BYTE_KIND:
8814 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8815 }
8816 assert(0);
8817 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008818}
8819
Alexander Belopolsky40018472011-02-26 01:02:56 +00008820static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821replace(PyObject *self, PyObject *str1,
8822 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 PyObject *u;
8825 char *sbuf = PyUnicode_DATA(self);
8826 char *buf1 = PyUnicode_DATA(str1);
8827 char *buf2 = PyUnicode_DATA(str2);
8828 int srelease = 0, release1 = 0, release2 = 0;
8829 int skind = PyUnicode_KIND(self);
8830 int kind1 = PyUnicode_KIND(str1);
8831 int kind2 = PyUnicode_KIND(str2);
8832 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8833 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8834 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835
8836 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008839 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 if (skind < kind1)
8842 /* substring too wide to be present */
8843 goto nothing;
8844
8845 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008846 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008847 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008849 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008851 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 Py_UCS4 u1, u2, maxchar;
8853 int mayshrink, rkind;
8854 u1 = PyUnicode_READ_CHAR(str1, 0);
8855 if (!findchar(sbuf, PyUnicode_KIND(self),
8856 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008857 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 u2 = PyUnicode_READ_CHAR(str2, 0);
8859 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8860 /* Replacing u1 with u2 may cause a maxchar reduction in the
8861 result string. */
8862 mayshrink = maxchar > 127;
8863 if (u2 > maxchar) {
8864 maxchar = u2;
8865 mayshrink = 0;
8866 }
8867 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008868 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008870 if (PyUnicode_CopyCharacters(u, 0,
8871 (PyObject*)self, 0, slen) < 0)
8872 {
8873 Py_DECREF(u);
8874 return NULL;
8875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 rkind = PyUnicode_KIND(u);
8877 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8878 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008879 if (--maxcount < 0)
8880 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 if (mayshrink) {
8884 PyObject *tmp = u;
8885 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8886 PyUnicode_GET_LENGTH(tmp));
8887 Py_DECREF(tmp);
8888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 int rkind = skind;
8891 char *res;
8892 if (kind1 < rkind) {
8893 /* widen substring */
8894 buf1 = _PyUnicode_AsKind(str1, rkind);
8895 if (!buf1) goto error;
8896 release1 = 1;
8897 }
8898 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008899 if (i < 0)
8900 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 if (rkind > kind2) {
8902 /* widen replacement */
8903 buf2 = _PyUnicode_AsKind(str2, rkind);
8904 if (!buf2) goto error;
8905 release2 = 1;
8906 }
8907 else if (rkind < kind2) {
8908 /* widen self and buf1 */
8909 rkind = kind2;
8910 if (release1) PyMem_Free(buf1);
8911 sbuf = _PyUnicode_AsKind(self, rkind);
8912 if (!sbuf) goto error;
8913 srelease = 1;
8914 buf1 = _PyUnicode_AsKind(str1, rkind);
8915 if (!buf1) goto error;
8916 release1 = 1;
8917 }
8918 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8919 if (!res) {
8920 PyErr_NoMemory();
8921 goto error;
8922 }
8923 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008924 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8926 buf2,
8927 PyUnicode_KIND_SIZE(rkind, len2));
8928 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008929
8930 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8932 slen-i,
8933 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008934 if (i == -1)
8935 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8937 buf2,
8938 PyUnicode_KIND_SIZE(rkind, len2));
8939 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941
8942 u = PyUnicode_FromKindAndData(rkind, res, slen);
8943 PyMem_Free(res);
8944 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 Py_ssize_t n, i, j, ires;
8949 Py_ssize_t product, new_size;
8950 int rkind = skind;
8951 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 if (kind1 < rkind) {
8954 buf1 = _PyUnicode_AsKind(str1, rkind);
8955 if (!buf1) goto error;
8956 release1 = 1;
8957 }
8958 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008959 if (n == 0)
8960 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 if (kind2 < rkind) {
8962 buf2 = _PyUnicode_AsKind(str2, rkind);
8963 if (!buf2) goto error;
8964 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 else if (kind2 > rkind) {
8967 rkind = kind2;
8968 sbuf = _PyUnicode_AsKind(self, rkind);
8969 if (!sbuf) goto error;
8970 srelease = 1;
8971 if (release1) PyMem_Free(buf1);
8972 buf1 = _PyUnicode_AsKind(str1, rkind);
8973 if (!buf1) goto error;
8974 release1 = 1;
8975 }
8976 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8977 PyUnicode_GET_LENGTH(str1))); */
8978 product = n * (len2-len1);
8979 if ((product / (len2-len1)) != n) {
8980 PyErr_SetString(PyExc_OverflowError,
8981 "replace string is too long");
8982 goto error;
8983 }
8984 new_size = slen + product;
8985 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8986 PyErr_SetString(PyExc_OverflowError,
8987 "replace string is too long");
8988 goto error;
8989 }
8990 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8991 if (!res)
8992 goto error;
8993 ires = i = 0;
8994 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008995 while (n-- > 0) {
8996 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 j = anylib_find(rkind,
8998 sbuf + PyUnicode_KIND_SIZE(rkind, i),
8999 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009000 if (j == -1)
9001 break;
9002 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009003 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9005 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9006 PyUnicode_KIND_SIZE(rkind, j-i));
9007 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009008 }
9009 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 if (len2 > 0) {
9011 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9012 buf2,
9013 PyUnicode_KIND_SIZE(rkind, len2));
9014 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009019 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9021 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9022 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 } else {
9024 /* interleave */
9025 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9027 buf2,
9028 PyUnicode_KIND_SIZE(rkind, len2));
9029 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009030 if (--n <= 0)
9031 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9033 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9034 PyUnicode_KIND_SIZE(rkind, 1));
9035 ires++;
9036 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9039 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9040 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (srelease)
9045 PyMem_FREE(sbuf);
9046 if (release1)
9047 PyMem_FREE(buf1);
9048 if (release2)
9049 PyMem_FREE(buf2);
9050 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051
Benjamin Peterson29060642009-01-31 22:14:21 +00009052 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (srelease)
9055 PyMem_FREE(sbuf);
9056 if (release1)
9057 PyMem_FREE(buf1);
9058 if (release2)
9059 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009060 if (PyUnicode_CheckExact(self)) {
9061 Py_INCREF(self);
9062 return (PyObject *) self;
9063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9065 PyUnicode_DATA(self),
9066 PyUnicode_GET_LENGTH(self));
9067 error:
9068 if (srelease && sbuf)
9069 PyMem_FREE(sbuf);
9070 if (release1 && buf1)
9071 PyMem_FREE(buf1);
9072 if (release2 && buf2)
9073 PyMem_FREE(buf2);
9074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075}
9076
9077/* --- Unicode Object Methods --------------------------------------------- */
9078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009079PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081\n\
9082Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009083characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
9085static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009086unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 return fixup(self, fixtitle);
9089}
9090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009091PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093\n\
9094Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009095have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096
9097static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009098unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 return fixup(self, fixcapitalize);
9101}
9102
9103#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009104PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106\n\
9107Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009108normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109
9110static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009111unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112{
9113 PyObject *list;
9114 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009115 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 /* Split into words */
9118 list = split(self, NULL, -1);
9119 if (!list)
9120 return NULL;
9121
9122 /* Capitalize each word */
9123 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9124 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 if (item == NULL)
9127 goto onError;
9128 Py_DECREF(PyList_GET_ITEM(list, i));
9129 PyList_SET_ITEM(list, i, item);
9130 }
9131
9132 /* Join the words to form a new string */
9133 item = PyUnicode_Join(NULL, list);
9134
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 Py_DECREF(list);
9137 return (PyObject *)item;
9138}
9139#endif
9140
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009141/* Argument converter. Coerces to a single unicode character */
9142
9143static int
9144convert_uc(PyObject *obj, void *addr)
9145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009147 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009148
Benjamin Peterson14339b62009-01-31 16:36:08 +00009149 uniobj = PyUnicode_FromObject(obj);
9150 if (uniobj == NULL) {
9151 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 return 0;
9154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009158 Py_DECREF(uniobj);
9159 return 0;
9160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 if (PyUnicode_READY(uniobj)) {
9162 Py_DECREF(uniobj);
9163 return 0;
9164 }
9165 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009166 Py_DECREF(uniobj);
9167 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009168}
9169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009170PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009173Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009174done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175
9176static PyObject *
9177unicode_center(PyUnicodeObject *self, PyObject *args)
9178{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009179 Py_ssize_t marg, left;
9180 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 Py_UCS4 fillchar = ' ';
9182
9183 if (PyUnicode_READY(self) == -1)
9184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185
Thomas Woutersde017742006-02-16 19:34:37 +00009186 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187 return NULL;
9188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190 Py_INCREF(self);
9191 return (PyObject*) self;
9192 }
9193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 left = marg / 2 + (marg & width & 1);
9196
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009197 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198}
9199
Marc-André Lemburge5034372000-08-08 08:04:29 +00009200#if 0
9201
9202/* This code should go into some future Unicode collation support
9203 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009204 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009205
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009206/* speedy UTF-16 code point order comparison */
9207/* gleaned from: */
9208/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9209
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009210static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009211{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009212 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009213 0, 0, 0, 0, 0, 0, 0, 0,
9214 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009215 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009216};
9217
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218static int
9219unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9220{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009221 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223 Py_UNICODE *s1 = str1->str;
9224 Py_UNICODE *s2 = str2->str;
9225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 len1 = str1->_base._base.length;
9227 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009230 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009231
9232 c1 = *s1++;
9233 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009234
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 if (c1 > (1<<11) * 26)
9236 c1 += utf16Fixup[c1>>11];
9237 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009238 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009239 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009240
9241 if (c1 != c2)
9242 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009243
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009244 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 }
9246
9247 return (len1 < len2) ? -1 : (len1 != len2);
9248}
9249
Marc-André Lemburge5034372000-08-08 08:04:29 +00009250#else
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252/* This function assumes that str1 and str2 are readied by the caller. */
9253
Marc-André Lemburge5034372000-08-08 08:04:29 +00009254static int
9255unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 int kind1, kind2;
9258 void *data1, *data2;
9259 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 kind1 = PyUnicode_KIND(str1);
9262 kind2 = PyUnicode_KIND(str2);
9263 data1 = PyUnicode_DATA(str1);
9264 data2 = PyUnicode_DATA(str2);
9265 len1 = PyUnicode_GET_LENGTH(str1);
9266 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 for (i = 0; i < len1 && i < len2; ++i) {
9269 Py_UCS4 c1, c2;
9270 c1 = PyUnicode_READ(kind1, data1, i);
9271 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009272
9273 if (c1 != c2)
9274 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009275 }
9276
9277 return (len1 < len2) ? -1 : (len1 != len2);
9278}
9279
9280#endif
9281
Alexander Belopolsky40018472011-02-26 01:02:56 +00009282int
9283PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9286 if (PyUnicode_READY(left) == -1 ||
9287 PyUnicode_READY(right) == -1)
9288 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009289 return unicode_compare((PyUnicodeObject *)left,
9290 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009292 PyErr_Format(PyExc_TypeError,
9293 "Can't compare %.100s and %.100s",
9294 left->ob_type->tp_name,
9295 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 return -1;
9297}
9298
Martin v. Löwis5b222132007-06-10 09:51:05 +00009299int
9300PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 Py_ssize_t i;
9303 int kind;
9304 void *data;
9305 Py_UCS4 chr;
9306
Martin v. Löwis5b222132007-06-10 09:51:05 +00009307 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 if (PyUnicode_READY(uni) == -1)
9309 return -1;
9310 kind = PyUnicode_KIND(uni);
9311 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009312 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9314 if (chr != str[i])
9315 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009316 /* This check keeps Python strings that end in '\0' from comparing equal
9317 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009320 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009322 return 0;
9323}
9324
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009325
Benjamin Peterson29060642009-01-31 22:14:21 +00009326#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009327 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009328
Alexander Belopolsky40018472011-02-26 01:02:56 +00009329PyObject *
9330PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009331{
9332 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009334 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9335 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 if (PyUnicode_READY(left) == -1 ||
9337 PyUnicode_READY(right) == -1)
9338 return NULL;
9339 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9340 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009341 if (op == Py_EQ) {
9342 Py_INCREF(Py_False);
9343 return Py_False;
9344 }
9345 if (op == Py_NE) {
9346 Py_INCREF(Py_True);
9347 return Py_True;
9348 }
9349 }
9350 if (left == right)
9351 result = 0;
9352 else
9353 result = unicode_compare((PyUnicodeObject *)left,
9354 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009356 /* Convert the return value to a Boolean */
9357 switch (op) {
9358 case Py_EQ:
9359 v = TEST_COND(result == 0);
9360 break;
9361 case Py_NE:
9362 v = TEST_COND(result != 0);
9363 break;
9364 case Py_LE:
9365 v = TEST_COND(result <= 0);
9366 break;
9367 case Py_GE:
9368 v = TEST_COND(result >= 0);
9369 break;
9370 case Py_LT:
9371 v = TEST_COND(result == -1);
9372 break;
9373 case Py_GT:
9374 v = TEST_COND(result == 1);
9375 break;
9376 default:
9377 PyErr_BadArgument();
9378 return NULL;
9379 }
9380 Py_INCREF(v);
9381 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009383
Brian Curtindfc80e32011-08-10 20:28:54 -05009384 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009385}
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387int
9388PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009389{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009390 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 int kind1, kind2, kind;
9392 void *buf1, *buf2;
9393 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009394 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009395
9396 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009397 sub = PyUnicode_FromObject(element);
9398 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 PyErr_Format(PyExc_TypeError,
9400 "'in <string>' requires string as left operand, not %s",
9401 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009402 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 if (PyUnicode_READY(sub) == -1)
9405 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009406
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009409 Py_DECREF(sub);
9410 return -1;
9411 }
9412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 kind1 = PyUnicode_KIND(str);
9414 kind2 = PyUnicode_KIND(sub);
9415 kind = kind1 > kind2 ? kind1 : kind2;
9416 buf1 = PyUnicode_DATA(str);
9417 buf2 = PyUnicode_DATA(sub);
9418 if (kind1 != kind)
9419 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9420 if (!buf1) {
9421 Py_DECREF(sub);
9422 return -1;
9423 }
9424 if (kind2 != kind)
9425 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9426 if (!buf2) {
9427 Py_DECREF(sub);
9428 if (kind1 != kind) PyMem_Free(buf1);
9429 return -1;
9430 }
9431 len1 = PyUnicode_GET_LENGTH(str);
9432 len2 = PyUnicode_GET_LENGTH(sub);
9433
9434 switch(kind) {
9435 case PyUnicode_1BYTE_KIND:
9436 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9437 break;
9438 case PyUnicode_2BYTE_KIND:
9439 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9440 break;
9441 case PyUnicode_4BYTE_KIND:
9442 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9443 break;
9444 default:
9445 result = -1;
9446 assert(0);
9447 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448
9449 Py_DECREF(str);
9450 Py_DECREF(sub);
9451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 if (kind1 != kind)
9453 PyMem_Free(buf1);
9454 if (kind2 != kind)
9455 PyMem_Free(buf2);
9456
Guido van Rossum403d68b2000-03-13 15:55:09 +00009457 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009458}
9459
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460/* Concat to string or Unicode object giving a new Unicode object. */
9461
Alexander Belopolsky40018472011-02-26 01:02:56 +00009462PyObject *
9463PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 PyObject *u = NULL, *v = NULL, *w;
9466 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467
9468 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475
9476 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 }
9485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9487 goto onError;
9488
9489 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009490 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 w = PyUnicode_New(
9494 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9495 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009498 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9499 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009500 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009501 v, 0,
9502 PyUnicode_GET_LENGTH(v)) < 0)
9503 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 Py_DECREF(u);
9505 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 Py_XDECREF(u);
9510 Py_XDECREF(v);
9511 return NULL;
9512}
9513
Walter Dörwald1ab83302007-05-18 17:15:44 +00009514void
9515PyUnicode_Append(PyObject **pleft, PyObject *right)
9516{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 PyObject *new;
9518 if (*pleft == NULL)
9519 return;
9520 if (right == NULL || !PyUnicode_Check(*pleft)) {
9521 Py_DECREF(*pleft);
9522 *pleft = NULL;
9523 return;
9524 }
9525 new = PyUnicode_Concat(*pleft, right);
9526 Py_DECREF(*pleft);
9527 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009528}
9529
9530void
9531PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9532{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 PyUnicode_Append(pleft, right);
9534 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009535}
9536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009537PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009540Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009541string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009542interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543
9544static PyObject *
9545unicode_count(PyUnicodeObject *self, PyObject *args)
9546{
9547 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009548 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009549 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 int kind1, kind2, kind;
9552 void *buf1, *buf2;
9553 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554
Jesus Ceaac451502011-04-20 17:09:23 +02009555 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9556 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009557 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 kind1 = PyUnicode_KIND(self);
9560 kind2 = PyUnicode_KIND(substring);
9561 kind = kind1 > kind2 ? kind1 : kind2;
9562 buf1 = PyUnicode_DATA(self);
9563 buf2 = PyUnicode_DATA(substring);
9564 if (kind1 != kind)
9565 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9566 if (!buf1) {
9567 Py_DECREF(substring);
9568 return NULL;
9569 }
9570 if (kind2 != kind)
9571 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9572 if (!buf2) {
9573 Py_DECREF(substring);
9574 if (kind1 != kind) PyMem_Free(buf1);
9575 return NULL;
9576 }
9577 len1 = PyUnicode_GET_LENGTH(self);
9578 len2 = PyUnicode_GET_LENGTH(substring);
9579
9580 ADJUST_INDICES(start, end, len1);
9581 switch(kind) {
9582 case PyUnicode_1BYTE_KIND:
9583 iresult = ucs1lib_count(
9584 ((Py_UCS1*)buf1) + start, end - start,
9585 buf2, len2, PY_SSIZE_T_MAX
9586 );
9587 break;
9588 case PyUnicode_2BYTE_KIND:
9589 iresult = ucs2lib_count(
9590 ((Py_UCS2*)buf1) + start, end - start,
9591 buf2, len2, PY_SSIZE_T_MAX
9592 );
9593 break;
9594 case PyUnicode_4BYTE_KIND:
9595 iresult = ucs4lib_count(
9596 ((Py_UCS4*)buf1) + start, end - start,
9597 buf2, len2, PY_SSIZE_T_MAX
9598 );
9599 break;
9600 default:
9601 assert(0); iresult = 0;
9602 }
9603
9604 result = PyLong_FromSsize_t(iresult);
9605
9606 if (kind1 != kind)
9607 PyMem_Free(buf1);
9608 if (kind2 != kind)
9609 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610
9611 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009612
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613 return result;
9614}
9615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009616PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009617 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009619Encode S using the codec registered for encoding. Default encoding\n\
9620is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009621handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009622a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9623'xmlcharrefreplace' as well as any other name registered with\n\
9624codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625
9626static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009627unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009629 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 char *encoding = NULL;
9631 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009632
Benjamin Peterson308d6372009-09-18 21:42:35 +00009633 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9634 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009636 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009637}
9638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009639PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641\n\
9642Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009643If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644
9645static PyObject*
9646unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9647{
9648 Py_UNICODE *e;
9649 Py_UNICODE *p;
9650 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009651 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 PyUnicodeObject *u;
9654 int tabsize = 8;
9655
9656 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9660 return NULL;
9661
Thomas Wouters7e474022000-07-16 12:04:32 +00009662 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009663 i = 0; /* chars up to and including most recent \n or \r */
9664 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9666 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009668 if (tabsize > 0) {
9669 incr = tabsize - (j % tabsize); /* cannot overflow */
9670 if (j > PY_SSIZE_T_MAX - incr)
9671 goto overflow1;
9672 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 if (j > PY_SSIZE_T_MAX - 1)
9677 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678 j++;
9679 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 if (i > PY_SSIZE_T_MAX - j)
9681 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009683 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 }
9685 }
9686
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009687 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009689
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690 /* Second pass: create output string and fill it */
9691 u = _PyUnicode_New(i + j);
9692 if (!u)
9693 return NULL;
9694
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009695 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 q = _PyUnicode_WSTR(u); /* next output char */
9697 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 if (tabsize > 0) {
9702 i = tabsize - (j % tabsize);
9703 j += i;
9704 while (i--) {
9705 if (q >= qe)
9706 goto overflow2;
9707 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009708 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 else {
9712 if (q >= qe)
9713 goto overflow2;
9714 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009715 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 if (*p == '\n' || *p == '\r')
9717 j = 0;
9718 }
9719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 if (PyUnicode_READY(u) == -1) {
9721 Py_DECREF(u);
9722 return NULL;
9723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009725
9726 overflow2:
9727 Py_DECREF(u);
9728 overflow1:
9729 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731}
9732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009733PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009734 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735\n\
9736Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009737such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738arguments start and end are interpreted as in slice notation.\n\
9739\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009740Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741
9742static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744{
Jesus Ceaac451502011-04-20 17:09:23 +02009745 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009746 Py_ssize_t start;
9747 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009748 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749
Jesus Ceaac451502011-04-20 17:09:23 +02009750 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9751 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 if (PyUnicode_READY(self) == -1)
9755 return NULL;
9756 if (PyUnicode_READY(substring) == -1)
9757 return NULL;
9758
9759 result = any_find_slice(
9760 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9761 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009762 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763
9764 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 if (result == -2)
9767 return NULL;
9768
Christian Heimes217cfd12007-12-02 14:31:20 +00009769 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
9771
9772static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009773unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 Py_UCS4 ch;
9776
9777 if (PyUnicode_READY(self) == -1)
9778 return NULL;
9779 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 PyErr_SetString(PyExc_IndexError, "string index out of range");
9781 return NULL;
9782 }
9783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9785 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786}
9787
Guido van Rossumc2504932007-09-18 19:42:40 +00009788/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009789 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009790static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009791unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792{
Guido van Rossumc2504932007-09-18 19:42:40 +00009793 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009794 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 if (_PyUnicode_HASH(self) != -1)
9797 return _PyUnicode_HASH(self);
9798 if (PyUnicode_READY(self) == -1)
9799 return -1;
9800 len = PyUnicode_GET_LENGTH(self);
9801
9802 /* The hash function as a macro, gets expanded three times below. */
9803#define HASH(P) \
9804 x = (Py_uhash_t)*P << 7; \
9805 while (--len >= 0) \
9806 x = (1000003*x) ^ (Py_uhash_t)*P++;
9807
9808 switch (PyUnicode_KIND(self)) {
9809 case PyUnicode_1BYTE_KIND: {
9810 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9811 HASH(c);
9812 break;
9813 }
9814 case PyUnicode_2BYTE_KIND: {
9815 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9816 HASH(s);
9817 break;
9818 }
9819 default: {
9820 Py_UCS4 *l;
9821 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9822 "Impossible switch case in unicode_hash");
9823 l = PyUnicode_4BYTE_DATA(self);
9824 HASH(l);
9825 break;
9826 }
9827 }
9828 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9829
Guido van Rossumc2504932007-09-18 19:42:40 +00009830 if (x == -1)
9831 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009833 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009837PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009840Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841
9842static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009845 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009846 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009847 Py_ssize_t start;
9848 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
Jesus Ceaac451502011-04-20 17:09:23 +02009850 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9851 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 if (PyUnicode_READY(self) == -1)
9855 return NULL;
9856 if (PyUnicode_READY(substring) == -1)
9857 return NULL;
9858
9859 result = any_find_slice(
9860 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9861 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
9864 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (result == -2)
9867 return NULL;
9868
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869 if (result < 0) {
9870 PyErr_SetString(PyExc_ValueError, "substring not found");
9871 return NULL;
9872 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009873
Christian Heimes217cfd12007-12-02 14:31:20 +00009874 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875}
9876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009877PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009878 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009880Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009881at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882
9883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009884unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 Py_ssize_t i, length;
9887 int kind;
9888 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 int cased;
9890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 if (PyUnicode_READY(self) == -1)
9892 return NULL;
9893 length = PyUnicode_GET_LENGTH(self);
9894 kind = PyUnicode_KIND(self);
9895 data = PyUnicode_DATA(self);
9896
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (length == 1)
9899 return PyBool_FromLong(
9900 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009902 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009905
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 for (i = 0; i < length; i++) {
9908 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009909
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9911 return PyBool_FromLong(0);
9912 else if (!cased && Py_UNICODE_ISLOWER(ch))
9913 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009915 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916}
9917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009918PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009921Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009922at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923
9924static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009925unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 Py_ssize_t i, length;
9928 int kind;
9929 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930 int cased;
9931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (PyUnicode_READY(self) == -1)
9933 return NULL;
9934 length = PyUnicode_GET_LENGTH(self);
9935 kind = PyUnicode_KIND(self);
9936 data = PyUnicode_DATA(self);
9937
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (length == 1)
9940 return PyBool_FromLong(
9941 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009943 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009946
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 for (i = 0; i < length; i++) {
9949 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009950
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9952 return PyBool_FromLong(0);
9953 else if (!cased && Py_UNICODE_ISUPPER(ch))
9954 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009956 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957}
9958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009959PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009960 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009962Return True if S is a titlecased string and there is at least one\n\
9963character in S, i.e. upper- and titlecase characters may only\n\
9964follow uncased characters and lowercase characters only cased ones.\n\
9965Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966
9967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009968unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 Py_ssize_t i, length;
9971 int kind;
9972 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 int cased, previous_is_cased;
9974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 if (PyUnicode_READY(self) == -1)
9976 return NULL;
9977 length = PyUnicode_GET_LENGTH(self);
9978 kind = PyUnicode_KIND(self);
9979 data = PyUnicode_DATA(self);
9980
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 if (length == 1) {
9983 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9984 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9985 (Py_UNICODE_ISUPPER(ch) != 0));
9986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009988 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009991
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992 cased = 0;
9993 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 for (i = 0; i < length; i++) {
9995 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009996
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
9998 if (previous_is_cased)
9999 return PyBool_FromLong(0);
10000 previous_is_cased = 1;
10001 cased = 1;
10002 }
10003 else if (Py_UNICODE_ISLOWER(ch)) {
10004 if (!previous_is_cased)
10005 return PyBool_FromLong(0);
10006 previous_is_cased = 1;
10007 cased = 1;
10008 }
10009 else
10010 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010012 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013}
10014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010015PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010018Return True if all characters in S are whitespace\n\
10019and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020
10021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010022unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 Py_ssize_t i, length;
10025 int kind;
10026 void *data;
10027
10028 if (PyUnicode_READY(self) == -1)
10029 return NULL;
10030 length = PyUnicode_GET_LENGTH(self);
10031 kind = PyUnicode_KIND(self);
10032 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (length == 1)
10036 return PyBool_FromLong(
10037 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010039 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 for (i = 0; i < length; i++) {
10044 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010045 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010048 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049}
10050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010051PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010052 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010053\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010054Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010055and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010056
10057static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010058unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 Py_ssize_t i, length;
10061 int kind;
10062 void *data;
10063
10064 if (PyUnicode_READY(self) == -1)
10065 return NULL;
10066 length = PyUnicode_GET_LENGTH(self);
10067 kind = PyUnicode_KIND(self);
10068 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010069
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010070 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 if (length == 1)
10072 return PyBool_FromLong(
10073 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010074
10075 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 for (i = 0; i < length; i++) {
10080 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010083 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010084}
10085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010086PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010089Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010090and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091
10092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010093unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 int kind;
10096 void *data;
10097 Py_ssize_t len, i;
10098
10099 if (PyUnicode_READY(self) == -1)
10100 return NULL;
10101
10102 kind = PyUnicode_KIND(self);
10103 data = PyUnicode_DATA(self);
10104 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010106 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (len == 1) {
10108 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10109 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10110 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010111
10112 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010114 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 for (i = 0; i < len; i++) {
10117 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010118 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010119 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010121 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010122}
10123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010124PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010127Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010128False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129
10130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010131unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 Py_ssize_t i, length;
10134 int kind;
10135 void *data;
10136
10137 if (PyUnicode_READY(self) == -1)
10138 return NULL;
10139 length = PyUnicode_GET_LENGTH(self);
10140 kind = PyUnicode_KIND(self);
10141 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (length == 1)
10145 return PyBool_FromLong(
10146 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 for (i = 0; i < length; i++) {
10153 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010154 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010156 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157}
10158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010162Return True if all characters in S are digits\n\
10163and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010166unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t i, length;
10169 int kind;
10170 void *data;
10171
10172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
10174 length = PyUnicode_GET_LENGTH(self);
10175 kind = PyUnicode_KIND(self);
10176 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (length == 1) {
10180 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10181 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010184 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 for (i = 0; i < length; i++) {
10189 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010190 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010192 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193}
10194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010195PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010198Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010199False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
10201static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010202unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 Py_ssize_t i, length;
10205 int kind;
10206 void *data;
10207
10208 if (PyUnicode_READY(self) == -1)
10209 return NULL;
10210 length = PyUnicode_GET_LENGTH(self);
10211 kind = PyUnicode_KIND(self);
10212 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (length == 1)
10216 return PyBool_FromLong(
10217 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 for (i = 0; i < length; i++) {
10224 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010227 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
Martin v. Löwis47383402007-08-15 07:32:56 +000010230int
10231PyUnicode_IsIdentifier(PyObject *self)
10232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 int kind;
10234 void *data;
10235 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010236 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 if (PyUnicode_READY(self) == -1) {
10239 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010240 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 }
10242
10243 /* Special case for empty strings */
10244 if (PyUnicode_GET_LENGTH(self) == 0)
10245 return 0;
10246 kind = PyUnicode_KIND(self);
10247 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010248
10249 /* PEP 3131 says that the first character must be in
10250 XID_Start and subsequent characters in XID_Continue,
10251 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010253 letters, digits, underscore). However, given the current
10254 definition of XID_Start and XID_Continue, it is sufficient
10255 to check just for these, except that _ must be allowed
10256 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010258 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010259 return 0;
10260
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010261 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010263 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010264 return 1;
10265}
10266
10267PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010269\n\
10270Return True if S is a valid identifier according\n\
10271to the language definition.");
10272
10273static PyObject*
10274unicode_isidentifier(PyObject *self)
10275{
10276 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10277}
10278
Georg Brandl559e5d72008-06-11 18:37:52 +000010279PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010281\n\
10282Return True if all characters in S are considered\n\
10283printable in repr() or S is empty, False otherwise.");
10284
10285static PyObject*
10286unicode_isprintable(PyObject *self)
10287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 Py_ssize_t i, length;
10289 int kind;
10290 void *data;
10291
10292 if (PyUnicode_READY(self) == -1)
10293 return NULL;
10294 length = PyUnicode_GET_LENGTH(self);
10295 kind = PyUnicode_KIND(self);
10296 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010297
10298 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 if (length == 1)
10300 return PyBool_FromLong(
10301 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 for (i = 0; i < length; i++) {
10304 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010305 Py_RETURN_FALSE;
10306 }
10307 }
10308 Py_RETURN_TRUE;
10309}
10310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010311PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010312 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313\n\
10314Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010315iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316
10317static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010318unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010320 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321}
10322
Martin v. Löwis18e16552006-02-15 17:27:45 +000010323static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324unicode_length(PyUnicodeObject *self)
10325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (PyUnicode_READY(self) == -1)
10327 return -1;
10328 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329}
10330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010331PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010332 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010334Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010335done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
10337static PyObject *
10338unicode_ljust(PyUnicodeObject *self, PyObject *args)
10339{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010340 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 Py_UCS4 fillchar = ' ';
10342
10343 if (PyUnicode_READY(self) == -1)
10344 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010345
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010346 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 return NULL;
10348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350 Py_INCREF(self);
10351 return (PyObject*) self;
10352 }
10353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355}
10356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010357PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010360Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361
10362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010363unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 return fixup(self, fixlower);
10366}
10367
/* Selectors for the strip family.  Each value doubles as an index into
   stripformat[] below, so the numbering must stay in step with that array. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings: one optional object argument, followed
   by ':' and the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the 3-character "|O:" prefix of the
   format string. */
#define STRIPNAME(i) (stripformat[i]+3)
10376
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010377/* externally visible for str.strip(unicode) */
10378PyObject *
10379_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 void *data;
10382 int kind;
10383 Py_ssize_t i, j, len;
10384 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10387 return NULL;
10388
10389 kind = PyUnicode_KIND(self);
10390 data = PyUnicode_DATA(self);
10391 len = PyUnicode_GET_LENGTH(self);
10392 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10393 PyUnicode_DATA(sepobj),
10394 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010395
Benjamin Peterson14339b62009-01-31 16:36:08 +000010396 i = 0;
10397 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 while (i < len &&
10399 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010400 i++;
10401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010402 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010403
Benjamin Peterson14339b62009-01-31 16:36:08 +000010404 j = len;
10405 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010406 do {
10407 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 } while (j >= i &&
10409 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010410 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010411 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412
Benjamin Peterson14339b62009-01-31 16:36:08 +000010413 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 Py_INCREF(self);
10415 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010416 }
10417 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010419}
10420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421/* Assumes an already ready self string. */
10422
10423static PyObject *
10424substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10425{
10426 const int kind = PyUnicode_KIND(self);
10427 void *data = PyUnicode_DATA(self);
10428 Py_UCS4 maxchar = 0;
10429 Py_ssize_t i;
10430 PyObject *unicode;
10431
10432 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10433 PyErr_BadInternalCall();
10434 return NULL;
10435 }
10436
10437 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10438 Py_INCREF(self);
10439 return (PyObject*)self;
10440 }
10441
10442 for (i = 0; i < len; ++i) {
10443 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10444 if (ch > maxchar)
10445 maxchar = ch;
10446 }
10447
10448 unicode = PyUnicode_New(len, maxchar);
10449 if (unicode == NULL)
10450 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010451 if (PyUnicode_CopyCharacters(unicode, 0,
10452 (PyObject*)self, start, len) < 0)
10453 {
10454 Py_DECREF(unicode);
10455 return NULL;
10456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 return unicode;
10458}
10459
10460PyObject*
10461PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10462{
10463 unsigned char *data;
10464 int kind;
10465
10466 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10467 && PyUnicode_CheckExact(self))
10468 {
10469 Py_INCREF(self);
10470 return (PyObject *)self;
10471 }
10472
10473 if ((end - start) == 1)
10474 return unicode_getitem((PyUnicodeObject*)self, start);
10475
10476 if (PyUnicode_READY(self) == -1)
10477 return NULL;
10478 kind = PyUnicode_KIND(self);
10479 data = PyUnicode_1BYTE_DATA(self);
10480 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10481 end-start);
10482}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483
10484static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010485do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 int kind;
10488 void *data;
10489 Py_ssize_t len, i, j;
10490
10491 if (PyUnicode_READY(self) == -1)
10492 return NULL;
10493
10494 kind = PyUnicode_KIND(self);
10495 data = PyUnicode_DATA(self);
10496 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010497
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498 i = 0;
10499 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010501 i++;
10502 }
10503 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010504
Benjamin Peterson14339b62009-01-31 16:36:08 +000010505 j = len;
10506 if (striptype != LEFTSTRIP) {
10507 do {
10508 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010510 j++;
10511 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010512
Benjamin Peterson14339b62009-01-31 16:36:08 +000010513 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10514 Py_INCREF(self);
10515 return (PyObject*)self;
10516 }
10517 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519}
10520
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010521
10522static PyObject *
10523do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010526
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10528 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010529
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 if (sep != NULL && sep != Py_None) {
10531 if (PyUnicode_Check(sep))
10532 return _PyUnicode_XStrip(self, striptype, sep);
10533 else {
10534 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 "%s arg must be None or str",
10536 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010537 return NULL;
10538 }
10539 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010542}
10543
10544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010545PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010547\n\
10548Return a copy of the string S with leading and trailing\n\
10549whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010550If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010551
10552static PyObject *
10553unicode_strip(PyUnicodeObject *self, PyObject *args)
10554{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555 if (PyTuple_GET_SIZE(args) == 0)
10556 return do_strip(self, BOTHSTRIP); /* Common case */
10557 else
10558 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010559}
10560
10561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010562PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010563 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564\n\
10565Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010566If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010567
10568static PyObject *
10569unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 if (PyTuple_GET_SIZE(args) == 0)
10572 return do_strip(self, LEFTSTRIP); /* Common case */
10573 else
10574 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010575}
10576
10577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010578PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010580\n\
10581Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010582If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583
10584static PyObject *
10585unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10586{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 if (PyTuple_GET_SIZE(args) == 0)
10588 return do_strip(self, RIGHTSTRIP); /* Common case */
10589 else
10590 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010591}
10592
10593
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010595unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596{
10597 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t nchars, n;
10599 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
Georg Brandl222de0f2009-04-12 12:01:50 +000010601 if (len < 1) {
10602 Py_INCREF(unicode_empty);
10603 return (PyObject *)unicode_empty;
10604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Tim Peters7a29bd52001-09-12 03:03:31 +000010606 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 /* no repeat, return original string */
10608 Py_INCREF(str);
10609 return (PyObject*) str;
10610 }
Tim Peters8f422462000-09-09 06:13:41 +000010611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (PyUnicode_READY(str) == -1)
10613 return NULL;
10614
Tim Peters8f422462000-09-09 06:13:41 +000010615 /* ensure # of chars needed doesn't overflow int and # of bytes
10616 * needed doesn't overflow size_t
10617 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 nchars = len * PyUnicode_GET_LENGTH(str);
10619 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010620 PyErr_SetString(PyExc_OverflowError,
10621 "repeated string is too long");
10622 return NULL;
10623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 char_size = PyUnicode_CHARACTER_SIZE(str);
10625 nbytes = (nchars + 1) * char_size;
10626 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010627 PyErr_SetString(PyExc_OverflowError,
10628 "repeated string is too long");
10629 return NULL;
10630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632 if (!u)
10633 return NULL;
10634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (PyUnicode_GET_LENGTH(str) == 1) {
10636 const int kind = PyUnicode_KIND(str);
10637 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10638 void *to = PyUnicode_DATA(u);
10639 for (n = 0; n < len; ++n)
10640 PyUnicode_WRITE(kind, to, n, fill_char);
10641 }
10642 else {
10643 /* number of characters copied this far */
10644 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10645 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10646 char *to = (char *) PyUnicode_DATA(u);
10647 Py_MEMCPY(to, PyUnicode_DATA(str),
10648 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 n = (done <= nchars-done) ? done : nchars-done;
10651 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 }
10655
10656 return (PyObject*) u;
10657}
10658
Alexander Belopolsky40018472011-02-26 01:02:56 +000010659PyObject *
10660PyUnicode_Replace(PyObject *obj,
10661 PyObject *subobj,
10662 PyObject *replobj,
10663 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664{
10665 PyObject *self;
10666 PyObject *str1;
10667 PyObject *str2;
10668 PyObject *result;
10669
10670 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 Py_DECREF(self);
10676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 }
10678 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010680 Py_DECREF(self);
10681 Py_DECREF(str1);
10682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685 Py_DECREF(self);
10686 Py_DECREF(str1);
10687 Py_DECREF(str2);
10688 return result;
10689}
10690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010691PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010692 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693\n\
10694Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010695old replaced by new. If the optional argument count is\n\
10696given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697
10698static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 PyObject *str1;
10702 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010703 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 PyObject *result;
10705
Martin v. Löwis18e16552006-02-15 17:27:45 +000010706 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 str1 = PyUnicode_FromObject(str1);
10711 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10712 return NULL;
10713 str2 = PyUnicode_FromObject(str2);
10714 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 Py_DECREF(str1);
10716 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
10719 result = replace(self, str1, str2, maxcount);
10720
10721 Py_DECREF(str1);
10722 Py_DECREF(str2);
10723 return result;
10724}
10725
Alexander Belopolsky40018472011-02-26 01:02:56 +000010726static PyObject *
10727unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010729 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 Py_ssize_t isize;
10731 Py_ssize_t osize, squote, dquote, i, o;
10732 Py_UCS4 max, quote;
10733 int ikind, okind;
10734 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010737 return NULL;
10738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 isize = PyUnicode_GET_LENGTH(unicode);
10740 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 /* Compute length of output, quote characters, and
10743 maximum character */
10744 osize = 2; /* quotes */
10745 max = 127;
10746 squote = dquote = 0;
10747 ikind = PyUnicode_KIND(unicode);
10748 for (i = 0; i < isize; i++) {
10749 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10750 switch (ch) {
10751 case '\'': squote++; osize++; break;
10752 case '"': dquote++; osize++; break;
10753 case '\\': case '\t': case '\r': case '\n':
10754 osize += 2; break;
10755 default:
10756 /* Fast-path ASCII */
10757 if (ch < ' ' || ch == 0x7f)
10758 osize += 4; /* \xHH */
10759 else if (ch < 0x7f)
10760 osize++;
10761 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10762 osize++;
10763 max = ch > max ? ch : max;
10764 }
10765 else if (ch < 0x100)
10766 osize += 4; /* \xHH */
10767 else if (ch < 0x10000)
10768 osize += 6; /* \uHHHH */
10769 else
10770 osize += 10; /* \uHHHHHHHH */
10771 }
10772 }
10773
10774 quote = '\'';
10775 if (squote) {
10776 if (dquote)
10777 /* Both squote and dquote present. Use squote,
10778 and escape them */
10779 osize += squote;
10780 else
10781 quote = '"';
10782 }
10783
10784 repr = PyUnicode_New(osize, max);
10785 if (repr == NULL)
10786 return NULL;
10787 okind = PyUnicode_KIND(repr);
10788 odata = PyUnicode_DATA(repr);
10789
10790 PyUnicode_WRITE(okind, odata, 0, quote);
10791 PyUnicode_WRITE(okind, odata, osize-1, quote);
10792
10793 for (i = 0, o = 1; i < isize; i++) {
10794 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010795
10796 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 if ((ch == quote) || (ch == '\\')) {
10798 PyUnicode_WRITE(okind, odata, o++, '\\');
10799 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010800 continue;
10801 }
10802
Benjamin Peterson29060642009-01-31 22:14:21 +000010803 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010804 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 PyUnicode_WRITE(okind, odata, o++, '\\');
10806 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010807 }
10808 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 PyUnicode_WRITE(okind, odata, o++, '\\');
10810 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010811 }
10812 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 PyUnicode_WRITE(okind, odata, o++, '\\');
10814 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010815 }
10816
10817 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010818 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 PyUnicode_WRITE(okind, odata, o++, '\\');
10820 PyUnicode_WRITE(okind, odata, o++, 'x');
10821 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10822 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010823 }
10824
Georg Brandl559e5d72008-06-11 18:37:52 +000010825 /* Copy ASCII characters as-is */
10826 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010828 }
10829
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010832 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010833 (categories Z* and C* except ASCII space)
10834 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010836 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 if (ch <= 0xff) {
10838 PyUnicode_WRITE(okind, odata, o++, '\\');
10839 PyUnicode_WRITE(okind, odata, o++, 'x');
10840 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10841 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010842 }
10843 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 else if (ch >= 0x10000) {
10845 PyUnicode_WRITE(okind, odata, o++, '\\');
10846 PyUnicode_WRITE(okind, odata, o++, 'U');
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10848 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10849 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10850 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10851 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10852 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10853 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10854 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010855 }
10856 /* Map 16-bit characters to '\uxxxx' */
10857 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 PyUnicode_WRITE(okind, odata, o++, '\\');
10859 PyUnicode_WRITE(okind, odata, o++, 'u');
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10862 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10863 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010864 }
10865 }
10866 /* Copy characters as-is */
10867 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010869 }
10870 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010873 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874}
10875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878\n\
10879Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010880such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881arguments start and end are interpreted as in slice notation.\n\
10882\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010883Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884
10885static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887{
Jesus Ceaac451502011-04-20 17:09:23 +020010888 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010889 Py_ssize_t start;
10890 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
Jesus Ceaac451502011-04-20 17:09:23 +020010893 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10894 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (PyUnicode_READY(self) == -1)
10898 return NULL;
10899 if (PyUnicode_READY(substring) == -1)
10900 return NULL;
10901
10902 result = any_find_slice(
10903 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10904 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906
10907 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (result == -2)
10910 return NULL;
10911
Christian Heimes217cfd12007-12-02 14:31:20 +000010912 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913}
10914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010915PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010916 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010918Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
10920static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922{
Jesus Ceaac451502011-04-20 17:09:23 +020010923 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010924 Py_ssize_t start;
10925 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Jesus Ceaac451502011-04-20 17:09:23 +020010928 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10929 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 if (PyUnicode_READY(self) == -1)
10933 return NULL;
10934 if (PyUnicode_READY(substring) == -1)
10935 return NULL;
10936
10937 result = any_find_slice(
10938 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10939 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010940 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
10942 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (result == -2)
10945 return NULL;
10946
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 if (result < 0) {
10948 PyErr_SetString(PyExc_ValueError, "substring not found");
10949 return NULL;
10950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951
Christian Heimes217cfd12007-12-02 14:31:20 +000010952 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010956 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010958Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010959done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961static PyObject *
10962unicode_rjust(PyUnicodeObject *self, PyObject *args)
10963{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010964 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 Py_UCS4 fillchar = ' ';
10966
10967 if (PyUnicode_READY(self) == -1)
10968 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010969
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010970 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 return NULL;
10972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 Py_INCREF(self);
10975 return (PyObject*) self;
10976 }
10977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979}
10980
Alexander Belopolsky40018472011-02-26 01:02:56 +000010981PyObject *
10982PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
10984 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010985
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 s = PyUnicode_FromObject(s);
10987 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010988 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 if (sep != NULL) {
10990 sep = PyUnicode_FromObject(sep);
10991 if (sep == NULL) {
10992 Py_DECREF(s);
10993 return NULL;
10994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 }
10996
10997 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
10998
10999 Py_DECREF(s);
11000 Py_XDECREF(sep);
11001 return result;
11002}
11003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011004PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006\n\
11007Return a list of the words in S, using sep as the\n\
11008delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011009splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011010whitespace string is a separator and empty strings are\n\
11011removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
11013static PyObject*
11014unicode_split(PyUnicodeObject *self, PyObject *args)
11015{
11016 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011017 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
Martin v. Löwis18e16552006-02-15 17:27:45 +000011019 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 return NULL;
11021
11022 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028}
11029
Thomas Wouters477c8d52006-05-27 19:21:47 +000011030PyObject *
11031PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11032{
11033 PyObject* str_obj;
11034 PyObject* sep_obj;
11035 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 int kind1, kind2, kind;
11037 void *buf1 = NULL, *buf2 = NULL;
11038 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039
11040 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011043 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011045 Py_DECREF(str_obj);
11046 return NULL;
11047 }
11048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 kind1 = PyUnicode_KIND(str_in);
11050 kind2 = PyUnicode_KIND(sep_obj);
11051 kind = kind1 > kind2 ? kind1 : kind2;
11052 buf1 = PyUnicode_DATA(str_in);
11053 if (kind1 != kind)
11054 buf1 = _PyUnicode_AsKind(str_in, kind);
11055 if (!buf1)
11056 goto onError;
11057 buf2 = PyUnicode_DATA(sep_obj);
11058 if (kind2 != kind)
11059 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11060 if (!buf2)
11061 goto onError;
11062 len1 = PyUnicode_GET_LENGTH(str_obj);
11063 len2 = PyUnicode_GET_LENGTH(sep_obj);
11064
11065 switch(PyUnicode_KIND(str_in)) {
11066 case PyUnicode_1BYTE_KIND:
11067 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11068 break;
11069 case PyUnicode_2BYTE_KIND:
11070 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11071 break;
11072 case PyUnicode_4BYTE_KIND:
11073 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11074 break;
11075 default:
11076 assert(0);
11077 out = 0;
11078 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011079
11080 Py_DECREF(sep_obj);
11081 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (kind1 != kind)
11083 PyMem_Free(buf1);
11084 if (kind2 != kind)
11085 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011086
11087 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 onError:
11089 Py_DECREF(sep_obj);
11090 Py_DECREF(str_obj);
11091 if (kind1 != kind && buf1)
11092 PyMem_Free(buf1);
11093 if (kind2 != kind && buf2)
11094 PyMem_Free(buf2);
11095 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011096}
11097
11098
11099PyObject *
11100PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11101{
11102 PyObject* str_obj;
11103 PyObject* sep_obj;
11104 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 int kind1, kind2, kind;
11106 void *buf1 = NULL, *buf2 = NULL;
11107 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011108
11109 str_obj = PyUnicode_FromObject(str_in);
11110 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011111 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011112 sep_obj = PyUnicode_FromObject(sep_in);
11113 if (!sep_obj) {
11114 Py_DECREF(str_obj);
11115 return NULL;
11116 }
11117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 kind1 = PyUnicode_KIND(str_in);
11119 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011120 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 buf1 = PyUnicode_DATA(str_in);
11122 if (kind1 != kind)
11123 buf1 = _PyUnicode_AsKind(str_in, kind);
11124 if (!buf1)
11125 goto onError;
11126 buf2 = PyUnicode_DATA(sep_obj);
11127 if (kind2 != kind)
11128 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11129 if (!buf2)
11130 goto onError;
11131 len1 = PyUnicode_GET_LENGTH(str_obj);
11132 len2 = PyUnicode_GET_LENGTH(sep_obj);
11133
11134 switch(PyUnicode_KIND(str_in)) {
11135 case PyUnicode_1BYTE_KIND:
11136 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11137 break;
11138 case PyUnicode_2BYTE_KIND:
11139 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11140 break;
11141 case PyUnicode_4BYTE_KIND:
11142 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11143 break;
11144 default:
11145 assert(0);
11146 out = 0;
11147 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011148
11149 Py_DECREF(sep_obj);
11150 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (kind1 != kind)
11152 PyMem_Free(buf1);
11153 if (kind2 != kind)
11154 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155
11156 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 onError:
11158 Py_DECREF(sep_obj);
11159 Py_DECREF(str_obj);
11160 if (kind1 != kind && buf1)
11161 PyMem_Free(buf1);
11162 if (kind2 != kind && buf2)
11163 PyMem_Free(buf2);
11164 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011165}
11166
11167PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011168 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011170Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011172found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011173
11174static PyObject*
11175unicode_partition(PyUnicodeObject *self, PyObject *separator)
11176{
11177 return PyUnicode_Partition((PyObject *)self, separator);
11178}
11179
11180PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011181 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011183Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011185separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186
11187static PyObject*
11188unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11189{
11190 return PyUnicode_RPartition((PyObject *)self, separator);
11191}
11192
Alexander Belopolsky40018472011-02-26 01:02:56 +000011193PyObject *
11194PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011195{
11196 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011197
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011198 s = PyUnicode_FromObject(s);
11199 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011200 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 if (sep != NULL) {
11202 sep = PyUnicode_FromObject(sep);
11203 if (sep == NULL) {
11204 Py_DECREF(s);
11205 return NULL;
11206 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011207 }
11208
11209 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11210
11211 Py_DECREF(s);
11212 Py_XDECREF(sep);
11213 return result;
11214}
11215
11216PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011218\n\
11219Return a list of the words in S, using sep as the\n\
11220delimiter string, starting at the end of the string and\n\
11221working to the front. If maxsplit is given, at most maxsplit\n\
11222splits are done. If sep is not specified, any whitespace string\n\
11223is a separator.");
11224
11225static PyObject*
11226unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11227{
11228 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011229 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011230
Martin v. Löwis18e16552006-02-15 17:27:45 +000011231 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011232 return NULL;
11233
11234 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011236 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011238 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011240}
11241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244\n\
11245Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011246Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011250unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011252 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011253 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011255 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11256 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 return NULL;
11258
Guido van Rossum86662912000-04-11 15:38:46 +000011259 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260}
11261
11262static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011263PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Walter Dörwald346737f2007-05-31 10:44:43 +000011265 if (PyUnicode_CheckExact(self)) {
11266 Py_INCREF(self);
11267 return self;
11268 } else
11269 /* Subtype -- return genuine unicode string with the same value. */
11270 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11271 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272}
11273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276\n\
11277Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011281unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 return fixup(self, fixswapcase);
11284}
11285
Georg Brandlceee0772007-11-27 23:48:05 +000011286PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011288\n\
11289Return a translation table usable for str.translate().\n\
11290If there is only one argument, it must be a dictionary mapping Unicode\n\
11291ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011292Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011293If there are two arguments, they must be strings of equal length, and\n\
11294in the resulting dictionary, each character in x will be mapped to the\n\
11295character at the same position in y. If there is a third argument, it\n\
11296must be a string, whose characters will be mapped to None in the result.");
11297
11298static PyObject*
11299unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11300{
11301 PyObject *x, *y = NULL, *z = NULL;
11302 PyObject *new = NULL, *key, *value;
11303 Py_ssize_t i = 0;
11304 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011305
Georg Brandlceee0772007-11-27 23:48:05 +000011306 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11307 return NULL;
11308 new = PyDict_New();
11309 if (!new)
11310 return NULL;
11311 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 int x_kind, y_kind, z_kind;
11313 void *x_data, *y_data, *z_data;
11314
Georg Brandlceee0772007-11-27 23:48:05 +000011315 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011316 if (!PyUnicode_Check(x)) {
11317 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11318 "be a string if there is a second argument");
11319 goto err;
11320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011322 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11323 "arguments must have equal length");
11324 goto err;
11325 }
11326 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 x_kind = PyUnicode_KIND(x);
11328 y_kind = PyUnicode_KIND(y);
11329 x_data = PyUnicode_DATA(x);
11330 y_data = PyUnicode_DATA(y);
11331 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11332 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11333 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011334 if (!key || !value)
11335 goto err;
11336 res = PyDict_SetItem(new, key, value);
11337 Py_DECREF(key);
11338 Py_DECREF(value);
11339 if (res < 0)
11340 goto err;
11341 }
11342 /* create entries for deleting chars in z */
11343 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 z_kind = PyUnicode_KIND(z);
11345 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011346 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011348 if (!key)
11349 goto err;
11350 res = PyDict_SetItem(new, key, Py_None);
11351 Py_DECREF(key);
11352 if (res < 0)
11353 goto err;
11354 }
11355 }
11356 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 int kind;
11358 void *data;
11359
Georg Brandlceee0772007-11-27 23:48:05 +000011360 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011361 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011362 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11363 "to maketrans it must be a dict");
11364 goto err;
11365 }
11366 /* copy entries into the new dict, converting string keys to int keys */
11367 while (PyDict_Next(x, &i, &key, &value)) {
11368 if (PyUnicode_Check(key)) {
11369 /* convert string keys to integer keys */
11370 PyObject *newkey;
11371 if (PyUnicode_GET_SIZE(key) != 1) {
11372 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11373 "table must be of length 1");
11374 goto err;
11375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 kind = PyUnicode_KIND(key);
11377 data = PyUnicode_DATA(key);
11378 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011379 if (!newkey)
11380 goto err;
11381 res = PyDict_SetItem(new, newkey, value);
11382 Py_DECREF(newkey);
11383 if (res < 0)
11384 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011385 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011386 /* just keep integer keys */
11387 if (PyDict_SetItem(new, key, value) < 0)
11388 goto err;
11389 } else {
11390 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11391 "be strings or integers");
11392 goto err;
11393 }
11394 }
11395 }
11396 return new;
11397 err:
11398 Py_DECREF(new);
11399 return NULL;
11400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
11405Return a copy of the string S, where all characters have been mapped\n\
11406through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011407Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011408Unmapped characters are left untouched. Characters mapped to None\n\
11409are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
11411static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415}
11416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
11422static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011423unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425 return fixup(self, fixupper);
11426}
11427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011428PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011431Pad a numeric string S with zeros on the left, to fill a field\n\
11432of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
11434static PyObject *
11435unicode_zfill(PyUnicodeObject *self, PyObject *args)
11436{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011437 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011439 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 int kind;
11441 void *data;
11442 Py_UCS4 chr;
11443
11444 if (PyUnicode_READY(self) == -1)
11445 return NULL;
11446
Martin v. Löwis18e16552006-02-15 17:27:45 +000011447 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 return NULL;
11449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011451 if (PyUnicode_CheckExact(self)) {
11452 Py_INCREF(self);
11453 return (PyObject*) self;
11454 }
11455 else
11456 return PyUnicode_FromUnicode(
11457 PyUnicode_AS_UNICODE(self),
11458 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 }
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464 u = pad(self, fill, 0, '0');
11465
Walter Dörwald068325e2002-04-15 13:36:47 +000011466 if (u == NULL)
11467 return NULL;
11468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 kind = PyUnicode_KIND(u);
11470 data = PyUnicode_DATA(u);
11471 chr = PyUnicode_READ(kind, data, fill);
11472
11473 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 PyUnicode_WRITE(kind, data, 0, chr);
11476 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 }
11478
11479 return (PyObject*) u;
11480}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
#if 0
/* Compiled out.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011493Return True if S starts with the specified prefix, False otherwise.\n\
11494With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011495With optional end, stop comparing S at that position.\n\
11496prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498static PyObject *
11499unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011502 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011504 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011505 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011506 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Jesus Ceaac451502011-04-20 17:09:23 +020011508 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011510 if (PyTuple_Check(subobj)) {
11511 Py_ssize_t i;
11512 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11513 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 if (substring == NULL)
11516 return NULL;
11517 result = tailmatch(self, substring, start, end, -1);
11518 Py_DECREF(substring);
11519 if (result) {
11520 Py_RETURN_TRUE;
11521 }
11522 }
11523 /* nothing matched */
11524 Py_RETURN_FALSE;
11525 }
11526 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011527 if (substring == NULL) {
11528 if (PyErr_ExceptionMatches(PyExc_TypeError))
11529 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11530 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011533 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011535 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536}
11537
11538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011539PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011542Return True if S ends with the specified suffix, False otherwise.\n\
11543With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011544With optional end, stop comparing S at that position.\n\
11545suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
11547static PyObject *
11548unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011551 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011553 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011554 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011555 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
Jesus Ceaac451502011-04-20 17:09:23 +020011557 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011559 if (PyTuple_Check(subobj)) {
11560 Py_ssize_t i;
11561 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11562 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011564 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011566 result = tailmatch(self, substring, start, end, +1);
11567 Py_DECREF(substring);
11568 if (result) {
11569 Py_RETURN_TRUE;
11570 }
11571 }
11572 Py_RETURN_FALSE;
11573 }
11574 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011575 if (substring == NULL) {
11576 if (PyErr_ExceptionMatches(PyExc_TypeError))
11577 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11578 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011580 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011581 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011583 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584}
11585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011587
11588PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011590\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011591Return a formatted version of S, using substitutions from args and kwargs.\n\
11592The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011593
Eric Smith27bbca62010-11-04 17:06:58 +000011594PyDoc_STRVAR(format_map__doc__,
11595 "S.format_map(mapping) -> str\n\
11596\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011597Return a formatted version of S, using substitutions from mapping.\n\
11598The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011599
Eric Smith4a7d76d2008-05-30 18:10:19 +000011600static PyObject *
11601unicode__format__(PyObject* self, PyObject* args)
11602{
11603 PyObject *format_spec;
11604
11605 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11606 return NULL;
11607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11609 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011610}
11611
Eric Smith8c663262007-08-25 02:26:07 +000011612PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011614\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011615Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011616
11617static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011618unicode__sizeof__(PyUnicodeObject *v)
11619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 Py_ssize_t size;
11621
11622 /* If it's a compact object, account for base structure +
11623 character data. */
11624 if (PyUnicode_IS_COMPACT_ASCII(v))
11625 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11626 else if (PyUnicode_IS_COMPACT(v))
11627 size = sizeof(PyCompactUnicodeObject) +
11628 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11629 else {
11630 /* If it is a two-block object, account for base object, and
11631 for character block if present. */
11632 size = sizeof(PyUnicodeObject);
11633 if (v->data.any)
11634 size += (PyUnicode_GET_LENGTH(v) + 1) *
11635 PyUnicode_CHARACTER_SIZE(v);
11636 }
11637 /* If the wstr pointer is present, account for it unless it is shared
11638 with the data pointer. Since PyUnicode_DATA will crash if the object
11639 is not ready, check whether it's either not ready (in which case the
11640 data is entirely in wstr) or if the data is not shared. */
11641 if (_PyUnicode_WSTR(v) &&
11642 (!PyUnicode_IS_READY(v) ||
11643 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11644 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11645 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11646 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11647
11648 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011649}
11650
11651PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011653
11654static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011655unicode_getnewargs(PyUnicodeObject *v)
11656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 PyObject *copy;
11658 unsigned char *data;
11659 int kind;
11660 if (PyUnicode_READY(v) == -1)
11661 return NULL;
11662 kind = PyUnicode_KIND(v);
11663 data = PyUnicode_1BYTE_DATA(v);
11664 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11665 if (!copy)
11666 return NULL;
11667 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011668}
11669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670static PyMethodDef unicode_methods[] = {
11671
11672 /* Order is according to common usage: often used methods should
11673 appear first, since lookup is done sequentially. */
11674
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011675 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11677 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011678 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011679 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11680 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11681 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11682 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11683 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11684 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11685 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011686 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011687 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11688 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11689 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011691 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11692 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11693 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011695 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011696 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011698 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11699 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11700 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11701 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11702 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11703 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11704 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11705 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11706 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11707 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11708 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11709 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11710 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11711 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011712 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011713 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011714 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011715 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011716 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011717 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011718 {"maketrans", (PyCFunction) unicode_maketrans,
11719 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011720 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011721#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011722 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723#endif
11724
11725#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011726 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011727 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728#endif
11729
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 {NULL, NULL}
11732};
11733
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011734static PyObject *
11735unicode_mod(PyObject *v, PyObject *w)
11736{
Brian Curtindfc80e32011-08-10 20:28:54 -050011737 if (!PyUnicode_Check(v))
11738 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011740}
11741
11742static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 0, /*nb_add*/
11744 0, /*nb_subtract*/
11745 0, /*nb_multiply*/
11746 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011747};
11748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011750 (lenfunc) unicode_length, /* sq_length */
11751 PyUnicode_Concat, /* sq_concat */
11752 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11753 (ssizeargfunc) unicode_getitem, /* sq_item */
11754 0, /* sq_slice */
11755 0, /* sq_ass_item */
11756 0, /* sq_ass_slice */
11757 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758};
11759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011760static PyObject*
11761unicode_subscript(PyUnicodeObject* self, PyObject* item)
11762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011766 if (PyIndex_Check(item)) {
11767 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011768 if (i == -1 && PyErr_Occurred())
11769 return NULL;
11770 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011772 return unicode_getitem(self, i);
11773 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011776 Py_UNICODE* result_buf;
11777 PyObject* result;
11778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011781 return NULL;
11782 }
11783
11784 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return PyUnicode_New(0, 0);
11786 } else if (start == 0 && step == 1 &&
11787 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011788 PyUnicode_CheckExact(self)) {
11789 Py_INCREF(self);
11790 return (PyObject *)self;
11791 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793 } else {
11794 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011795 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11796 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011797
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 if (result_buf == NULL)
11799 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011800
11801 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11802 result_buf[i] = source_buf[cur];
11803 }
Tim Petersced69f82003-09-16 20:30:58 +000011804
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011805 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011806 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011807 return result;
11808 }
11809 } else {
11810 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11811 return NULL;
11812 }
11813}
11814
11815static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 (lenfunc)unicode_length, /* mp_length */
11817 (binaryfunc)unicode_subscript, /* mp_subscript */
11818 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011819};
11820
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822/* Helpers for PyUnicode_Format() */
11823
11824static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011825getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011827 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 (*p_argidx)++;
11830 if (arglen < 0)
11831 return args;
11832 else
11833 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 }
11835 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 return NULL;
11838}
11839
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011840/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842static PyObject *
11843formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011845 char *p;
11846 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011848
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 x = PyFloat_AsDouble(v);
11850 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011851 return NULL;
11852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011855
Eric Smith0923d1d2009-04-16 20:16:10 +000011856 p = PyOS_double_to_string(x, type, prec,
11857 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011858 if (p == NULL)
11859 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011861 PyMem_Free(p);
11862 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863}
11864
Tim Peters38fd5b62000-09-21 05:43:11 +000011865static PyObject*
11866formatlong(PyObject *val, int flags, int prec, int type)
11867{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011868 char *buf;
11869 int len;
11870 PyObject *str; /* temporary string object. */
11871 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011872
Benjamin Peterson14339b62009-01-31 16:36:08 +000011873 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11874 if (!str)
11875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011877 Py_DECREF(str);
11878 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011879}
11880
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011883 size_t buflen,
11884 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011886 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011887 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (PyUnicode_GET_LENGTH(v) == 1) {
11889 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 buf[1] = '\0';
11891 return 1;
11892 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 goto onError;
11894 }
11895 else {
11896 /* Integer input truncated to a character */
11897 long x;
11898 x = PyLong_AsLong(v);
11899 if (x == -1 && PyErr_Occurred())
11900 goto onError;
11901
11902 if (x < 0 || x > 0x10ffff) {
11903 PyErr_SetString(PyExc_OverflowError,
11904 "%c arg not in range(0x110000)");
11905 return -1;
11906 }
11907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011909 buf[1] = '\0';
11910 return 1;
11911 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011912
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011914 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011916 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used in an expression.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011923
Alexander Belopolsky40018472011-02-26 01:02:56 +000011924PyObject *
11925PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 void *fmt;
11928 int fmtkind;
11929 PyObject *result;
11930 Py_UCS4 *res, *res0;
11931 Py_UCS4 max;
11932 int kind;
11933 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011937
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 PyErr_BadInternalCall();
11940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11943 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 fmt = PyUnicode_DATA(uformat);
11946 fmtkind = PyUnicode_KIND(uformat);
11947 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11948 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
11950 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11952 if (res0 == NULL) {
11953 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
11957 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 arglen = PyTuple_Size(args);
11959 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 }
11961 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 arglen = -1;
11963 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011965 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011966 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
11969 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 if (--rescnt < 0) {
11972 rescnt = fmtcnt + 100;
11973 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11975 if (res0 == NULL){
11976 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 }
11979 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 }
11984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 /* Got a format specifier */
11986 int flags = 0;
11987 Py_ssize_t width = -1;
11988 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 Py_UCS4 c = '\0';
11990 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 int isnumok;
11992 PyObject *v = NULL;
11993 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 void *pbuf;
11995 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 Py_ssize_t len, len1;
11998 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 fmtpos++;
12001 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12002 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 Py_ssize_t keylen;
12004 PyObject *key;
12005 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012006
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 if (dict == NULL) {
12008 PyErr_SetString(PyExc_TypeError,
12009 "format requires a mapping");
12010 goto onError;
12011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 /* Skip over balanced parentheses */
12016 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 if (fmtcnt < 0 || pcount > 0) {
12025 PyErr_SetString(PyExc_ValueError,
12026 "incomplete format key");
12027 goto onError;
12028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 if (key == NULL)
12031 goto onError;
12032 if (args_owned) {
12033 Py_DECREF(args);
12034 args_owned = 0;
12035 }
12036 args = PyObject_GetItem(dict, key);
12037 Py_DECREF(key);
12038 if (args == NULL) {
12039 goto onError;
12040 }
12041 args_owned = 1;
12042 arglen = -1;
12043 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 case '-': flags |= F_LJUST; continue;
12048 case '+': flags |= F_SIGN; continue;
12049 case ' ': flags |= F_BLANK; continue;
12050 case '#': flags |= F_ALT; continue;
12051 case '0': flags |= F_ZERO; continue;
12052 }
12053 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012054 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 if (c == '*') {
12056 v = getnextarg(args, arglen, &argidx);
12057 if (v == NULL)
12058 goto onError;
12059 if (!PyLong_Check(v)) {
12060 PyErr_SetString(PyExc_TypeError,
12061 "* wants int");
12062 goto onError;
12063 }
12064 width = PyLong_AsLong(v);
12065 if (width == -1 && PyErr_Occurred())
12066 goto onError;
12067 if (width < 0) {
12068 flags |= F_LJUST;
12069 width = -width;
12070 }
12071 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 }
12074 else if (c >= '0' && c <= '9') {
12075 width = c - '0';
12076 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 if (c < '0' || c > '9')
12079 break;
12080 if ((width*10) / 10 != width) {
12081 PyErr_SetString(PyExc_ValueError,
12082 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 }
12085 width = width*10 + (c - '0');
12086 }
12087 }
12088 if (c == '.') {
12089 prec = 0;
12090 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 if (c == '*') {
12093 v = getnextarg(args, arglen, &argidx);
12094 if (v == NULL)
12095 goto onError;
12096 if (!PyLong_Check(v)) {
12097 PyErr_SetString(PyExc_TypeError,
12098 "* wants int");
12099 goto onError;
12100 }
12101 prec = PyLong_AsLong(v);
12102 if (prec == -1 && PyErr_Occurred())
12103 goto onError;
12104 if (prec < 0)
12105 prec = 0;
12106 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 }
12109 else if (c >= '0' && c <= '9') {
12110 prec = c - '0';
12111 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 if (c < '0' || c > '9')
12114 break;
12115 if ((prec*10) / 10 != prec) {
12116 PyErr_SetString(PyExc_ValueError,
12117 "prec too big");
12118 goto onError;
12119 }
12120 prec = prec*10 + (c - '0');
12121 }
12122 }
12123 } /* prec */
12124 if (fmtcnt >= 0) {
12125 if (c == 'h' || c == 'l' || c == 'L') {
12126 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 }
12129 }
12130 if (fmtcnt < 0) {
12131 PyErr_SetString(PyExc_ValueError,
12132 "incomplete format");
12133 goto onError;
12134 }
12135 if (c != '%') {
12136 v = getnextarg(args, arglen, &argidx);
12137 if (v == NULL)
12138 goto onError;
12139 }
12140 sign = 0;
12141 fill = ' ';
12142 switch (c) {
12143
12144 case '%':
12145 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 len = 1;
12150 break;
12151
12152 case 's':
12153 case 'r':
12154 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012155 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 temp = v;
12157 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012158 }
12159 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 if (c == 's')
12161 temp = PyObject_Str(v);
12162 else if (c == 'r')
12163 temp = PyObject_Repr(v);
12164 else
12165 temp = PyObject_ASCII(v);
12166 if (temp == NULL)
12167 goto onError;
12168 if (PyUnicode_Check(temp))
12169 /* nothing to do */;
12170 else {
12171 Py_DECREF(temp);
12172 PyErr_SetString(PyExc_TypeError,
12173 "%s argument has non-string str()");
12174 goto onError;
12175 }
12176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (PyUnicode_READY(temp) == -1) {
12178 Py_CLEAR(temp);
12179 goto onError;
12180 }
12181 pbuf = PyUnicode_DATA(temp);
12182 kind = PyUnicode_KIND(temp);
12183 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 if (prec >= 0 && len > prec)
12185 len = prec;
12186 break;
12187
12188 case 'i':
12189 case 'd':
12190 case 'u':
12191 case 'o':
12192 case 'x':
12193 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 isnumok = 0;
12195 if (PyNumber_Check(v)) {
12196 PyObject *iobj=NULL;
12197
12198 if (PyLong_Check(v)) {
12199 iobj = v;
12200 Py_INCREF(iobj);
12201 }
12202 else {
12203 iobj = PyNumber_Long(v);
12204 }
12205 if (iobj!=NULL) {
12206 if (PyLong_Check(iobj)) {
12207 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012208 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 Py_DECREF(iobj);
12210 if (!temp)
12211 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 if (PyUnicode_READY(temp) == -1) {
12213 Py_CLEAR(temp);
12214 goto onError;
12215 }
12216 pbuf = PyUnicode_DATA(temp);
12217 kind = PyUnicode_KIND(temp);
12218 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 sign = 1;
12220 }
12221 else {
12222 Py_DECREF(iobj);
12223 }
12224 }
12225 }
12226 if (!isnumok) {
12227 PyErr_Format(PyExc_TypeError,
12228 "%%%c format: a number is required, "
12229 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12230 goto onError;
12231 }
12232 if (flags & F_ZERO)
12233 fill = '0';
12234 break;
12235
12236 case 'e':
12237 case 'E':
12238 case 'f':
12239 case 'F':
12240 case 'g':
12241 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012242 temp = formatfloat(v, flags, prec, c);
12243 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (PyUnicode_READY(temp) == -1) {
12246 Py_CLEAR(temp);
12247 goto onError;
12248 }
12249 pbuf = PyUnicode_DATA(temp);
12250 kind = PyUnicode_KIND(temp);
12251 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 sign = 1;
12253 if (flags & F_ZERO)
12254 fill = '0';
12255 break;
12256
12257 case 'c':
12258 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012260 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 if (len < 0)
12262 goto onError;
12263 break;
12264
12265 default:
12266 PyErr_Format(PyExc_ValueError,
12267 "unsupported format character '%c' (0x%x) "
12268 "at index %zd",
12269 (31<=c && c<=126) ? (char)c : '?',
12270 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 goto onError;
12273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 /* pbuf is initialized here. */
12275 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12278 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12279 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 len--;
12281 }
12282 else if (flags & F_SIGN)
12283 sign = '+';
12284 else if (flags & F_BLANK)
12285 sign = ' ';
12286 else
12287 sign = 0;
12288 }
12289 if (width < len)
12290 width = len;
12291 if (rescnt - (sign != 0) < width) {
12292 reslen -= rescnt;
12293 rescnt = width + fmtcnt + 100;
12294 reslen += rescnt;
12295 if (reslen < 0) {
12296 Py_XDECREF(temp);
12297 PyErr_NoMemory();
12298 goto onError;
12299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12301 if (res0 == 0) {
12302 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 Py_XDECREF(temp);
12304 goto onError;
12305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 }
12308 if (sign) {
12309 if (fill != ' ')
12310 *res++ = sign;
12311 rescnt--;
12312 if (width > len)
12313 width--;
12314 }
12315 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12317 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12320 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 }
12322 rescnt -= 2;
12323 width -= 2;
12324 if (width < 0)
12325 width = 0;
12326 len -= 2;
12327 }
12328 if (width > len && !(flags & F_LJUST)) {
12329 do {
12330 --rescnt;
12331 *res++ = fill;
12332 } while (--width > len);
12333 }
12334 if (fill == ' ') {
12335 if (sign)
12336 *res++ = sign;
12337 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12339 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12340 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12341 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012342 }
12343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 /* Copy all characters, preserving len */
12345 len1 = len;
12346 while (len1--) {
12347 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12348 rescnt--;
12349 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 while (--width >= len) {
12351 --rescnt;
12352 *res++ = ' ';
12353 }
12354 if (dict && (argidx < arglen) && c != '%') {
12355 PyErr_SetString(PyExc_TypeError,
12356 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012357 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 goto onError;
12359 }
12360 Py_XDECREF(temp);
12361 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 } /* until end */
12363 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 PyErr_SetString(PyExc_TypeError,
12365 "not all arguments converted during string formatting");
12366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 }
12368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369
12370 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12371 if (*res > max)
12372 max = *res;
12373 result = PyUnicode_New(reslen - rescnt, max);
12374 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 kind = PyUnicode_KIND(result);
12377 for (res = res0; res < res0+reslen-rescnt; res++)
12378 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12379 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 }
12383 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 return (PyObject *)result;
12385
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 Py_DECREF(uformat);
12389 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
12392 return NULL;
12393}
12394
Jeremy Hylton938ace62002-07-17 16:30:39 +000012395static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012396unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12397
Tim Peters6d6c1a32001-08-02 04:15:00 +000012398static PyObject *
12399unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12400{
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 static char *kwlist[] = {"object", "encoding", "errors", 0};
12403 char *encoding = NULL;
12404 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012405
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 if (type != &PyUnicode_Type)
12407 return unicode_subtype_new(type, args, kwds);
12408 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012410 return NULL;
12411 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 if (encoding == NULL && errors == NULL)
12414 return PyObject_Str(x);
12415 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012417}
12418
Guido van Rossume023fe02001-08-30 03:12:59 +000012419static PyObject *
12420unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12421{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 PyUnicodeObject *tmp, *pnew;
12423 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012425
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12427 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12428 if (tmp == NULL)
12429 return NULL;
12430 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12432 // it seems kind of strange that tp_alloc gets passed the size
12433 // of the unicode string because there will follow another
12434 // malloc.
12435 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12436 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 if (pnew == NULL) {
12438 Py_DECREF(tmp);
12439 return NULL;
12440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12442 if (_PyUnicode_WSTR(pnew) == NULL) {
12443 err = PyErr_NoMemory();
12444 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12447 _PyUnicode_WSTR_LENGTH(pnew) = n;
12448 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12449 _PyUnicode_STATE(pnew).interned = 0;
12450 _PyUnicode_STATE(pnew).kind = 0;
12451 _PyUnicode_STATE(pnew).compact = 0;
12452 _PyUnicode_STATE(pnew).ready = 0;
12453 _PyUnicode_STATE(pnew).ascii = 0;
12454 pnew->data.any = NULL;
12455 _PyUnicode_LENGTH(pnew) = 0;
12456 pnew->_base.utf8 = NULL;
12457 pnew->_base.utf8_length = 0;
12458
12459 if (PyUnicode_READY(pnew) == -1) {
12460 PyObject_FREE(_PyUnicode_WSTR(pnew));
12461 goto onError;
12462 }
12463
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 Py_DECREF(tmp);
12465 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466
12467 onError:
12468 _Py_ForgetReference((PyObject *)pnew);
12469 PyObject_Del(pnew);
12470 Py_DECREF(tmp);
12471 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012472}
12473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012474PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012476\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012477Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012478encoding defaults to the current default string encoding.\n\
12479errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012480
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012481static PyObject *unicode_iter(PyObject *seq);
12482
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012484 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012485 "str", /* tp_name */
12486 sizeof(PyUnicodeObject), /* tp_size */
12487 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 (destructor)unicode_dealloc, /* tp_dealloc */
12490 0, /* tp_print */
12491 0, /* tp_getattr */
12492 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012493 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012494 unicode_repr, /* tp_repr */
12495 &unicode_as_number, /* tp_as_number */
12496 &unicode_as_sequence, /* tp_as_sequence */
12497 &unicode_as_mapping, /* tp_as_mapping */
12498 (hashfunc) unicode_hash, /* tp_hash*/
12499 0, /* tp_call*/
12500 (reprfunc) unicode_str, /* tp_str */
12501 PyObject_GenericGetAttr, /* tp_getattro */
12502 0, /* tp_setattro */
12503 0, /* tp_as_buffer */
12504 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012506 unicode_doc, /* tp_doc */
12507 0, /* tp_traverse */
12508 0, /* tp_clear */
12509 PyUnicode_RichCompare, /* tp_richcompare */
12510 0, /* tp_weaklistoffset */
12511 unicode_iter, /* tp_iter */
12512 0, /* tp_iternext */
12513 unicode_methods, /* tp_methods */
12514 0, /* tp_members */
12515 0, /* tp_getset */
12516 &PyBaseObject_Type, /* tp_base */
12517 0, /* tp_dict */
12518 0, /* tp_descr_get */
12519 0, /* tp_descr_set */
12520 0, /* tp_dictoffset */
12521 0, /* tp_init */
12522 0, /* tp_alloc */
12523 unicode_new, /* tp_new */
12524 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525};
12526
12527/* Initialize the Unicode implementation */
12528
Thomas Wouters78890102000-07-22 19:25:51 +000012529void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012531 int i;
12532
Thomas Wouters477c8d52006-05-27 19:21:47 +000012533 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012535 0x000A, /* LINE FEED */
12536 0x000D, /* CARRIAGE RETURN */
12537 0x001C, /* FILE SEPARATOR */
12538 0x001D, /* GROUP SEPARATOR */
12539 0x001E, /* RECORD SEPARATOR */
12540 0x0085, /* NEXT LINE */
12541 0x2028, /* LINE SEPARATOR */
12542 0x2029, /* PARAGRAPH SEPARATOR */
12543 };
12544
Fred Drakee4315f52000-05-09 19:53:39 +000012545 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012547 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012549
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012550 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012552 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012554
12555 /* initialize the linebreak bloom filter */
12556 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012558 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012559
12560 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
12562
12563/* Finalize the Unicode implementation */
12564
/* Report how many free-list entries were released.  This implementation
   releases nothing and therefore always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12570
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571void
Thomas Wouters78890102000-07-22 19:25:51 +000012572_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012574 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012576 Py_XDECREF(unicode_empty);
12577 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012578
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012579 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 if (unicode_latin1[i]) {
12581 Py_DECREF(unicode_latin1[i]);
12582 unicode_latin1[i] = NULL;
12583 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012584 }
Christian Heimesa156e092008-02-16 07:38:31 +000012585 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012587
Walter Dörwald16807132007-05-25 13:52:07 +000012588void
12589PyUnicode_InternInPlace(PyObject **p)
12590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12592 PyObject *t;
12593 if (s == NULL || !PyUnicode_Check(s))
12594 Py_FatalError(
12595 "PyUnicode_InternInPlace: unicode strings only please!");
12596 /* If it's a subclass, we don't really know what putting
12597 it in the interned dict might do. */
12598 if (!PyUnicode_CheckExact(s))
12599 return;
12600 if (PyUnicode_CHECK_INTERNED(s))
12601 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 if (PyUnicode_READY(s) == -1) {
12603 assert(0 && "ready fail in intern...");
12604 return;
12605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606 if (interned == NULL) {
12607 interned = PyDict_New();
12608 if (interned == NULL) {
12609 PyErr_Clear(); /* Don't leave an exception */
12610 return;
12611 }
12612 }
12613 /* It might be that the GetItem call fails even
12614 though the key is present in the dictionary,
12615 namely when this happens during a stack overflow. */
12616 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012618 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012619
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 if (t) {
12621 Py_INCREF(t);
12622 Py_DECREF(*p);
12623 *p = t;
12624 return;
12625 }
Walter Dörwald16807132007-05-25 13:52:07 +000012626
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 PyThreadState_GET()->recursion_critical = 1;
12628 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12629 PyErr_Clear();
12630 PyThreadState_GET()->recursion_critical = 0;
12631 return;
12632 }
12633 PyThreadState_GET()->recursion_critical = 0;
12634 /* The two references in interned are not counted by refcnt.
12635 The deallocator will take care of this */
12636 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012638}
12639
12640void
12641PyUnicode_InternImmortal(PyObject **p)
12642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12644
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 PyUnicode_InternInPlace(p);
12646 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012648 Py_INCREF(*p);
12649 }
Walter Dörwald16807132007-05-25 13:52:07 +000012650}
12651
12652PyObject *
12653PyUnicode_InternFromString(const char *cp)
12654{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 PyObject *s = PyUnicode_FromString(cp);
12656 if (s == NULL)
12657 return NULL;
12658 PyUnicode_InternInPlace(&s);
12659 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012660}
12661
Alexander Belopolsky40018472011-02-26 01:02:56 +000012662void
12663_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012664{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012665 PyObject *keys;
12666 PyUnicodeObject *s;
12667 Py_ssize_t i, n;
12668 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012669
Benjamin Peterson14339b62009-01-31 16:36:08 +000012670 if (interned == NULL || !PyDict_Check(interned))
12671 return;
12672 keys = PyDict_Keys(interned);
12673 if (keys == NULL || !PyList_Check(keys)) {
12674 PyErr_Clear();
12675 return;
12676 }
Walter Dörwald16807132007-05-25 13:52:07 +000012677
Benjamin Peterson14339b62009-01-31 16:36:08 +000012678 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12679 detector, interned unicode strings are not forcibly deallocated;
12680 rather, we give them their stolen references back, and then clear
12681 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012682
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 n = PyList_GET_SIZE(keys);
12684 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 for (i = 0; i < n; i++) {
12687 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (PyUnicode_READY(s) == -1)
12689 fprintf(stderr, "could not ready string\n");
12690 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 case SSTATE_NOT_INTERNED:
12692 /* XXX Shouldn't happen */
12693 break;
12694 case SSTATE_INTERNED_IMMORTAL:
12695 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 break;
12698 case SSTATE_INTERNED_MORTAL:
12699 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 break;
12702 default:
12703 Py_FatalError("Inconsistent interned string state.");
12704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 }
12707 fprintf(stderr, "total size of all interned strings: "
12708 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12709 "mortal/immortal\n", mortal_size, immortal_size);
12710 Py_DECREF(keys);
12711 PyDict_Clear(interned);
12712 Py_DECREF(interned);
12713 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012714}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012715
12716
12717/********************* Unicode Iterator **************************/
12718
12719typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 PyObject_HEAD
12721 Py_ssize_t it_index;
12722 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723} unicodeiterobject;
12724
12725static void
12726unicodeiter_dealloc(unicodeiterobject *it)
12727{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 _PyObject_GC_UNTRACK(it);
12729 Py_XDECREF(it->it_seq);
12730 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012731}
12732
12733static int
12734unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12735{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 Py_VISIT(it->it_seq);
12737 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012738}
12739
12740static PyObject *
12741unicodeiter_next(unicodeiterobject *it)
12742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 PyUnicodeObject *seq;
12744 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012745
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 assert(it != NULL);
12747 seq = it->it_seq;
12748 if (seq == NULL)
12749 return NULL;
12750 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12753 int kind = PyUnicode_KIND(seq);
12754 void *data = PyUnicode_DATA(seq);
12755 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12756 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 if (item != NULL)
12758 ++it->it_index;
12759 return item;
12760 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012761
Benjamin Peterson14339b62009-01-31 16:36:08 +000012762 Py_DECREF(seq);
12763 it->it_seq = NULL;
12764 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012765}
12766
12767static PyObject *
12768unicodeiter_len(unicodeiterobject *it)
12769{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 Py_ssize_t len = 0;
12771 if (it->it_seq)
12772 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12773 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012774}
12775
12776PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12777
12778static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012782};
12783
12784PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12786 "str_iterator", /* tp_name */
12787 sizeof(unicodeiterobject), /* tp_basicsize */
12788 0, /* tp_itemsize */
12789 /* methods */
12790 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12791 0, /* tp_print */
12792 0, /* tp_getattr */
12793 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012794 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 0, /* tp_repr */
12796 0, /* tp_as_number */
12797 0, /* tp_as_sequence */
12798 0, /* tp_as_mapping */
12799 0, /* tp_hash */
12800 0, /* tp_call */
12801 0, /* tp_str */
12802 PyObject_GenericGetAttr, /* tp_getattro */
12803 0, /* tp_setattro */
12804 0, /* tp_as_buffer */
12805 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12806 0, /* tp_doc */
12807 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12808 0, /* tp_clear */
12809 0, /* tp_richcompare */
12810 0, /* tp_weaklistoffset */
12811 PyObject_SelfIter, /* tp_iter */
12812 (iternextfunc)unicodeiter_next, /* tp_iternext */
12813 unicodeiter_methods, /* tp_methods */
12814 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012815};
12816
12817static PyObject *
12818unicode_iter(PyObject *seq)
12819{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012821
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 if (!PyUnicode_Check(seq)) {
12823 PyErr_BadInternalCall();
12824 return NULL;
12825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 if (PyUnicode_READY(seq) == -1)
12827 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12829 if (it == NULL)
12830 return NULL;
12831 it->it_index = 0;
12832 Py_INCREF(seq);
12833 it->it_seq = (PyUnicodeObject *)seq;
12834 _PyObject_GC_TRACK(it);
12835 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012836}
12837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838#define UNIOP(x) Py_UNICODE_##x
12839#define UNIOP_t Py_UNICODE
12840#include "uniops.h"
12841#undef UNIOP
12842#undef UNIOP_t
12843#define UNIOP(x) Py_UCS4_##x
12844#define UNIOP_t Py_UCS4
12845#include "uniops.h"
12846#undef UNIOP
12847#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012848
Victor Stinner71133ff2010-09-01 23:43:53 +000012849Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012850PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012851{
12852 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12853 Py_UNICODE *copy;
12854 Py_ssize_t size;
12855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 if (!PyUnicode_Check(unicode)) {
12857 PyErr_BadArgument();
12858 return NULL;
12859 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012860 /* Ensure we won't overflow the size. */
12861 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12862 PyErr_NoMemory();
12863 return NULL;
12864 }
12865 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12866 size *= sizeof(Py_UNICODE);
12867 copy = PyMem_Malloc(size);
12868 if (copy == NULL) {
12869 PyErr_NoMemory();
12870 return NULL;
12871 }
12872 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12873 return copy;
12874}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012875
Georg Brandl66c221e2010-10-14 07:04:07 +000012876/* A _string module, to export formatter_parser and formatter_field_name_split
12877 to the string.Formatter class implemented in Python. */
12878
12879static PyMethodDef _string_methods[] = {
12880 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12881 METH_O, PyDoc_STR("split the argument as a field name")},
12882 {"formatter_parser", (PyCFunction) formatter_parser,
12883 METH_O, PyDoc_STR("parse the argument as a format string")},
12884 {NULL, NULL}
12885};
12886
12887static struct PyModuleDef _string_module = {
12888 PyModuleDef_HEAD_INIT,
12889 "_string",
12890 PyDoc_STR("string helper module"),
12891 0,
12892 _string_methods,
12893 NULL,
12894 NULL,
12895 NULL,
12896 NULL
12897};
12898
12899PyMODINIT_FUNC
12900PyInit__string(void)
12901{
12902 return PyModule_Create(&_string_module);
12903}
12904
12905
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012906#ifdef __cplusplus
12907}
12908#endif