blob: fb02599775faad6f3dd4005b8a11f3ef1d8b5e08 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Upper bound on the number of entries of the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
/* Size threshold for the free list "stay alive" optimization.

   Objects parked on the free list whose size is below this limit keep
   their allocated Unicode memory, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature! If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
/* Byte-order switch: little endian is assumed unless WORDS_BIGENDIAN
   is defined. Exactly one of the two BYTEORDER_IS_* macros is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro that copies a run of characters while casting each
   one from one character type to another.

   from_type, to_type: valid type names of the source/destination units.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters; end is exclusive.
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters.

   begin and to are evaluated once, end is evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *cvt_src_;                                   \
        to_type *cvt_dst_;                                           \
        cvt_src_ = (begin);                                          \
        cvt_dst_ = (to_type *)(to);                                  \
        while (cvt_src_ < (end)) {                                   \
            *cvt_dst_ = (to_type)*cvt_src_;                          \
            ++cvt_src_;                                              \
            ++cvt_dst_;                                              \
        }                                                            \
    } while (0)
107
/* Internal field accessors. All of them expand to lvalues/expressions on
   the object "op" and evaluate "op" more than once in some cases, so do
   not pass expressions with side effects. */

/* UTF-8 buffer: for compact ASCII objects this is the memory located
   directly behind the PyASCIIObject header, otherwise the utf8 field. */
#define _PyUnicode_UTF8(op)                                 \
    (PyUnicode_IS_COMPACT_ASCII(op)                         \
     ? ((char*)((PyASCIIObject*)(op) + 1))                  \
     : ((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 length: the string length for compact ASCII objects, otherwise
   the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (PyUnicode_IS_COMPACT_ASCII(op)                         \
     ? ((PyASCIIObject*)(op))->length                       \
     : ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Plain field access, no type checking. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Field access guarded by an assertion that op is a Unicode object. */
#define _PyUnicode_KIND(op)                                 \
    (assert(PyUnicode_Check(op)),                           \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(PyUnicode_Check(op)),                           \
     ((PyASCIIObject *)(op))->length)
127
/* Mark a Unicode string as modified: the cached hash is invalidated by
   storing -1 in the hash field. */
#define _PyUnicode_DIRTY(op) \
    do { (((PyASCIIObject *)(op))->hash) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
137 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000138 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Lookup table for fast detection of the most frequent whitespace
   characters. Indexed by a code point in the range 0..127; a value of 1
   marks whitespace:

     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Lookup table for fast detection of line break characters. Indexed by a
   code point in the range 0..127; a value of 1 marks a line break:

     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
237/* stuff to implement simple "bloom filters" for Unicode characters.
238 to keep things simple, we use a single bitmask, using the least 5
239 bits from each unicode characters as the bit index. */
240
241/* the linebreak mask is set up by Unicode_Init below */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243#if LONG_BIT >= 128
244#define BLOOM_WIDTH 128
245#elif LONG_BIT >= 64
246#define BLOOM_WIDTH 64
247#elif LONG_BIT >= 32
248#define BLOOM_WIDTH 32
249#else
250#error "LONG_BIT is smaller than 32"
251#endif
252
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253#define BLOOM_MASK unsigned long
254
255static BLOOM_MASK bloom_linebreak;
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
258#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Benjamin Peterson29060642009-01-31 22:14:21 +0000260#define BLOOM_LINEBREAK(ch) \
261 ((ch) < 128U ? ascii_linebreak[(ch)] : \
262 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* Non-zero when chr occurs in the string object str: the bloom mask is
   consulted first and only on a possible hit is the string searched with
   PyUnicode_FindChar() over its whole length. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     ? (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0) \
     : 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
286substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
287
288static PyObject *
289fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
375/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000376 Ux0000 terminated; some code (e.g. new_identifier)
377 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
382*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
/* Number of objects allocated through PyUnicode_New() (debug builds). */
int unicode_new_new_calls = 0;

/* Real functions wrapping the accessor macros, so that they can be
   called from within a debugger. */

/* Wrapper for the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = _PyUnicode_UTF8(unicode);
    return utf8;
}

/* Wrapper for the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}

/* Wrapper for the PyUnicode_DATA() macro which additionally dumps the
   layout information of the object to stdout. */
void *_PyUnicode_data(void *unicode){
    void *behind_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *behind_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", behind_ascii);
    printf("compact op %p\n", behind_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) into the UCS4 data of
   `unicode`. A high surrogate directly followed by a low surrogate is
   combined into one code point; every other unit, lone surrogates
   included, is copied unchanged. The other conversions are implemented
   as macros for efficiency.

   The object must already be a 4-byte kind string whose length equals
   the number of resulting code points (asserted); this function assumes
   that unicode can hold one more code point than wstr characters for a
   terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (*src >= 0xD800 && *src <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
Victor Stinnerb1536152011-09-30 02:26:10 +0200629 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
630 PyErr_BadInternalCall();
631 return -1;
632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633
634 if (PyUnicode_READY(from))
635 return -1;
636 if (PyUnicode_READY(to))
637 return -1;
638
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200639 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200640 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
641 PyErr_Format(PyExc_ValueError,
642 "Cannot write %zi characters at %zi "
643 "in a string of %zi characters",
644 how_many, to_start, PyUnicode_GET_LENGTH(to));
645 return -1;
646 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200647 if (how_many == 0)
648 return 0;
649
650 if (Py_REFCNT(to) != 1) {
651 PyErr_SetString(PyExc_ValueError,
652 "Cannot modify a string having more than 1 reference");
653 return -1;
654 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200655 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200660 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661
662 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200666 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 + PyUnicode_KIND_SIZE(from_kind, from_start),
668 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 else if (from_kind == PyUnicode_1BYTE_KIND
671 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200680 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200681 && to_kind == PyUnicode_4BYTE_KIND)
682 {
683 _PyUnicode_CONVERT_BYTES(
684 Py_UCS1, Py_UCS4,
685 PyUnicode_1BYTE_DATA(from) + from_start,
686 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
687 PyUnicode_4BYTE_DATA(to) + to_start
688 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200689 }
690 else if (from_kind == PyUnicode_2BYTE_KIND
691 && to_kind == PyUnicode_4BYTE_KIND)
692 {
693 _PyUnicode_CONVERT_BYTES(
694 Py_UCS2, Py_UCS4,
695 PyUnicode_2BYTE_DATA(from) + from_start,
696 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
697 PyUnicode_4BYTE_DATA(to) + to_start
698 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200699 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200700 else {
701 int invalid_kinds;
702 if (from_kind > to_kind) {
703 /* slow path to check for character overflow */
704 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
705 Py_UCS4 ch, maxchar;
706 Py_ssize_t i;
707
708 maxchar = 0;
709 invalid_kinds = 0;
710 for (i=0; i < how_many; i++) {
711 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
712 if (ch > maxchar) {
713 maxchar = ch;
714 if (maxchar > to_maxchar) {
715 invalid_kinds = 1;
716 break;
717 }
718 }
719 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
720 }
721 }
722 else
723 invalid_kinds = 1;
724 if (invalid_kinds) {
725 PyErr_Format(PyExc_ValueError,
726 "Cannot copy UCS%u characters "
727 "into a string of UCS%u characters",
728 1 << (from_kind - 1),
729 1 << (to_kind -1));
730 return -1;
731 }
732 }
733 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734}
735
Victor Stinner17222162011-09-28 22:15:37 +0200736/* Find the maximum code point and count the number of surrogate pairs so a
737 correct string length can be computed before converting a string to UCS4.
738 This function counts single surrogates as a character and not as a pair.
739
740 Return 0 on success, or -1 on error. */
741static int
742find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
743 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744{
745 const wchar_t *iter;
746
747 if (num_surrogates == NULL || maxchar == NULL) {
748 PyErr_SetString(PyExc_SystemError,
749 "unexpected NULL arguments to "
750 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
751 return -1;
752 }
753
754 *num_surrogates = 0;
755 *maxchar = 0;
756
757 for (iter = begin; iter < end; ) {
758 if (*iter > *maxchar)
759 *maxchar = *iter;
760#if SIZEOF_WCHAR_T == 2
761 if (*iter >= 0xD800 && *iter <= 0xDBFF
762 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
763 {
764 Py_UCS4 surrogate_val;
765 surrogate_val = (((iter[0] & 0x3FF)<<10)
766 | (iter[1] & 0x3FF)) + 0x10000;
767 ++(*num_surrogates);
768 if (surrogate_val > *maxchar)
769 *maxchar = surrogate_val;
770 iter += 2;
771 }
772 else
773 iter++;
774#else
775 iter++;
776#endif
777 }
778 return 0;
779}
780
781#ifdef Py_DEBUG
782int unicode_ready_calls = 0;
783#endif
784
785int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200786_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200788 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789 wchar_t *end;
790 Py_UCS4 maxchar = 0;
791 Py_ssize_t num_surrogates;
792#if SIZEOF_WCHAR_T == 2
793 Py_ssize_t length_wo_surrogates;
794#endif
795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200797 strings were created using _PyObject_New() and where no canonical
798 representation (the str field) has been set yet aka strings
799 which are not yet ready. */
800 assert(PyUnicode_Check(obj));
801 assert(!PyUnicode_IS_READY(obj));
802 assert(!PyUnicode_IS_COMPACT(obj));
803 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200804 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200805 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200807 /* Actually, it should neither be interned nor be anything else: */
808 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
810#ifdef Py_DEBUG
811 ++unicode_ready_calls;
812#endif
813
814 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200815 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200816 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818
819 if (maxchar < 256) {
820 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
821 if (!unicode->data.any) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200825 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826 _PyUnicode_WSTR(unicode), end,
827 PyUnicode_1BYTE_DATA(unicode));
828 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
829 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
830 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
831 if (maxchar < 128) {
832 unicode->_base.utf8 = unicode->data.any;
833 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
834 }
835 else {
836 unicode->_base.utf8 = NULL;
837 unicode->_base.utf8_length = 0;
838 }
839 PyObject_FREE(_PyUnicode_WSTR(unicode));
840 _PyUnicode_WSTR(unicode) = NULL;
841 _PyUnicode_WSTR_LENGTH(unicode) = 0;
842 }
843 /* In this case we might have to convert down from 4-byte native
844 wchar_t to 2-byte unicode. */
845 else if (maxchar < 65536) {
846 assert(num_surrogates == 0 &&
847 "FindMaxCharAndNumSurrogatePairs() messed up");
848
Victor Stinner506f5922011-09-28 22:34:18 +0200849#if SIZEOF_WCHAR_T == 2
850 /* We can share representations and are done. */
851 unicode->data.any = _PyUnicode_WSTR(unicode);
852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
855 unicode->_base.utf8 = NULL;
856 unicode->_base.utf8_length = 0;
857#else
858 /* sizeof(wchar_t) == 4 */
859 unicode->data.any = PyObject_MALLOC(
860 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
861 if (!unicode->data.any) {
862 PyErr_NoMemory();
863 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 }
Victor Stinner506f5922011-09-28 22:34:18 +0200865 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
866 _PyUnicode_WSTR(unicode), end,
867 PyUnicode_2BYTE_DATA(unicode));
868 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
869 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
870 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
871 unicode->_base.utf8 = NULL;
872 unicode->_base.utf8_length = 0;
873 PyObject_FREE(_PyUnicode_WSTR(unicode));
874 _PyUnicode_WSTR(unicode) = NULL;
875 _PyUnicode_WSTR_LENGTH(unicode) = 0;
876#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
878 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
879 else {
880#if SIZEOF_WCHAR_T == 2
881 /* in case the native representation is 2-bytes, we need to allocate a
882 new normalized 4-byte version. */
883 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
884 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
885 if (!unicode->data.any) {
886 PyErr_NoMemory();
887 return -1;
888 }
889 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891 unicode->_base.utf8 = NULL;
892 unicode->_base.utf8_length = 0;
893 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
894 unicode) < 0) {
895 assert(0 && "ConvertWideCharToUCS4 failed");
896 return -1;
897 }
898 PyObject_FREE(_PyUnicode_WSTR(unicode));
899 _PyUnicode_WSTR(unicode) = NULL;
900 _PyUnicode_WSTR_LENGTH(unicode) = 0;
901#else
902 assert(num_surrogates == 0);
903
904 unicode->data.any = _PyUnicode_WSTR(unicode);
905 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
906 unicode->_base.utf8 = NULL;
907 unicode->_base.utf8_length = 0;
908 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
909#endif
910 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
911 }
912 _PyUnicode_STATE(unicode).ready = 1;
913 return 0;
914}
915
Alexander Belopolsky40018472011-02-26 01:02:56 +0000916static void
917unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000918{
Walter Dörwald16807132007-05-25 13:52:07 +0000919 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_NOT_INTERNED:
921 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000922
Benjamin Peterson29060642009-01-31 22:14:21 +0000923 case SSTATE_INTERNED_MORTAL:
924 /* revive dead object temporarily for DelItem */
925 Py_REFCNT(unicode) = 3;
926 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
927 Py_FatalError(
928 "deletion of interned string failed");
929 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 case SSTATE_INTERNED_IMMORTAL:
932 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933
Benjamin Peterson29060642009-01-31 22:14:21 +0000934 default:
935 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000936 }
937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 if (_PyUnicode_WSTR(unicode) &&
939 (!PyUnicode_IS_READY(unicode) ||
940 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
941 PyObject_DEL(_PyUnicode_WSTR(unicode));
942 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
943 PyObject_DEL(unicode->_base.utf8);
944
945 if (PyUnicode_IS_COMPACT(unicode)) {
946 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 }
948 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 if (unicode->data.any)
950 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000951 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 }
953}
954
Alexander Belopolsky40018472011-02-26 01:02:56 +0000955static int
956_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957{
958 register PyUnicodeObject *v;
959
960 /* Argument checks */
961 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000962 PyErr_BadInternalCall();
963 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000964 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000965 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
967 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
971
972 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 possible since these are being shared.
974 The same goes for new-representation unicode objects or objects which
975 have already been readied.
976 For these, we simply return a fresh copy with the same Unicode content.
977 */
978 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
979 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
980 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000981 PyUnicodeObject *w = _PyUnicode_New(length);
982 if (w == NULL)
983 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
985 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 Py_DECREF(*unicode);
987 *unicode = w;
988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000989 }
990
991 /* Note that we don't have to modify *unicode for unshared Unicode
992 objects, since we can modify them in-place. */
993 return unicode_resize(v, length);
994}
995
Alexander Belopolsky40018472011-02-26 01:02:56 +0000996int
997PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000998{
999 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1000}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002static PyObject*
1003get_latin1_char(unsigned char ch)
1004{
1005 PyUnicodeObject *unicode = unicode_latin1[ch];
1006 if (!unicode) {
1007 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1008 if (!unicode)
1009 return NULL;
1010 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1011 unicode_latin1[ch] = unicode;
1012 }
1013 Py_INCREF(unicode);
1014 return (PyObject *)unicode;
1015}
1016
Alexander Belopolsky40018472011-02-26 01:02:56 +00001017PyObject *
1018PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019{
1020 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 Py_UCS4 maxchar = 0;
1022 Py_ssize_t num_surrogates;
1023
1024 if (u == NULL)
1025 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027 /* If the Unicode data is known at construction time, we can apply
1028 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 /* Optimization for empty strings */
1031 if (size == 0 && unicode_empty != NULL) {
1032 Py_INCREF(unicode_empty);
1033 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001034 }
Tim Petersced69f82003-09-16 20:30:58 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Single character Unicode objects in the Latin-1 range are
1037 shared when using this constructor */
1038 if (size == 1 && *u < 256)
1039 return get_latin1_char((unsigned char)*u);
1040
1041 /* If not empty and not single character, copy the Unicode data
1042 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001043 if (find_maxchar_surrogates(u, u + size,
1044 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 return NULL;
1046
1047 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1048 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 if (!unicode)
1050 return NULL;
1051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 switch (PyUnicode_KIND(unicode)) {
1053 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1056 break;
1057 case PyUnicode_2BYTE_KIND:
1058#if Py_UNICODE_SIZE == 2
1059 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1060#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001061 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1063#endif
1064 break;
1065 case PyUnicode_4BYTE_KIND:
1066#if SIZEOF_WCHAR_T == 2
1067 /* This is the only case which has to process surrogates, thus
1068 a simple copy loop is not enough and we need a function. */
1069 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1070 Py_DECREF(unicode);
1071 return NULL;
1072 }
1073#else
1074 assert(num_surrogates == 0);
1075 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1076#endif
1077 break;
1078 default:
1079 assert(0 && "Impossible state");
1080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081
1082 return (PyObject *)unicode;
1083}
1084
Alexander Belopolsky40018472011-02-26 01:02:56 +00001085PyObject *
1086PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001087{
1088 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001089
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 if (size < 0) {
1091 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001093 return NULL;
1094 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001097 some optimizations which share commonly used objects.
1098 Also, this means the input must be UTF-8, so fall back to the
1099 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001100 if (u != NULL) {
1101
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 /* Optimization for empty strings */
1103 if (size == 0 && unicode_empty != NULL) {
1104 Py_INCREF(unicode_empty);
1105 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001107
1108 /* Single characters are shared when using this constructor.
1109 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 if (size == 1 && Py_CHARMASK(*u) < 128)
1111 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112
1113 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 }
1115
Walter Dörwald55507312007-05-18 13:12:10 +00001116 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 if (!unicode)
1118 return NULL;
1119
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 return (PyObject *)unicode;
1121}
1122
Alexander Belopolsky40018472011-02-26 01:02:56 +00001123PyObject *
1124PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001125{
1126 size_t size = strlen(u);
1127 if (size > PY_SSIZE_T_MAX) {
1128 PyErr_SetString(PyExc_OverflowError, "input too long");
1129 return NULL;
1130 }
1131
1132 return PyUnicode_FromStringAndSize(u, size);
1133}
1134
Victor Stinnere57b1c02011-09-28 22:20:48 +02001135static PyObject*
1136_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 PyObject *res;
1139 unsigned char max = 127;
1140 Py_ssize_t i;
1141 for (i = 0; i < size; i++) {
1142 if (u[i] & 0x80) {
1143 max = 255;
1144 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001145 }
1146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 res = PyUnicode_New(size, max);
1148 if (!res)
1149 return NULL;
1150 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1151 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152}
1153
Victor Stinnere57b1c02011-09-28 22:20:48 +02001154static PyObject*
1155_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
1157 PyObject *res;
1158 Py_UCS2 max = 0;
1159 Py_ssize_t i;
1160 for (i = 0; i < size; i++)
1161 if (u[i] > max)
1162 max = u[i];
1163 res = PyUnicode_New(size, max);
1164 if (!res)
1165 return NULL;
1166 if (max >= 256)
1167 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1168 else
1169 for (i = 0; i < size; i++)
1170 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1171 return res;
1172}
1173
Victor Stinnere57b1c02011-09-28 22:20:48 +02001174static PyObject*
1175_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176{
1177 PyObject *res;
1178 Py_UCS4 max = 0;
1179 Py_ssize_t i;
1180 for (i = 0; i < size; i++)
1181 if (u[i] > max)
1182 max = u[i];
1183 res = PyUnicode_New(size, max);
1184 if (!res)
1185 return NULL;
1186 if (max >= 0x10000)
1187 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1188 else {
1189 int kind = PyUnicode_KIND(res);
1190 void *data = PyUnicode_DATA(res);
1191 for (i = 0; i < size; i++)
1192 PyUnicode_WRITE(kind, data, i, u[i]);
1193 }
1194 return res;
1195}
1196
1197PyObject*
1198PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1199{
1200 switch(kind) {
1201 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001204 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001206 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
1208 assert(0);
1209 return NULL;
1210}
1211
Victor Stinner034f6cf2011-09-30 02:26:44 +02001212PyObject*
1213PyUnicode_Copy(PyObject *unicode)
1214{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001215 Py_ssize_t size;
1216 PyObject *copy;
1217 void *data;
1218
Victor Stinner034f6cf2011-09-30 02:26:44 +02001219 if (!PyUnicode_Check(unicode)) {
1220 PyErr_BadInternalCall();
1221 return NULL;
1222 }
1223 if (PyUnicode_READY(unicode))
1224 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001225
1226 size = PyUnicode_GET_LENGTH(unicode);
1227 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1228 if (!copy)
1229 return NULL;
1230 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1231
1232 data = PyUnicode_DATA(unicode);
1233 switch (PyUnicode_KIND(unicode))
1234 {
1235 case PyUnicode_1BYTE_KIND:
1236 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1237 break;
1238 case PyUnicode_2BYTE_KIND:
1239 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1240 break;
1241 case PyUnicode_4BYTE_KIND:
1242 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1243 break;
1244 default:
1245 assert(0);
1246 break;
1247 }
1248 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001249}
1250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251
1252/* Widen Unicode objects to larger buffers.
1253 Return NULL if the string is too wide already. */
1254
1255void*
1256_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1257{
1258 Py_ssize_t i;
1259 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1260 void *d = PyUnicode_DATA(s);
1261 unsigned int skind = PyUnicode_KIND(s);
1262 if (PyUnicode_KIND(s) >= kind) {
1263 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1264 return NULL;
1265 }
1266 switch(kind) {
1267 case PyUnicode_2BYTE_KIND: {
1268 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1269 if (!result) {
1270 PyErr_NoMemory();
1271 return 0;
1272 }
1273 for (i = 0; i < len; i++)
1274 result[i] = ((Py_UCS1*)d)[i];
1275 return result;
1276 }
1277 case PyUnicode_4BYTE_KIND: {
1278 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1279 if (!result) {
1280 PyErr_NoMemory();
1281 return 0;
1282 }
1283 for (i = 0; i < len; i++)
1284 result[i] = PyUnicode_READ(skind, d, i);
1285 return result;
1286 }
1287 }
1288 Py_FatalError("invalid kind");
1289 return NULL;
1290}
1291
1292static Py_UCS4*
1293as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1294 int copy_null)
1295{
1296 int kind;
1297 void *data;
1298 Py_ssize_t len, targetlen;
1299 if (PyUnicode_READY(string) == -1)
1300 return NULL;
1301 kind = PyUnicode_KIND(string);
1302 data = PyUnicode_DATA(string);
1303 len = PyUnicode_GET_LENGTH(string);
1304 targetlen = len;
1305 if (copy_null)
1306 targetlen++;
1307 if (!target) {
1308 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1309 PyErr_NoMemory();
1310 return NULL;
1311 }
1312 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1313 if (!target) {
1314 PyErr_NoMemory();
1315 return NULL;
1316 }
1317 }
1318 else {
1319 if (targetsize < targetlen) {
1320 PyErr_Format(PyExc_SystemError,
1321 "string is longer than the buffer");
1322 if (copy_null && 0 < targetsize)
1323 target[0] = 0;
1324 return NULL;
1325 }
1326 }
1327 if (kind != PyUnicode_4BYTE_KIND) {
1328 Py_ssize_t i;
1329 for (i = 0; i < len; i++)
1330 target[i] = PyUnicode_READ(kind, data, i);
1331 }
1332 else
1333 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1334 if (copy_null)
1335 target[len] = 0;
1336 return target;
1337}
1338
1339Py_UCS4*
1340PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1341 int copy_null)
1342{
1343 if (target == NULL || targetsize < 1) {
1344 PyErr_BadInternalCall();
1345 return NULL;
1346 }
1347 return as_ucs4(string, target, targetsize, copy_null);
1348}
1349
1350Py_UCS4*
1351PyUnicode_AsUCS4Copy(PyObject *string)
1352{
1353 return as_ucs4(string, NULL, 0, 1);
1354}
1355
1356#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001357
Alexander Belopolsky40018472011-02-26 01:02:56 +00001358PyObject *
1359PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001362 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 PyErr_BadInternalCall();
1365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 }
1367
Martin v. Löwis790465f2008-04-05 20:41:37 +00001368 if (size == -1) {
1369 size = wcslen(w);
1370 }
1371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373}
1374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001376
Walter Dörwald346737f2007-05-31 10:44:43 +00001377static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001378makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1379 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001381 *fmt++ = '%';
1382 if (width) {
1383 if (zeropad)
1384 *fmt++ = '0';
1385 fmt += sprintf(fmt, "%d", width);
1386 }
1387 if (precision)
1388 fmt += sprintf(fmt, ".%d", precision);
1389 if (longflag)
1390 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001391 else if (longlongflag) {
1392 /* longlongflag should only ever be nonzero on machines with
1393 HAVE_LONG_LONG defined */
1394#ifdef HAVE_LONG_LONG
1395 char *f = PY_FORMAT_LONG_LONG;
1396 while (*f)
1397 *fmt++ = *f++;
1398#else
1399 /* we shouldn't ever get here */
1400 assert(0);
1401 *fmt++ = 'l';
1402#endif
1403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001404 else if (size_tflag) {
1405 char *f = PY_FORMAT_SIZE_T;
1406 while (*f)
1407 *fmt++ = *f++;
1408 }
1409 *fmt++ = c;
1410 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001411}
1412
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a conversion specification.  Parse
   the optional width, the optional ".precision" and the optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"); a modifier
   is only recognised when it is directly followed by 'd', 'u' or 'i'.

   Each result is stored through its output pointer unless that pointer
   is NULL.  The return value points at the character where parsing
   stopped. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1477
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001478/* maximum number of characters required for output of %ld. 21 characters
1479 allows for 64-bit integers (in decimal) and an optional sign. */
1480#define MAX_LONG_CHARS 21
1481/* maximum number of characters required for output of %lld.
1482 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1483 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1484#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1485
Walter Dörwaldd2034312007-05-18 16:29:38 +00001486PyObject *
1487PyUnicode_FromFormatV(const char *format, va_list vargs)
1488{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001489 va_list count;
1490 Py_ssize_t callcount = 0;
1491 PyObject **callresults = NULL;
1492 PyObject **callresult = NULL;
1493 Py_ssize_t n = 0;
1494 int width = 0;
1495 int precision = 0;
1496 int zeropad;
1497 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001499 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001500 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1502 Py_UCS4 argmaxchar;
1503 Py_ssize_t numbersize = 0;
1504 char *numberresults = NULL;
1505 char *numberresult = NULL;
1506 Py_ssize_t i;
1507 int kind;
1508 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001509
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001510 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001511 /* step 1: count the number of %S/%R/%A/%s format specifications
1512 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1513 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 * result in an array)
1515 * also esimate a upper bound for all the number formats in the string,
1516 * numbers will be formated in step 3 and be keept in a '\0'-separated
1517 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001518 for (f = format; *f; f++) {
1519 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001520 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1522 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1523 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1524 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001527#ifdef HAVE_LONG_LONG
1528 if (longlongflag) {
1529 if (width < MAX_LONG_LONG_CHARS)
1530 width = MAX_LONG_LONG_CHARS;
1531 }
1532 else
1533#endif
1534 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1535 including sign. Decimal takes the most space. This
1536 isn't enough for octal. If a width is specified we
1537 need more (which we allocate later). */
1538 if (width < MAX_LONG_CHARS)
1539 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540
1541 /* account for the size + '\0' to separate numbers
1542 inside of the numberresults buffer */
1543 numbersize += (width + 1);
1544 }
1545 }
1546 else if ((unsigned char)*f > 127) {
1547 PyErr_Format(PyExc_ValueError,
1548 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1549 "string, got a non-ASCII byte: 0x%02x",
1550 (unsigned char)*f);
1551 return NULL;
1552 }
1553 }
1554 /* step 2: allocate memory for the results of
1555 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1556 if (callcount) {
1557 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1558 if (!callresults) {
1559 PyErr_NoMemory();
1560 return NULL;
1561 }
1562 callresult = callresults;
1563 }
1564 /* step 2.5: allocate memory for the results of formating numbers */
1565 if (numbersize) {
1566 numberresults = PyObject_Malloc(numbersize);
1567 if (!numberresults) {
1568 PyErr_NoMemory();
1569 goto fail;
1570 }
1571 numberresult = numberresults;
1572 }
1573
1574 /* step 3: format numbers and figure out how large a buffer we need */
1575 for (f = format; *f; f++) {
1576 if (*f == '%') {
1577 const char* p;
1578 int longflag;
1579 int longlongflag;
1580 int size_tflag;
1581 int numprinted;
1582
1583 p = f;
1584 zeropad = (f[1] == '0');
1585 f = parse_format_flags(f, &width, &precision,
1586 &longflag, &longlongflag, &size_tflag);
1587 switch (*f) {
1588 case 'c':
1589 {
1590 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001591 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 n++;
1593 break;
1594 }
1595 case '%':
1596 n++;
1597 break;
1598 case 'i':
1599 case 'd':
1600 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1601 width, precision, *f);
1602 if (longflag)
1603 numprinted = sprintf(numberresult, fmt,
1604 va_arg(count, long));
1605#ifdef HAVE_LONG_LONG
1606 else if (longlongflag)
1607 numprinted = sprintf(numberresult, fmt,
1608 va_arg(count, PY_LONG_LONG));
1609#endif
1610 else if (size_tflag)
1611 numprinted = sprintf(numberresult, fmt,
1612 va_arg(count, Py_ssize_t));
1613 else
1614 numprinted = sprintf(numberresult, fmt,
1615 va_arg(count, int));
1616 n += numprinted;
1617 /* advance by +1 to skip over the '\0' */
1618 numberresult += (numprinted + 1);
1619 assert(*(numberresult - 1) == '\0');
1620 assert(*(numberresult - 2) != '\0');
1621 assert(numprinted >= 0);
1622 assert(numberresult <= numberresults + numbersize);
1623 break;
1624 case 'u':
1625 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1626 width, precision, 'u');
1627 if (longflag)
1628 numprinted = sprintf(numberresult, fmt,
1629 va_arg(count, unsigned long));
1630#ifdef HAVE_LONG_LONG
1631 else if (longlongflag)
1632 numprinted = sprintf(numberresult, fmt,
1633 va_arg(count, unsigned PY_LONG_LONG));
1634#endif
1635 else if (size_tflag)
1636 numprinted = sprintf(numberresult, fmt,
1637 va_arg(count, size_t));
1638 else
1639 numprinted = sprintf(numberresult, fmt,
1640 va_arg(count, unsigned int));
1641 n += numprinted;
1642 numberresult += (numprinted + 1);
1643 assert(*(numberresult - 1) == '\0');
1644 assert(*(numberresult - 2) != '\0');
1645 assert(numprinted >= 0);
1646 assert(numberresult <= numberresults + numbersize);
1647 break;
1648 case 'x':
1649 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1650 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1651 n += numprinted;
1652 numberresult += (numprinted + 1);
1653 assert(*(numberresult - 1) == '\0');
1654 assert(*(numberresult - 2) != '\0');
1655 assert(numprinted >= 0);
1656 assert(numberresult <= numberresults + numbersize);
1657 break;
1658 case 'p':
1659 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1660 /* %p is ill-defined: ensure leading 0x. */
1661 if (numberresult[1] == 'X')
1662 numberresult[1] = 'x';
1663 else if (numberresult[1] != 'x') {
1664 memmove(numberresult + 2, numberresult,
1665 strlen(numberresult) + 1);
1666 numberresult[0] = '0';
1667 numberresult[1] = 'x';
1668 numprinted += 2;
1669 }
1670 n += numprinted;
1671 numberresult += (numprinted + 1);
1672 assert(*(numberresult - 1) == '\0');
1673 assert(*(numberresult - 2) != '\0');
1674 assert(numprinted >= 0);
1675 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001676 break;
1677 case 's':
1678 {
1679 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001680 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001681 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1682 if (!str)
1683 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 /* since PyUnicode_DecodeUTF8 returns already flexible
1685 unicode objects, there is no need to call ready on them */
1686 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001687 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001689 /* Remember the str and switch to the next slot */
1690 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 break;
1692 }
1693 case 'U':
1694 {
1695 PyObject *obj = va_arg(count, PyObject *);
1696 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 if (PyUnicode_READY(obj) == -1)
1698 goto fail;
1699 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001700 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001702 break;
1703 }
1704 case 'V':
1705 {
1706 PyObject *obj = va_arg(count, PyObject *);
1707 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001708 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001709 assert(obj || str);
1710 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001711 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (PyUnicode_READY(obj) == -1)
1713 goto fail;
1714 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001715 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001717 *callresult++ = NULL;
1718 }
1719 else {
1720 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1721 if (!str_obj)
1722 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001724 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001726 *callresult++ = str_obj;
1727 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001728 break;
1729 }
1730 case 'S':
1731 {
1732 PyObject *obj = va_arg(count, PyObject *);
1733 PyObject *str;
1734 assert(obj);
1735 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001737 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001739 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001741 /* Remember the str and switch to the next slot */
1742 *callresult++ = str;
1743 break;
1744 }
1745 case 'R':
1746 {
1747 PyObject *obj = va_arg(count, PyObject *);
1748 PyObject *repr;
1749 assert(obj);
1750 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001752 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001754 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001756 /* Remember the repr and switch to the next slot */
1757 *callresult++ = repr;
1758 break;
1759 }
1760 case 'A':
1761 {
1762 PyObject *obj = va_arg(count, PyObject *);
1763 PyObject *ascii;
1764 assert(obj);
1765 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001767 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001769 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001771 /* Remember the repr and switch to the next slot */
1772 *callresult++ = ascii;
1773 break;
1774 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001775 default:
1776 /* if we stumble upon an unknown
1777 formatting code, copy the rest of
1778 the format string to the output
1779 string. (we cannot just skip the
1780 code, since there's no way to know
1781 what's in the argument list) */
1782 n += strlen(p);
1783 goto expand;
1784 }
1785 } else
1786 n++;
1787 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 we don't have to resize the string.
1792 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001794 if (!string)
1795 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 kind = PyUnicode_KIND(string);
1797 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001802 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001803 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001804
1805 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1807 /* checking for == because the last argument could be a empty
1808 string, which causes i to point to end, the assert at the end of
1809 the loop */
1810 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001811
Benjamin Peterson14339b62009-01-31 16:36:08 +00001812 switch (*f) {
1813 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001814 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 const int ordinal = va_arg(vargs, int);
1816 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001818 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001819 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001820 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001821 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001822 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 case 'p':
1824 /* unused, since we already have the result */
1825 if (*f == 'p')
1826 (void) va_arg(vargs, void *);
1827 else
1828 (void) va_arg(vargs, int);
1829 /* extract the result from numberresults and append. */
1830 for (; *numberresult; ++i, ++numberresult)
1831 PyUnicode_WRITE(kind, data, i, *numberresult);
1832 /* skip over the separating '\0' */
1833 assert(*numberresult == '\0');
1834 numberresult++;
1835 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 break;
1837 case 's':
1838 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001839 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001841 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 size = PyUnicode_GET_LENGTH(*callresult);
1843 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001844 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1845 *callresult, 0,
1846 size) < 0)
1847 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001849 /* We're done with the unicode()/repr() => forget it */
1850 Py_DECREF(*callresult);
1851 /* switch to next unicode()/repr() result */
1852 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001853 break;
1854 }
1855 case 'U':
1856 {
1857 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001858 Py_ssize_t size;
1859 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1860 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001861 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1862 obj, 0,
1863 size) < 0)
1864 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001865 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 break;
1867 }
1868 case 'V':
1869 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001871 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001872 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 size = PyUnicode_GET_LENGTH(obj);
1875 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001876 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1877 obj, 0,
1878 size) < 0)
1879 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001881 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 size = PyUnicode_GET_LENGTH(*callresult);
1883 assert(PyUnicode_KIND(*callresult) <=
1884 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001885 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1886 *callresult,
1887 0, size) < 0)
1888 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001890 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001891 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001892 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001893 break;
1894 }
1895 case 'S':
1896 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001897 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001898 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 /* unused, since we already have the result */
1900 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001902 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1903 *callresult, 0,
1904 PyUnicode_GET_LENGTH(*callresult)) < 0)
1905 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001907 /* We're done with the unicode()/repr() => forget it */
1908 Py_DECREF(*callresult);
1909 /* switch to next unicode()/repr() result */
1910 ++callresult;
1911 break;
1912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001913 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001915 break;
1916 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917 for (; *p; ++p, ++i)
1918 PyUnicode_WRITE(kind, data, i, *p);
1919 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 goto end;
1921 }
Victor Stinner1205f272010-09-11 00:54:47 +00001922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 else {
1924 assert(i < PyUnicode_GET_LENGTH(string));
1925 PyUnicode_WRITE(kind, data, i++, *f);
1926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001929
Benjamin Peterson29060642009-01-31 22:14:21 +00001930 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001931 if (callresults)
1932 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001933 if (numberresults)
1934 PyObject_Free(numberresults);
1935 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001937 if (callresults) {
1938 PyObject **callresult2 = callresults;
1939 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001940 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001941 ++callresult2;
1942 }
1943 PyObject_Free(callresults);
1944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 if (numberresults)
1946 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001947 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001948}
1949
Walter Dörwaldd2034312007-05-18 16:29:38 +00001950PyObject *
1951PyUnicode_FromFormat(const char *format, ...)
1952{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001953 PyObject* ret;
1954 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001955
1956#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001958#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001959 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001960#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001961 ret = PyUnicode_FromFormatV(format, vargs);
1962 va_end(vargs);
1963 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001964}
1965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966#ifdef HAVE_WCHAR_H
1967
Victor Stinner5593d8a2010-10-02 11:11:27 +00001968/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1969 convert a Unicode object to a wide character string.
1970
Victor Stinnerd88d9832011-09-06 02:00:05 +02001971 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001972 character) required to convert the unicode object. Ignore size argument.
1973
Victor Stinnerd88d9832011-09-06 02:00:05 +02001974 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001975 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001976 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001977static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001978unicode_aswidechar(PyUnicodeObject *unicode,
1979 wchar_t *w,
1980 Py_ssize_t size)
1981{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001982 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 const wchar_t *wstr;
1984
1985 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1986 if (wstr == NULL)
1987 return -1;
1988
Victor Stinner5593d8a2010-10-02 11:11:27 +00001989 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001990 if (size > res)
1991 size = res + 1;
1992 else
1993 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001995 return res;
1996 }
1997 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001999}
2000
2001Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002002PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002003 wchar_t *w,
2004 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005{
2006 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002007 PyErr_BadInternalCall();
2008 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002010 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011}
2012
Victor Stinner137c34c2010-09-29 10:25:54 +00002013wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002014PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002015 Py_ssize_t *size)
2016{
2017 wchar_t* buffer;
2018 Py_ssize_t buflen;
2019
2020 if (unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return NULL;
2023 }
2024
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002025 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (buflen == -1)
2027 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002028 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002029 PyErr_NoMemory();
2030 return NULL;
2031 }
2032
Victor Stinner137c34c2010-09-29 10:25:54 +00002033 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2034 if (buffer == NULL) {
2035 PyErr_NoMemory();
2036 return NULL;
2037 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002038 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 if (buflen == -1)
2040 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002041 if (size != NULL)
2042 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002043 return buffer;
2044}
2045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048PyObject *
2049PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002052 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 PyErr_SetString(PyExc_ValueError,
2054 "chr() arg not in range(0x110000)");
2055 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002056 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 if (ordinal < 256)
2059 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 v = PyUnicode_New(1, ordinal);
2062 if (v == NULL)
2063 return NULL;
2064 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2065 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002066}
2067
Alexander Belopolsky40018472011-02-26 01:02:56 +00002068PyObject *
2069PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002071 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002072 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002073 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002074 Py_INCREF(obj);
2075 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002076 }
2077 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 /* For a Unicode subtype that's not a Unicode object,
2079 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002080 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002081 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002082 PyErr_Format(PyExc_TypeError,
2083 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002084 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002085 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002086}
2087
Alexander Belopolsky40018472011-02-26 01:02:56 +00002088PyObject *
2089PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002090 const char *encoding,
2091 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002092{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002093 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002094 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002095
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002097 PyErr_BadInternalCall();
2098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002100
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002101 /* Decoding bytes objects is the most common case and should be fast */
2102 if (PyBytes_Check(obj)) {
2103 if (PyBytes_GET_SIZE(obj) == 0) {
2104 Py_INCREF(unicode_empty);
2105 v = (PyObject *) unicode_empty;
2106 }
2107 else {
2108 v = PyUnicode_Decode(
2109 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2110 encoding, errors);
2111 }
2112 return v;
2113 }
2114
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002115 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 PyErr_SetString(PyExc_TypeError,
2117 "decoding str is not supported");
2118 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002119 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002120
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002121 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2122 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2123 PyErr_Format(PyExc_TypeError,
2124 "coercing to str: need bytes, bytearray "
2125 "or buffer-like object, %.80s found",
2126 Py_TYPE(obj)->tp_name);
2127 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002128 }
Tim Petersced69f82003-09-16 20:30:58 +00002129
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002130 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002131 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002132 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 }
Tim Petersced69f82003-09-16 20:30:58 +00002134 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002135 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002136
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002137 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002138 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139}
2140
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   (as classified by Py_ISUPPER) are converted with Py_TOLOWER and every
   '_' becomes '-', so that e.g. "UTF_8" turns into "utf-8".

   `lower_len` is the capacity of `lower` including the terminating null
   byte.  Returns 1 on success (result is null-terminated) and 0 if the
   encoding name needs more than lower_len-1 characters; in that case
   `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Index of the slot reserved for the terminating null byte. */
    const size_t last = lower_len - 1;
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        char ch = encoding[pos];

        if (pos == last)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';

        lower[pos] = ch;
    }

    lower[pos] = '\0';
    return 1;
}
2173
Alexander Belopolsky40018472011-02-26 01:02:56 +00002174PyObject *
2175PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002176 Py_ssize_t size,
2177 const char *encoding,
2178 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002179{
2180 PyObject *buffer = NULL, *unicode;
2181 Py_buffer info;
2182 char lower[11]; /* Enough for any encoding shortcut */
2183
2184 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002185 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002186
2187 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002188 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002189 if ((strcmp(lower, "utf-8") == 0) ||
2190 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002191 return PyUnicode_DecodeUTF8(s, size, errors);
2192 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002193 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002194 (strcmp(lower, "iso-8859-1") == 0))
2195 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002196#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002197 else if (strcmp(lower, "mbcs") == 0)
2198 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002199#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002200 else if (strcmp(lower, "ascii") == 0)
2201 return PyUnicode_DecodeASCII(s, size, errors);
2202 else if (strcmp(lower, "utf-16") == 0)
2203 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2204 else if (strcmp(lower, "utf-32") == 0)
2205 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207
2208 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002209 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002210 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002211 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002212 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 if (buffer == NULL)
2214 goto onError;
2215 unicode = PyCodec_Decode(buffer, encoding, errors);
2216 if (unicode == NULL)
2217 goto onError;
2218 if (!PyUnicode_Check(unicode)) {
2219 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002220 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002221 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 Py_DECREF(unicode);
2223 goto onError;
2224 }
2225 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (PyUnicode_READY(unicode)) {
2227 Py_DECREF(unicode);
2228 return NULL;
2229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002231
Benjamin Peterson29060642009-01-31 22:14:21 +00002232 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 Py_XDECREF(buffer);
2234 return NULL;
2235}
2236
Alexander Belopolsky40018472011-02-26 01:02:56 +00002237PyObject *
2238PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002239 const char *encoding,
2240 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002241{
2242 PyObject *v;
2243
2244 if (!PyUnicode_Check(unicode)) {
2245 PyErr_BadArgument();
2246 goto onError;
2247 }
2248
2249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002251
2252 /* Decode via the codec registry */
2253 v = PyCodec_Decode(unicode, encoding, errors);
2254 if (v == NULL)
2255 goto onError;
2256 return v;
2257
Benjamin Peterson29060642009-01-31 22:14:21 +00002258 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002259 return NULL;
2260}
2261
Alexander Belopolsky40018472011-02-26 01:02:56 +00002262PyObject *
2263PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002264 const char *encoding,
2265 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002266{
2267 PyObject *v;
2268
2269 if (!PyUnicode_Check(unicode)) {
2270 PyErr_BadArgument();
2271 goto onError;
2272 }
2273
2274 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002276
2277 /* Decode via the codec registry */
2278 v = PyCodec_Decode(unicode, encoding, errors);
2279 if (v == NULL)
2280 goto onError;
2281 if (!PyUnicode_Check(v)) {
2282 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002283 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002284 Py_TYPE(v)->tp_name);
2285 Py_DECREF(v);
2286 goto onError;
2287 }
2288 return v;
2289
Benjamin Peterson29060642009-01-31 22:14:21 +00002290 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002291 return NULL;
2292}
2293
Alexander Belopolsky40018472011-02-26 01:02:56 +00002294PyObject *
2295PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002296 Py_ssize_t size,
2297 const char *encoding,
2298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
2300 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002301
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 unicode = PyUnicode_FromUnicode(s, size);
2303 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2306 Py_DECREF(unicode);
2307 return v;
2308}
2309
Alexander Belopolsky40018472011-02-26 01:02:56 +00002310PyObject *
2311PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002312 const char *encoding,
2313 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002314{
2315 PyObject *v;
2316
2317 if (!PyUnicode_Check(unicode)) {
2318 PyErr_BadArgument();
2319 goto onError;
2320 }
2321
2322 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002323 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002324
2325 /* Encode via the codec registry */
2326 v = PyCodec_Encode(unicode, encoding, errors);
2327 if (v == NULL)
2328 goto onError;
2329 return v;
2330
Benjamin Peterson29060642009-01-31 22:14:21 +00002331 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002332 return NULL;
2333}
2334
Victor Stinnerad158722010-10-27 00:25:46 +00002335PyObject *
2336PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002337{
Victor Stinner99b95382011-07-04 14:23:54 +02002338#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002339 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2340 PyUnicode_GET_SIZE(unicode),
2341 NULL);
2342#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002344#else
Victor Stinner793b5312011-04-27 00:24:21 +02002345 PyInterpreterState *interp = PyThreadState_GET()->interp;
2346 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2347 cannot use it to encode and decode filenames before it is loaded. Load
2348 the Python codec requires to encode at least its own filename. Use the C
2349 version of the locale codec until the codec registry is initialized and
2350 the Python codec is loaded.
2351
2352 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2353 cannot only rely on it: check also interp->fscodec_initialized for
2354 subinterpreters. */
2355 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002356 return PyUnicode_AsEncodedString(unicode,
2357 Py_FileSystemDefaultEncoding,
2358 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002359 }
2360 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002361 /* locale encoding with surrogateescape */
2362 wchar_t *wchar;
2363 char *bytes;
2364 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002365 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002366
2367 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2368 if (wchar == NULL)
2369 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002370 bytes = _Py_wchar2char(wchar, &error_pos);
2371 if (bytes == NULL) {
2372 if (error_pos != (size_t)-1) {
2373 char *errmsg = strerror(errno);
2374 PyObject *exc = NULL;
2375 if (errmsg == NULL)
2376 errmsg = "Py_wchar2char() failed";
2377 raise_encode_exception(&exc,
2378 "filesystemencoding",
2379 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2380 error_pos, error_pos+1,
2381 errmsg);
2382 Py_XDECREF(exc);
2383 }
2384 else
2385 PyErr_NoMemory();
2386 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002387 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002388 }
2389 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002390
2391 bytes_obj = PyBytes_FromString(bytes);
2392 PyMem_Free(bytes);
2393 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002394 }
Victor Stinnerad158722010-10-27 00:25:46 +00002395#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002396}
2397
Alexander Belopolsky40018472011-02-26 01:02:56 +00002398PyObject *
2399PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002400 const char *encoding,
2401 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402{
2403 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002404 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002405
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 if (!PyUnicode_Check(unicode)) {
2407 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Fred Drakee4315f52000-05-09 19:53:39 +00002410
Victor Stinner2f283c22011-03-02 01:21:46 +00002411 if (encoding == NULL) {
2412 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002414 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002416 }
Fred Drakee4315f52000-05-09 19:53:39 +00002417
2418 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002419 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002420 if ((strcmp(lower, "utf-8") == 0) ||
2421 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002422 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002423 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002427 }
Victor Stinner37296e82010-06-10 13:36:23 +00002428 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002429 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002430 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002432#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002433 else if (strcmp(lower, "mbcs") == 0)
2434 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2435 PyUnicode_GET_SIZE(unicode),
2436 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002437#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002438 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441
2442 /* Encode via the codec registry */
2443 v = PyCodec_Encode(unicode, encoding, errors);
2444 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002445 return NULL;
2446
2447 /* The normal path */
2448 if (PyBytes_Check(v))
2449 return v;
2450
2451 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002452 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002453 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002454 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002455
2456 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2457 "encoder %s returned bytearray instead of bytes",
2458 encoding);
2459 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002460 Py_DECREF(v);
2461 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002462 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002463
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002464 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2465 Py_DECREF(v);
2466 return b;
2467 }
2468
2469 PyErr_Format(PyExc_TypeError,
2470 "encoder did not return a bytes object (type=%.400s)",
2471 Py_TYPE(v)->tp_name);
2472 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002473 return NULL;
2474}
2475
Alexander Belopolsky40018472011-02-26 01:02:56 +00002476PyObject *
2477PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002478 const char *encoding,
2479 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002480{
2481 PyObject *v;
2482
2483 if (!PyUnicode_Check(unicode)) {
2484 PyErr_BadArgument();
2485 goto onError;
2486 }
2487
2488 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002489 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002490
2491 /* Encode via the codec registry */
2492 v = PyCodec_Encode(unicode, encoding, errors);
2493 if (v == NULL)
2494 goto onError;
2495 if (!PyUnicode_Check(v)) {
2496 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002497 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002498 Py_TYPE(v)->tp_name);
2499 Py_DECREF(v);
2500 goto onError;
2501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002503
Benjamin Peterson29060642009-01-31 22:14:21 +00002504 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 return NULL;
2506}
2507
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002508PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002509PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002510 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002511 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2512}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002513
Christian Heimes5894ba72007-11-04 11:43:14 +00002514PyObject*
2515PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2516{
Victor Stinner99b95382011-07-04 14:23:54 +02002517#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002518 return PyUnicode_DecodeMBCS(s, size, NULL);
2519#elif defined(__APPLE__)
2520 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2521#else
Victor Stinner793b5312011-04-27 00:24:21 +02002522 PyInterpreterState *interp = PyThreadState_GET()->interp;
2523 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2524 cannot use it to encode and decode filenames before it is loaded. Load
2525 the Python codec requires to encode at least its own filename. Use the C
2526 version of the locale codec until the codec registry is initialized and
2527 the Python codec is loaded.
2528
2529 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2530 cannot only rely on it: check also interp->fscodec_initialized for
2531 subinterpreters. */
2532 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002533 return PyUnicode_Decode(s, size,
2534 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002535 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002536 }
2537 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002538 /* locale encoding with surrogateescape */
2539 wchar_t *wchar;
2540 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002541 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002542
2543 if (s[size] != '\0' || size != strlen(s)) {
2544 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2545 return NULL;
2546 }
2547
Victor Stinner168e1172010-10-16 23:16:16 +00002548 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002549 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002550 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002551
Victor Stinner168e1172010-10-16 23:16:16 +00002552 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002553 PyMem_Free(wchar);
2554 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002555 }
Victor Stinnerad158722010-10-27 00:25:46 +00002556#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002557}
2558
Martin v. Löwis011e8422009-05-05 04:43:17 +00002559
2560int
2561PyUnicode_FSConverter(PyObject* arg, void* addr)
2562{
2563 PyObject *output = NULL;
2564 Py_ssize_t size;
2565 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002566 if (arg == NULL) {
2567 Py_DECREF(*(PyObject**)addr);
2568 return 1;
2569 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002570 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002571 output = arg;
2572 Py_INCREF(output);
2573 }
2574 else {
2575 arg = PyUnicode_FromObject(arg);
2576 if (!arg)
2577 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002578 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002579 Py_DECREF(arg);
2580 if (!output)
2581 return 0;
2582 if (!PyBytes_Check(output)) {
2583 Py_DECREF(output);
2584 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2585 return 0;
2586 }
2587 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002588 size = PyBytes_GET_SIZE(output);
2589 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002590 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002591 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002592 Py_DECREF(output);
2593 return 0;
2594 }
2595 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002596 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002597}
2598
2599
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002600int
2601PyUnicode_FSDecoder(PyObject* arg, void* addr)
2602{
2603 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002604 if (arg == NULL) {
2605 Py_DECREF(*(PyObject**)addr);
2606 return 1;
2607 }
2608 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 if (PyUnicode_READY(arg))
2610 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002611 output = arg;
2612 Py_INCREF(output);
2613 }
2614 else {
2615 arg = PyBytes_FromObject(arg);
2616 if (!arg)
2617 return 0;
2618 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2619 PyBytes_GET_SIZE(arg));
2620 Py_DECREF(arg);
2621 if (!output)
2622 return 0;
2623 if (!PyUnicode_Check(output)) {
2624 Py_DECREF(output);
2625 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2626 return 0;
2627 }
2628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2630 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002631 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2632 Py_DECREF(output);
2633 return 0;
2634 }
2635 *(PyObject**)addr = output;
2636 return Py_CLEANUP_SUPPORTED;
2637}
2638
2639
Martin v. Löwis5b222132007-06-10 09:51:05 +00002640char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002642{
Christian Heimesf3863112007-11-22 07:46:41 +00002643 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2645
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002646 if (!PyUnicode_Check(unicode)) {
2647 PyErr_BadArgument();
2648 return NULL;
2649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002651 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652
2653 if (_PyUnicode_UTF8(unicode) == NULL) {
2654 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2655 if (bytes == NULL)
2656 return NULL;
2657 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2658 if (u->_base.utf8 == NULL) {
2659 Py_DECREF(bytes);
2660 return NULL;
2661 }
2662 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2663 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2664 Py_DECREF(bytes);
2665 }
2666
2667 if (psize)
2668 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2669 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002670}
2671
2672char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2676}
2677
2678#ifdef Py_DEBUG
2679int unicode_as_unicode_calls = 0;
2680#endif
2681
2682
2683Py_UNICODE *
2684PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2685{
2686 PyUnicodeObject *u;
2687 const unsigned char *one_byte;
2688#if SIZEOF_WCHAR_T == 4
2689 const Py_UCS2 *two_bytes;
2690#else
2691 const Py_UCS4 *four_bytes;
2692 const Py_UCS4 *ucs4_end;
2693 Py_ssize_t num_surrogates;
2694#endif
2695 wchar_t *w;
2696 wchar_t *wchar_end;
2697
2698 if (!PyUnicode_Check(unicode)) {
2699 PyErr_BadArgument();
2700 return NULL;
2701 }
2702 u = (PyUnicodeObject*)unicode;
2703 if (_PyUnicode_WSTR(u) == NULL) {
2704 /* Non-ASCII compact unicode object */
2705 assert(_PyUnicode_KIND(u) != 0);
2706 assert(PyUnicode_IS_READY(u));
2707
2708#ifdef Py_DEBUG
2709 ++unicode_as_unicode_calls;
2710#endif
2711
2712 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2713#if SIZEOF_WCHAR_T == 2
2714 four_bytes = PyUnicode_4BYTE_DATA(u);
2715 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2716 num_surrogates = 0;
2717
2718 for (; four_bytes < ucs4_end; ++four_bytes) {
2719 if (*four_bytes > 0xFFFF)
2720 ++num_surrogates;
2721 }
2722
2723 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2724 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2725 if (!_PyUnicode_WSTR(u)) {
2726 PyErr_NoMemory();
2727 return NULL;
2728 }
2729 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2730
2731 w = _PyUnicode_WSTR(u);
2732 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2733 four_bytes = PyUnicode_4BYTE_DATA(u);
2734 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2735 if (*four_bytes > 0xFFFF) {
2736 /* encode surrogate pair in this case */
2737 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2738 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2739 }
2740 else
2741 *w = *four_bytes;
2742
2743 if (w > wchar_end) {
2744 assert(0 && "Miscalculated string end");
2745 }
2746 }
2747 *w = 0;
2748#else
2749 /* sizeof(wchar_t) == 4 */
2750 Py_FatalError("Impossible unicode object state, wstr and str "
2751 "should share memory already.");
2752 return NULL;
2753#endif
2754 }
2755 else {
2756 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2757 (_PyUnicode_LENGTH(u) + 1));
2758 if (!_PyUnicode_WSTR(u)) {
2759 PyErr_NoMemory();
2760 return NULL;
2761 }
2762 if (!PyUnicode_IS_COMPACT_ASCII(u))
2763 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2764 w = _PyUnicode_WSTR(u);
2765 wchar_end = w + _PyUnicode_LENGTH(u);
2766
2767 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2768 one_byte = PyUnicode_1BYTE_DATA(u);
2769 for (; w < wchar_end; ++one_byte, ++w)
2770 *w = *one_byte;
2771 /* null-terminate the wstr */
2772 *w = 0;
2773 }
2774 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2775#if SIZEOF_WCHAR_T == 4
2776 two_bytes = PyUnicode_2BYTE_DATA(u);
2777 for (; w < wchar_end; ++two_bytes, ++w)
2778 *w = *two_bytes;
2779 /* null-terminate the wstr */
2780 *w = 0;
2781#else
2782 /* sizeof(wchar_t) == 2 */
2783 PyObject_FREE(_PyUnicode_WSTR(u));
2784 _PyUnicode_WSTR(u) = NULL;
2785 Py_FatalError("Impossible unicode object state, wstr "
2786 "and str should share memory already.");
2787 return NULL;
2788#endif
2789 }
2790 else {
2791 assert(0 && "This should never happen.");
2792 }
2793 }
2794 }
2795 if (size != NULL)
2796 *size = PyUnicode_WSTR_LENGTH(u);
2797 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002798}
2799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800Py_UNICODE *
2801PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804}
2805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806
Alexander Belopolsky40018472011-02-26 01:02:56 +00002807Py_ssize_t
2808PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809{
2810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
2812 goto onError;
2813 }
2814 return PyUnicode_GET_SIZE(unicode);
2815
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 return -1;
2818}
2819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820Py_ssize_t
2821PyUnicode_GetLength(PyObject *unicode)
2822{
2823 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2824 PyErr_BadArgument();
2825 return -1;
2826 }
2827
2828 return PyUnicode_GET_LENGTH(unicode);
2829}
2830
2831Py_UCS4
2832PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2833{
2834 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2835 return PyErr_BadArgument();
2836 return (Py_UCS4)-1;
2837 }
2838 return PyUnicode_READ_CHAR(unicode, index);
2839}
2840
2841int
2842PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2843{
2844 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2845 return PyErr_BadArgument();
2846 return -1;
2847 }
2848
2849 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2850 index, ch);
2851 return 0;
2852}
2853
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2859
Victor Stinner554f3f02010-06-16 23:33:54 +00002860/* create or adjust a UnicodeDecodeError */
2861static void
2862make_decode_exception(PyObject **exceptionObject,
2863 const char *encoding,
2864 const char *input, Py_ssize_t length,
2865 Py_ssize_t startpos, Py_ssize_t endpos,
2866 const char *reason)
2867{
2868 if (*exceptionObject == NULL) {
2869 *exceptionObject = PyUnicodeDecodeError_Create(
2870 encoding, input, length, startpos, endpos, reason);
2871 }
2872 else {
2873 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2874 goto onError;
2875 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2876 goto onError;
2877 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2878 goto onError;
2879 }
2880 return;
2881
2882onError:
2883 Py_DECREF(*exceptionObject);
2884 *exceptionObject = NULL;
2885}
2886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887/* error handling callback helper:
2888 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002889 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002890 and adjust various state variables.
2891 return 0 on success, -1 on error
2892*/
2893
Alexander Belopolsky40018472011-02-26 01:02:56 +00002894static int
2895unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002896 const char *encoding, const char *reason,
2897 const char **input, const char **inend, Py_ssize_t *startinpos,
2898 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2899 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002901 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002902
2903 PyObject *restuple = NULL;
2904 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002905 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002906 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002907 Py_ssize_t requiredsize;
2908 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002909 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002910 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002911 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002912 int res = -1;
2913
2914 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 *errorHandler = PyCodec_LookupError(errors);
2916 if (*errorHandler == NULL)
2917 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 }
2919
Victor Stinner554f3f02010-06-16 23:33:54 +00002920 make_decode_exception(exceptionObject,
2921 encoding,
2922 *input, *inend - *input,
2923 *startinpos, *endinpos,
2924 reason);
2925 if (*exceptionObject == NULL)
2926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002927
2928 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2929 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002931 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002932 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934 }
2935 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002936 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002937
2938 /* Copy back the bytes variables, which might have been modified by the
2939 callback */
2940 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2941 if (!inputobj)
2942 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002943 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002944 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002945 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002946 *input = PyBytes_AS_STRING(inputobj);
2947 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002948 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002949 /* we can DECREF safely, as the exception has another reference,
2950 so the object won't go away. */
2951 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002955 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2957 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959
2960 /* need more space? (at least enough for what we
2961 have+the replacement+the rest of the string (starting
2962 at the new input position), so we won't have to check space
2963 when there are no errors in the rest of the string) */
2964 repptr = PyUnicode_AS_UNICODE(repunicode);
2965 repsize = PyUnicode_GET_SIZE(repunicode);
2966 requiredsize = *outpos + repsize + insize-newpos;
2967 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 if (requiredsize<2*outsize)
2969 requiredsize = 2*outsize;
2970 if (_PyUnicode_Resize(output, requiredsize) < 0)
2971 goto onError;
2972 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002973 }
2974 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002975 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002976 Py_UNICODE_COPY(*outptr, repptr, repsize);
2977 *outptr += repsize;
2978 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002980 /* we made it! */
2981 res = 0;
2982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002984 Py_XDECREF(restuple);
2985 return res;
2986}
2987
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002988/* --- UTF-7 Codec -------------------------------------------------------- */
2989
Antoine Pitrou244651a2009-05-04 18:56:13 +00002990/* See RFC2152 for details. We encode conservatively and decode liberally. */
2991
2992/* Three simple macros defining base-64. */
2993
2994/* Is c a base-64 character? */
2995
2996#define IS_BASE64(c) \
2997 (((c) >= 'A' && (c) <= 'Z') || \
2998 ((c) >= 'a' && (c) <= 'z') || \
2999 ((c) >= '0' && (c) <= '9') || \
3000 (c) == '+' || (c) == '/')
3001
3002/* given that c is a base-64 character, what is its base-64 value? */
3003
3004#define FROM_BASE64(c) \
3005 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
3006 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
3007 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
3008 (c) == '+' ? 62 : 63)
3009
3010/* What is the base-64 character of the bottom 6 bits of n? */
3011
3012#define TO_BASE64(n) \
3013 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3014
3015/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
3016 * decoded as itself. We are permissive on decoding; the only ASCII
3017 * byte not decoding to itself is the + which begins a base64
3018 * string. */
3019
3020#define DECODE_DIRECT(c) \
3021 ((c) <= 127 && (c) != '+')
3022
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3056
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character value; only 1..127 can ever be direct (NUL and
 *              anything >= 128 always yield 0)
 *   directO  - non-zero to let category 1 ("Set O") through unencoded
 *   directWS - non-zero to let category 2 (whitespace) through unencoded
 *
 * Category 0 ("Set D") is always direct.  Every parameter is
 * parenthesized in the expansion so that an argument such as `a || b`
 * is treated as a single flag; c is still evaluated several times and
 * must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003068
Alexander Belopolsky40018472011-02-26 01:02:56 +00003069PyObject *
3070PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003071 Py_ssize_t size,
3072 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003073{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003074 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3075}
3076
Antoine Pitrou244651a2009-05-04 18:56:13 +00003077/* The decoder. The only state we preserve is our read position,
3078 * i.e. how many characters we have consumed. So if we end in the
3079 * middle of a shift sequence we have to back off the read position
3080 * and the output to the beginning of the sequence, otherwise we lose
3081 * all the shift state (seen bits, number of bits seen, high
3082 * surrogate). */
3083
Alexander Belopolsky40018472011-02-26 01:02:56 +00003084PyObject *
3085PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003086 Py_ssize_t size,
3087 const char *errors,
3088 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003089{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003090 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003091 Py_ssize_t startinpos;
3092 Py_ssize_t endinpos;
3093 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003094 const char *e;
3095 PyUnicodeObject *unicode;
3096 Py_UNICODE *p;
3097 const char *errmsg = "";
3098 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003099 Py_UNICODE *shiftOutStart;
3100 unsigned int base64bits = 0;
3101 unsigned long base64buffer = 0;
3102 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 PyObject *errorHandler = NULL;
3104 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003105
3106 unicode = _PyUnicode_New(size);
3107 if (!unicode)
3108 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003109 if (size == 0) {
3110 if (consumed)
3111 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003112 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003113 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003116 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003117 e = s + size;
3118
3119 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003122 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003123
Antoine Pitrou244651a2009-05-04 18:56:13 +00003124 if (inShift) { /* in a base-64 section */
3125 if (IS_BASE64(ch)) { /* consume a base-64 character */
3126 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3127 base64bits += 6;
3128 s++;
3129 if (base64bits >= 16) {
3130 /* we have enough bits for a UTF-16 value */
3131 Py_UNICODE outCh = (Py_UNICODE)
3132 (base64buffer >> (base64bits-16));
3133 base64bits -= 16;
3134 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3135 if (surrogate) {
3136 /* expecting a second surrogate */
3137 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3138#ifdef Py_UNICODE_WIDE
3139 *p++ = (((surrogate & 0x3FF)<<10)
3140 | (outCh & 0x3FF)) + 0x10000;
3141#else
3142 *p++ = surrogate;
3143 *p++ = outCh;
3144#endif
3145 surrogate = 0;
3146 }
3147 else {
3148 surrogate = 0;
3149 errmsg = "second surrogate missing";
3150 goto utf7Error;
3151 }
3152 }
3153 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3154 /* first surrogate */
3155 surrogate = outCh;
3156 }
3157 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3158 errmsg = "unexpected second surrogate";
3159 goto utf7Error;
3160 }
3161 else {
3162 *p++ = outCh;
3163 }
3164 }
3165 }
3166 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003167 inShift = 0;
3168 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003169 if (surrogate) {
3170 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003171 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003172 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003173 if (base64bits > 0) { /* left-over bits */
3174 if (base64bits >= 6) {
3175 /* We've seen at least one base-64 character */
3176 errmsg = "partial character in shift sequence";
3177 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003178 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003179 else {
3180 /* Some bits remain; they should be zero */
3181 if (base64buffer != 0) {
3182 errmsg = "non-zero padding bits in shift sequence";
3183 goto utf7Error;
3184 }
3185 }
3186 }
3187 if (ch != '-') {
3188 /* '-' is absorbed; other terminating
3189 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003190 *p++ = ch;
3191 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003192 }
3193 }
3194 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003195 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003196 s++; /* consume '+' */
3197 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003198 s++;
3199 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003200 }
3201 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003202 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003203 shiftOutStart = p;
3204 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003205 }
3206 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003207 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003208 *p++ = ch;
3209 s++;
3210 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003211 else {
3212 startinpos = s-starts;
3213 s++;
3214 errmsg = "unexpected special character";
3215 goto utf7Error;
3216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003217 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003218utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 outpos = p-PyUnicode_AS_UNICODE(unicode);
3220 endinpos = s-starts;
3221 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 errors, &errorHandler,
3223 "utf7", errmsg,
3224 &starts, &e, &startinpos, &endinpos, &exc, &s,
3225 &unicode, &outpos, &p))
3226 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003227 }
3228
Antoine Pitrou244651a2009-05-04 18:56:13 +00003229 /* end of string */
3230
3231 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3232 /* if we're in an inconsistent state, that's an error */
3233 if (surrogate ||
3234 (base64bits >= 6) ||
3235 (base64bits > 0 && base64buffer != 0)) {
3236 outpos = p-PyUnicode_AS_UNICODE(unicode);
3237 endinpos = size;
3238 if (unicode_decode_call_errorhandler(
3239 errors, &errorHandler,
3240 "utf7", "unterminated shift sequence",
3241 &starts, &e, &startinpos, &endinpos, &exc, &s,
3242 &unicode, &outpos, &p))
3243 goto onError;
3244 if (s < e)
3245 goto restart;
3246 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003247 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003248
3249 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003250 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003251 if (inShift) {
3252 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003253 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003254 }
3255 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003256 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003257 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003258 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003259
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003260 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003261 goto onError;
3262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003265 if (PyUnicode_READY(unicode) == -1) {
3266 Py_DECREF(unicode);
3267 return NULL;
3268 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003269 return (PyObject *)unicode;
3270
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003274 Py_DECREF(unicode);
3275 return NULL;
3276}
3277
3278
Alexander Belopolsky40018472011-02-26 01:02:56 +00003279PyObject *
3280PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003281 Py_ssize_t size,
3282 int base64SetO,
3283 int base64WhiteSpace,
3284 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003285{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003286 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003287 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003288 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003289 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003290 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003291 unsigned int base64bits = 0;
3292 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003293 char * out;
3294 char * start;
3295
3296 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003298
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003299 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003300 return PyErr_NoMemory();
3301
Antoine Pitrou244651a2009-05-04 18:56:13 +00003302 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003303 if (v == NULL)
3304 return NULL;
3305
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003306 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003307 for (;i < size; ++i) {
3308 Py_UNICODE ch = s[i];
3309
Antoine Pitrou244651a2009-05-04 18:56:13 +00003310 if (inShift) {
3311 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3312 /* shifting out */
3313 if (base64bits) { /* output remaining bits */
3314 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3315 base64buffer = 0;
3316 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003317 }
3318 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003319 /* Characters not in the BASE64 set implicitly unshift the sequence
3320 so no '-' is required, except if the character is itself a '-' */
3321 if (IS_BASE64(ch) || ch == '-') {
3322 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003324 *out++ = (char) ch;
3325 }
3326 else {
3327 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003328 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003329 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003330 else { /* not in a shift sequence */
3331 if (ch == '+') {
3332 *out++ = '+';
3333 *out++ = '-';
3334 }
3335 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3336 *out++ = (char) ch;
3337 }
3338 else {
3339 *out++ = '+';
3340 inShift = 1;
3341 goto encode_char;
3342 }
3343 }
3344 continue;
3345encode_char:
3346#ifdef Py_UNICODE_WIDE
3347 if (ch >= 0x10000) {
3348 /* code first surrogate */
3349 base64bits += 16;
3350 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3351 while (base64bits >= 6) {
3352 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3353 base64bits -= 6;
3354 }
3355 /* prepare second surrogate */
3356 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3357 }
3358#endif
3359 base64bits += 16;
3360 base64buffer = (base64buffer << 16) | ch;
3361 while (base64bits >= 6) {
3362 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3363 base64bits -= 6;
3364 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003365 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003366 if (base64bits)
3367 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3368 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003369 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003370 if (_PyBytes_Resize(&v, out - start) < 0)
3371 return NULL;
3372 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003373}
3374
Antoine Pitrou244651a2009-05-04 18:56:13 +00003375#undef IS_BASE64
3376#undef FROM_BASE64
3377#undef TO_BASE64
3378#undef DECODE_DIRECT
3379#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003380
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381/* --- UTF-8 Codec -------------------------------------------------------- */
3382
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3404
Alexander Belopolsky40018472011-02-26 01:02:56 +00003405PyObject *
3406PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003407 Py_ssize_t size,
3408 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409{
Walter Dörwald69652032004-09-07 20:24:22 +00003410 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3411}
3412
Antoine Pitrouab868312009-01-10 15:40:25 +00003413/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3414#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3415
3416/* Mask to quickly check whether a C 'long' contains a
3417 non-ASCII, UTF8-encoded char. */
3418#if (SIZEOF_LONG == 8)
3419# define ASCII_CHAR_MASK 0x8080808080808080L
3420#elif (SIZEOF_LONG == 4)
3421# define ASCII_CHAR_MASK 0x80808080L
3422#else
3423# error C 'long' size should be either 4 or 8!
3424#endif
3425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426/* Scans a UTF-8 string and returns the maximum character to be expected,
3427 the size of the decoded unicode string and if any major errors were
3428 encountered.
3429
3430 This function does check basic UTF-8 sanity, it does however NOT CHECK
3431 if the string contains surrogates, and if all continuation bytes are
3432 within the correct ranges, these checks are performed in
3433 PyUnicode_DecodeUTF8Stateful.
3434
3435 If it sets has_errors to 1, it means the value of unicode_size and max_char
3436 will be bogus and you should not rely on useful information in them.
3437 */
3438static Py_UCS4
3439utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3440 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3441 int *has_errors)
3442{
3443 Py_ssize_t n;
3444 Py_ssize_t char_count = 0;
3445 Py_UCS4 max_char = 127, new_max;
3446 Py_UCS4 upper_bound;
3447 const unsigned char *p = (const unsigned char *)s;
3448 const unsigned char *end = p + string_size;
3449 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3450 int err = 0;
3451
3452 for (; p < end && !err; ++p, ++char_count) {
3453 /* Only check value if it's not a ASCII char... */
3454 if (*p < 0x80) {
3455 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3456 an explanation. */
3457 if (!((size_t) p & LONG_PTR_MASK)) {
3458 /* Help register allocation */
3459 register const unsigned char *_p = p;
3460 while (_p < aligned_end) {
3461 unsigned long value = *(unsigned long *) _p;
3462 if (value & ASCII_CHAR_MASK)
3463 break;
3464 _p += SIZEOF_LONG;
3465 char_count += SIZEOF_LONG;
3466 }
3467 p = _p;
3468 if (p == end)
3469 break;
3470 }
3471 }
3472 if (*p >= 0x80) {
3473 n = utf8_code_length[*p];
3474 new_max = max_char;
3475 switch (n) {
3476 /* invalid start byte */
3477 case 0:
3478 err = 1;
3479 break;
3480 case 2:
3481 /* Code points between 0x00FF and 0x07FF inclusive.
3482 Approximate the upper bound of the code point,
3483 if this flips over 255 we can be sure it will be more
3484 than 255 and the string will need 2 bytes per code coint,
3485 if it stays under or equal to 255, we can be sure 1 byte
3486 is enough.
3487 ((*p & 0b00011111) << 6) | 0b00111111 */
3488 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3489 if (max_char < upper_bound)
3490 new_max = upper_bound;
3491 /* Ensure we track at least that we left ASCII space. */
3492 if (new_max < 128)
3493 new_max = 128;
3494 break;
3495 case 3:
3496 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3497 always > 255 and <= 65535 and will always need 2 bytes. */
3498 if (max_char < 65535)
3499 new_max = 65535;
3500 break;
3501 case 4:
3502 /* Code point will be above 0xFFFF for sure in this case. */
3503 new_max = 65537;
3504 break;
3505 /* Internal error, this should be caught by the first if */
3506 case 1:
3507 default:
3508 assert(0 && "Impossible case in utf8_max_char_and_size");
3509 err = 1;
3510 }
3511 /* Instead of number of overall bytes for this code point,
3512 n containts the number of following bytes: */
3513 --n;
3514 /* Check if the follow up chars are all valid continuation bytes */
3515 if (n >= 1) {
3516 const unsigned char *cont;
3517 if ((p + n) >= end) {
3518 if (consumed == 0)
3519 /* incomplete data, non-incremental decoding */
3520 err = 1;
3521 break;
3522 }
3523 for (cont = p + 1; cont < (p + n); ++cont) {
3524 if ((*cont & 0xc0) != 0x80) {
3525 err = 1;
3526 break;
3527 }
3528 }
3529 p += n;
3530 }
3531 else
3532 err = 1;
3533 max_char = new_max;
3534 }
3535 }
3536
3537 if (unicode_size)
3538 *unicode_size = char_count;
3539 if (has_errors)
3540 *has_errors = err;
3541 return max_char;
3542}
3543
/* Store `value` at position `index` of the character buffer `buf`, whose
   element type is selected by `kind`: Py_UNICODE for PyUnicode_WCHAR_KIND
   (the wstr field of the legacy representation), unsigned char for
   PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for
   anything else.  `kind` is evaluated once; exactly one branch runs, so
   `buf`, `index` and `value` are each evaluated once as well. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int wkind_ = (kind); \
        if (wkind_ == PyUnicode_WCHAR_KIND) { \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        } \
        else if (wkind_ == PyUnicode_1BYTE_KIND) { \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        } \
        else if (wkind_ == PyUnicode_2BYTE_KIND) { \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        } \
        else { \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
        } \
    } while (0)
3558
Alexander Belopolsky40018472011-02-26 01:02:56 +00003559PyObject *
3560PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561 Py_ssize_t size,
3562 const char *errors,
3563 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003567 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t startinpos;
3569 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003570 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003572 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 PyObject *errorHandler = NULL;
3574 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003575 Py_UCS4 maxchar = 0;
3576 Py_ssize_t unicode_size;
3577 Py_ssize_t i;
3578 int kind;
3579 void *data;
3580 int has_errors;
3581 Py_UNICODE *error_outptr;
3582#if SIZEOF_WCHAR_T == 2
3583 Py_ssize_t wchar_offset = 0;
3584#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585
Walter Dörwald69652032004-09-07 20:24:22 +00003586 if (size == 0) {
3587 if (consumed)
3588 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3592 consumed, &has_errors);
3593 if (has_errors) {
3594 unicode = _PyUnicode_New(size);
3595 if (!unicode)
3596 return NULL;
3597 kind = PyUnicode_WCHAR_KIND;
3598 data = PyUnicode_AS_UNICODE(unicode);
3599 assert(data != NULL);
3600 }
3601 else {
3602 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3603 if (!unicode)
3604 return NULL;
3605 /* When the string is ASCII only, just use memcpy and return.
3606 unicode_size may be != size if there is an incomplete UTF-8
3607 sequence at the end of the ASCII block. */
3608 if (maxchar < 128 && size == unicode_size) {
3609 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3610 return (PyObject *)unicode;
3611 }
3612 kind = PyUnicode_KIND(unicode);
3613 data = PyUnicode_DATA(unicode);
3614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003618 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619
3620 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003621 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622
3623 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003624 /* Fast path for runs of ASCII characters. Given that common UTF-8
3625 input will consist of an overwhelming majority of ASCII
3626 characters, we try to optimize for this case by checking
3627 as many characters as a C 'long' can contain.
3628 First, check if we can do an aligned read, as most CPUs have
3629 a penalty for unaligned reads.
3630 */
3631 if (!((size_t) s & LONG_PTR_MASK)) {
3632 /* Help register allocation */
3633 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003634 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003635 while (_s < aligned_end) {
3636 /* Read a whole long at a time (either 4 or 8 bytes),
3637 and do a fast unrolled copy if it only contains ASCII
3638 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003639 unsigned long value = *(unsigned long *) _s;
3640 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003641 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003642 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3643 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3644 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3645 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003646#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3648 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3649 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3650 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003651#endif
3652 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003653 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003654 }
3655 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003657 if (s == e)
3658 break;
3659 ch = (unsigned char)*s;
3660 }
3661 }
3662
3663 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003664 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 s++;
3666 continue;
3667 }
3668
3669 n = utf8_code_length[ch];
3670
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003671 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 if (consumed)
3673 break;
3674 else {
3675 errmsg = "unexpected end of data";
3676 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003677 endinpos = startinpos+1;
3678 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3679 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 goto utf8Error;
3681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683
3684 switch (n) {
3685
3686 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003687 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 startinpos = s-starts;
3689 endinpos = startinpos+1;
3690 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691
3692 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003693 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 startinpos = s-starts;
3695 endinpos = startinpos+1;
3696 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697
3698 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003699 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003700 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003702 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 goto utf8Error;
3704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003706 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 break;
3709
3710 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003711 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3712 will result in surrogates in range d800-dfff. Surrogates are
3713 not valid UTF-8 so they are rejected.
3714 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3715 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003716 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003717 (s[2] & 0xc0) != 0x80 ||
3718 ((unsigned char)s[0] == 0xE0 &&
3719 (unsigned char)s[1] < 0xA0) ||
3720 ((unsigned char)s[0] == 0xED &&
3721 (unsigned char)s[1] > 0x9F)) {
3722 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003724 endinpos = startinpos + 1;
3725
3726 /* if s[1] first two bits are 1 and 0, then the invalid
3727 continuation byte is s[2], so increment endinpos by 1,
3728 if not, s[1] is invalid and endinpos doesn't need to
3729 be incremented. */
3730 if ((s[1] & 0xC0) == 0x80)
3731 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 goto utf8Error;
3733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003735 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003737 break;
3738
3739 case 4:
3740 if ((s[1] & 0xc0) != 0x80 ||
3741 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003742 (s[3] & 0xc0) != 0x80 ||
3743 ((unsigned char)s[0] == 0xF0 &&
3744 (unsigned char)s[1] < 0x90) ||
3745 ((unsigned char)s[0] == 0xF4 &&
3746 (unsigned char)s[1] > 0x8F)) {
3747 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003749 endinpos = startinpos + 1;
3750 if ((s[1] & 0xC0) == 0x80) {
3751 endinpos++;
3752 if ((s[2] & 0xC0) == 0x80)
3753 endinpos++;
3754 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 goto utf8Error;
3756 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003757 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003758 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3759 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 /* If the string is flexible or we have native UCS-4, write
3762 directly.. */
3763 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3764 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 else {
3767 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 /* translate from 10000..10FFFF to 0..FFFF */
3770 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772 /* high surrogate = top 10 bits added to D800 */
3773 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3774 (Py_UNICODE)(0xD800 + (ch >> 10)));
3775
3776 /* low surrogate = bottom 10 bits added to DC00 */
3777 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3778 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3779 }
3780#if SIZEOF_WCHAR_T == 2
3781 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003782#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 }
3785 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003786 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003787
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 /* If this is not yet a resizable string, make it one.. */
3790 if (kind != PyUnicode_WCHAR_KIND) {
3791 const Py_UNICODE *u;
3792 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3793 if (!new_unicode)
3794 goto onError;
3795 u = PyUnicode_AsUnicode((PyObject *)unicode);
3796 if (!u)
3797 goto onError;
3798#if SIZEOF_WCHAR_T == 2
3799 i += wchar_offset;
3800#endif
3801 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3802 Py_DECREF(unicode);
3803 unicode = new_unicode;
3804 kind = 0;
3805 data = PyUnicode_AS_UNICODE(new_unicode);
3806 assert(data != NULL);
3807 }
3808 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003809 if (unicode_decode_call_errorhandler(
3810 errors, &errorHandler,
3811 "utf8", errmsg,
3812 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 /* Update data because unicode_decode_call_errorhandler might have
3816 re-created or resized the unicode object. */
3817 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 /* Ensure the unicode_size calculation above was correct: */
3821 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3822
Walter Dörwald69652032004-09-07 20:24:22 +00003823 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 /* Adjust length and ready string when it contained errors and
3827 is of the old resizable kind. */
3828 if (kind == PyUnicode_WCHAR_KIND) {
3829 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3830 PyUnicode_READY(unicode) == -1)
3831 goto onError;
3832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_XDECREF(errorHandler);
3835 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 if (PyUnicode_READY(unicode) == -1) {
3837 Py_DECREF(unicode);
3838 return NULL;
3839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 return (PyObject *)unicode;
3841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003843 Py_XDECREF(errorHandler);
3844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 Py_DECREF(unicode);
3846 return NULL;
3847}
3848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003850
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003851#ifdef __APPLE__
3852
3853/* Simplified UTF-8 decoder using surrogateescape error handler,
3854 used to decode the command line arguments on Mac OS X. */
3855
3856wchar_t*
3857_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3858{
3859 int n;
3860 const char *e;
3861 wchar_t *unicode, *p;
3862
3863 /* Note: size will always be longer than the resulting Unicode
3864 character count */
3865 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3866 PyErr_NoMemory();
3867 return NULL;
3868 }
3869 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3870 if (!unicode)
3871 return NULL;
3872
3873 /* Unpack UTF-8 encoded data */
3874 p = unicode;
3875 e = s + size;
3876 while (s < e) {
3877 Py_UCS4 ch = (unsigned char)*s;
3878
3879 if (ch < 0x80) {
3880 *p++ = (wchar_t)ch;
3881 s++;
3882 continue;
3883 }
3884
3885 n = utf8_code_length[ch];
3886 if (s + n > e) {
3887 goto surrogateescape;
3888 }
3889
3890 switch (n) {
3891 case 0:
3892 case 1:
3893 goto surrogateescape;
3894
3895 case 2:
3896 if ((s[1] & 0xc0) != 0x80)
3897 goto surrogateescape;
3898 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3899 assert ((ch > 0x007F) && (ch <= 0x07FF));
3900 *p++ = (wchar_t)ch;
3901 break;
3902
3903 case 3:
3904 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3905 will result in surrogates in range d800-dfff. Surrogates are
3906 not valid UTF-8 so they are rejected.
3907 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3908 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3909 if ((s[1] & 0xc0) != 0x80 ||
3910 (s[2] & 0xc0) != 0x80 ||
3911 ((unsigned char)s[0] == 0xE0 &&
3912 (unsigned char)s[1] < 0xA0) ||
3913 ((unsigned char)s[0] == 0xED &&
3914 (unsigned char)s[1] > 0x9F)) {
3915
3916 goto surrogateescape;
3917 }
3918 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3919 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003921 break;
3922
3923 case 4:
3924 if ((s[1] & 0xc0) != 0x80 ||
3925 (s[2] & 0xc0) != 0x80 ||
3926 (s[3] & 0xc0) != 0x80 ||
3927 ((unsigned char)s[0] == 0xF0 &&
3928 (unsigned char)s[1] < 0x90) ||
3929 ((unsigned char)s[0] == 0xF4 &&
3930 (unsigned char)s[1] > 0x8F)) {
3931 goto surrogateescape;
3932 }
3933 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3934 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3935 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3936
3937#if SIZEOF_WCHAR_T == 4
3938 *p++ = (wchar_t)ch;
3939#else
3940 /* compute and append the two surrogates: */
3941
3942 /* translate from 10000..10FFFF to 0..FFFF */
3943 ch -= 0x10000;
3944
3945 /* high surrogate = top 10 bits added to D800 */
3946 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3947
3948 /* low surrogate = bottom 10 bits added to DC00 */
3949 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3950#endif
3951 break;
3952 }
3953 s += n;
3954 continue;
3955
3956 surrogateescape:
3957 *p++ = 0xDC00 + ch;
3958 s++;
3959 }
3960 *p = L'\0';
3961 return unicode;
3962}
3963
3964#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966/* Primary internal function which creates utf8 encoded bytes objects.
3967
3968 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003969 and allocate exactly as much space needed at the end. Else allocate the
3970 maximum possible needed (4 result bytes per Unicode character), and return
3971 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003972*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003973PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975{
Tim Peters602f7402002-04-27 18:03:26 +00003976#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003977
Guido van Rossum98297ee2007-11-06 21:34:58 +00003978 Py_ssize_t i; /* index into s of next input byte */
3979 PyObject *result; /* result string object */
3980 char *p; /* next free byte in output buffer */
3981 Py_ssize_t nallocated; /* number of result bytes allocated */
3982 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003983 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003984 PyObject *errorHandler = NULL;
3985 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 int kind;
3987 void *data;
3988 Py_ssize_t size;
3989 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3990#if SIZEOF_WCHAR_T == 2
3991 Py_ssize_t wchar_offset = 0;
3992#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 if (!PyUnicode_Check(unicode)) {
3995 PyErr_BadArgument();
3996 return NULL;
3997 }
3998
3999 if (PyUnicode_READY(unicode) == -1)
4000 return NULL;
4001
4002 if (_PyUnicode_UTF8(unicode))
4003 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
4004 _PyUnicode_UTF8_LENGTH(unicode));
4005
4006 kind = PyUnicode_KIND(unicode);
4007 data = PyUnicode_DATA(unicode);
4008 size = PyUnicode_GET_LENGTH(unicode);
4009
Tim Peters602f7402002-04-27 18:03:26 +00004010 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011
Tim Peters602f7402002-04-27 18:03:26 +00004012 if (size <= MAX_SHORT_UNICHARS) {
4013 /* Write into the stack buffer; nallocated can't overflow.
4014 * At the end, we'll allocate exactly as much heap space as it
4015 * turns out we need.
4016 */
4017 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004018 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004019 p = stackbuf;
4020 }
4021 else {
4022 /* Overallocate on the heap, and give the excess back at the end. */
4023 nallocated = size * 4;
4024 if (nallocated / 4 != size) /* overflow! */
4025 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004026 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004027 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004028 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004029 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004030 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004031
Tim Peters602f7402002-04-27 18:03:26 +00004032 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004034
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004035 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004036 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004040 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004041 *p++ = (char)(0xc0 | (ch >> 6));
4042 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004043 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 Py_ssize_t newpos;
4045 PyObject *rep;
4046 Py_ssize_t repsize, k, startpos;
4047 startpos = i-1;
4048#if SIZEOF_WCHAR_T == 2
4049 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004050#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 rep = unicode_encode_call_errorhandler(
4052 errors, &errorHandler, "utf-8", "surrogates not allowed",
4053 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4054 &exc, startpos, startpos+1, &newpos);
4055 if (!rep)
4056 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 if (PyBytes_Check(rep))
4059 repsize = PyBytes_GET_SIZE(rep);
4060 else
4061 repsize = PyUnicode_GET_SIZE(rep);
4062
4063 if (repsize > 4) {
4064 Py_ssize_t offset;
4065
4066 if (result == NULL)
4067 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004068 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4072 /* integer overflow */
4073 PyErr_NoMemory();
4074 goto error;
4075 }
4076 nallocated += repsize - 4;
4077 if (result != NULL) {
4078 if (_PyBytes_Resize(&result, nallocated) < 0)
4079 goto error;
4080 } else {
4081 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004082 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 goto error;
4084 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4085 }
4086 p = PyBytes_AS_STRING(result) + offset;
4087 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 if (PyBytes_Check(rep)) {
4090 char *prep = PyBytes_AS_STRING(rep);
4091 for(k = repsize; k > 0; k--)
4092 *p++ = *prep++;
4093 } else /* rep is unicode */ {
4094 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4095 Py_UNICODE c;
4096
4097 for(k=0; k<repsize; k++) {
4098 c = prep[k];
4099 if (0x80 <= c) {
4100 raise_encode_exception(&exc, "utf-8",
4101 PyUnicode_AS_UNICODE(unicode),
4102 size, i-1, i,
4103 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004104 goto error;
4105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004107 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004110 } else if (ch < 0x10000) {
4111 *p++ = (char)(0xe0 | (ch >> 12));
4112 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4113 *p++ = (char)(0x80 | (ch & 0x3f));
4114 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004115 /* Encode UCS4 Unicode ordinals */
4116 *p++ = (char)(0xf0 | (ch >> 18));
4117 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4118 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4119 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120#if SIZEOF_WCHAR_T == 2
4121 wchar_offset++;
4122#endif
Tim Peters602f7402002-04-27 18:03:26 +00004123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004125
Guido van Rossum98297ee2007-11-06 21:34:58 +00004126 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004127 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004128 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004129 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004130 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004131 }
4132 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004133 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004134 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004135 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004136 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004139 Py_XDECREF(errorHandler);
4140 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004141 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004142 error:
4143 Py_XDECREF(errorHandler);
4144 Py_XDECREF(exc);
4145 Py_XDECREF(result);
4146 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004147
Tim Peters602f7402002-04-27 18:03:26 +00004148#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149}
4150
Alexander Belopolsky40018472011-02-26 01:02:56 +00004151PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4153 Py_ssize_t size,
4154 const char *errors)
4155{
4156 PyObject *v, *unicode;
4157
4158 unicode = PyUnicode_FromUnicode(s, size);
4159 if (unicode == NULL)
4160 return NULL;
4161 v = _PyUnicode_AsUTF8String(unicode, errors);
4162 Py_DECREF(unicode);
4163 return v;
4164}
4165
4166PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004167PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170}
4171
Walter Dörwald41980ca2007-08-16 21:55:45 +00004172/* --- UTF-32 Codec ------------------------------------------------------- */
4173
4174PyObject *
4175PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 Py_ssize_t size,
4177 const char *errors,
4178 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004179{
4180 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4181}
4182
4183PyObject *
4184PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 Py_ssize_t size,
4186 const char *errors,
4187 int *byteorder,
4188 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004189{
4190 const char *starts = s;
4191 Py_ssize_t startinpos;
4192 Py_ssize_t endinpos;
4193 Py_ssize_t outpos;
4194 PyUnicodeObject *unicode;
4195 Py_UNICODE *p;
4196#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004197 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004198 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004199#else
4200 const int pairs = 0;
4201#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004202 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004203 int bo = 0; /* assume native ordering by default */
4204 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004205 /* Offsets from q for retrieving bytes in the right order. */
4206#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4207 int iorder[] = {0, 1, 2, 3};
4208#else
4209 int iorder[] = {3, 2, 1, 0};
4210#endif
4211 PyObject *errorHandler = NULL;
4212 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004213
Walter Dörwald41980ca2007-08-16 21:55:45 +00004214 q = (unsigned char *)s;
4215 e = q + size;
4216
4217 if (byteorder)
4218 bo = *byteorder;
4219
4220 /* Check for BOM marks (U+FEFF) in the input and adjust current
4221 byte order setting accordingly. In native mode, the leading BOM
4222 mark is skipped, in all other modes, it is copied to the output
4223 stream as-is (giving a ZWNBSP character). */
4224 if (bo == 0) {
4225 if (size >= 4) {
4226 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004228#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 if (bom == 0x0000FEFF) {
4230 q += 4;
4231 bo = -1;
4232 }
4233 else if (bom == 0xFFFE0000) {
4234 q += 4;
4235 bo = 1;
4236 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004237#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 if (bom == 0x0000FEFF) {
4239 q += 4;
4240 bo = 1;
4241 }
4242 else if (bom == 0xFFFE0000) {
4243 q += 4;
4244 bo = -1;
4245 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004246#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004248 }
4249
4250 if (bo == -1) {
4251 /* force LE */
4252 iorder[0] = 0;
4253 iorder[1] = 1;
4254 iorder[2] = 2;
4255 iorder[3] = 3;
4256 }
4257 else if (bo == 1) {
4258 /* force BE */
4259 iorder[0] = 3;
4260 iorder[1] = 2;
4261 iorder[2] = 1;
4262 iorder[3] = 0;
4263 }
4264
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004265 /* On narrow builds we split characters outside the BMP into two
4266 codepoints => count how much extra space we need. */
4267#ifndef Py_UNICODE_WIDE
4268 for (qq = q; qq < e; qq += 4)
4269 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4270 pairs++;
4271#endif
4272
4273 /* This might be one to much, because of a BOM */
4274 unicode = _PyUnicode_New((size+3)/4+pairs);
4275 if (!unicode)
4276 return NULL;
4277 if (size == 0)
4278 return (PyObject *)unicode;
4279
4280 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004282
Walter Dörwald41980ca2007-08-16 21:55:45 +00004283 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 Py_UCS4 ch;
4285 /* remaining bytes at the end? (size should be divisible by 4) */
4286 if (e-q<4) {
4287 if (consumed)
4288 break;
4289 errmsg = "truncated data";
4290 startinpos = ((const char *)q)-starts;
4291 endinpos = ((const char *)e)-starts;
4292 goto utf32Error;
4293 /* The remaining input chars are ignored if the callback
4294 chooses to skip the input */
4295 }
4296 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4297 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004298
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 if (ch >= 0x110000)
4300 {
4301 errmsg = "codepoint not in range(0x110000)";
4302 startinpos = ((const char *)q)-starts;
4303 endinpos = startinpos+4;
4304 goto utf32Error;
4305 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004306#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 if (ch >= 0x10000)
4308 {
4309 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4310 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4311 }
4312 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004313#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 *p++ = ch;
4315 q += 4;
4316 continue;
4317 utf32Error:
4318 outpos = p-PyUnicode_AS_UNICODE(unicode);
4319 if (unicode_decode_call_errorhandler(
4320 errors, &errorHandler,
4321 "utf32", errmsg,
4322 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4323 &unicode, &outpos, &p))
4324 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004325 }
4326
4327 if (byteorder)
4328 *byteorder = bo;
4329
4330 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004332
4333 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004335 goto onError;
4336
4337 Py_XDECREF(errorHandler);
4338 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 if (PyUnicode_READY(unicode) == -1) {
4340 Py_DECREF(unicode);
4341 return NULL;
4342 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004343 return (PyObject *)unicode;
4344
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004346 Py_DECREF(unicode);
4347 Py_XDECREF(errorHandler);
4348 Py_XDECREF(exc);
4349 return NULL;
4350}
4351
4352PyObject *
4353PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 Py_ssize_t size,
4355 const char *errors,
4356 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004357{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004358 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004359 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004360 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004361#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004362 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004363#else
4364 const int pairs = 0;
4365#endif
4366 /* Offsets from p for storing byte pairs in the right order. */
4367#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4368 int iorder[] = {0, 1, 2, 3};
4369#else
4370 int iorder[] = {3, 2, 1, 0};
4371#endif
4372
Benjamin Peterson29060642009-01-31 22:14:21 +00004373#define STORECHAR(CH) \
4374 do { \
4375 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4376 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4377 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4378 p[iorder[0]] = (CH) & 0xff; \
4379 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004380 } while(0)
4381
4382 /* In narrow builds we can output surrogate pairs as one codepoint,
4383 so we need less space. */
4384#ifndef Py_UNICODE_WIDE
4385 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4387 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4388 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004389#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004390 nsize = (size - pairs + (byteorder == 0));
4391 bytesize = nsize * 4;
4392 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004394 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004395 if (v == NULL)
4396 return NULL;
4397
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004398 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004399 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004401 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004402 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004403
4404 if (byteorder == -1) {
4405 /* force LE */
4406 iorder[0] = 0;
4407 iorder[1] = 1;
4408 iorder[2] = 2;
4409 iorder[3] = 3;
4410 }
4411 else if (byteorder == 1) {
4412 /* force BE */
4413 iorder[0] = 3;
4414 iorder[1] = 2;
4415 iorder[2] = 1;
4416 iorder[3] = 0;
4417 }
4418
4419 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004421#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4423 Py_UCS4 ch2 = *s;
4424 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4425 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4426 s++;
4427 size--;
4428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004429 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004430#endif
4431 STORECHAR(ch);
4432 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004433
4434 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004435 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004436#undef STORECHAR
4437}
4438
Alexander Belopolsky40018472011-02-26 01:02:56 +00004439PyObject *
4440PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004441{
4442 if (!PyUnicode_Check(unicode)) {
4443 PyErr_BadArgument();
4444 return NULL;
4445 }
4446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 PyUnicode_GET_SIZE(unicode),
4448 NULL,
4449 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004450}
4451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452/* --- UTF-16 Codec ------------------------------------------------------- */
4453
Tim Peters772747b2001-08-09 22:21:55 +00004454PyObject *
4455PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 Py_ssize_t size,
4457 const char *errors,
4458 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459{
Walter Dörwald69652032004-09-07 20:24:22 +00004460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4461}
4462
Antoine Pitrouab868312009-01-10 15:40:25 +00004463/* Two masks for fast checking of whether a C 'long' may contain
4464 UTF16-encoded surrogate characters. This is an efficient heuristic,
4465 assuming that non-surrogate characters with a code point >= 0x8000 are
4466 rare in most input.
4467 FAST_CHAR_MASK is used when the input is in native byte ordering,
4468 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004469*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004470#if (SIZEOF_LONG == 8)
4471# define FAST_CHAR_MASK 0x8000800080008000L
4472# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4473#elif (SIZEOF_LONG == 4)
4474# define FAST_CHAR_MASK 0x80008000L
4475# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4476#else
4477# error C 'long' size should be either 4 or 8!
4478#endif
4479
Walter Dörwald69652032004-09-07 20:24:22 +00004480PyObject *
4481PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 Py_ssize_t size,
4483 const char *errors,
4484 int *byteorder,
4485 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004488 Py_ssize_t startinpos;
4489 Py_ssize_t endinpos;
4490 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 PyUnicodeObject *unicode;
4492 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004493 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004494 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004495 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004496 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004497 /* Offsets from q for retrieving byte pairs in the right order. */
4498#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4499 int ihi = 1, ilo = 0;
4500#else
4501 int ihi = 0, ilo = 1;
4502#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 PyObject *errorHandler = NULL;
4504 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505
4506 /* Note: size will always be longer than the resulting Unicode
4507 character count */
4508 unicode = _PyUnicode_New(size);
4509 if (!unicode)
4510 return NULL;
4511 if (size == 0)
4512 return (PyObject *)unicode;
4513
4514 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004516 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004517 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
4519 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004520 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004522 /* Check for BOM marks (U+FEFF) in the input and adjust current
4523 byte order setting accordingly. In native mode, the leading BOM
4524 mark is skipped, in all other modes, it is copied to the output
4525 stream as-is (giving a ZWNBSP character). */
4526 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004527 if (size >= 2) {
4528 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004529#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 if (bom == 0xFEFF) {
4531 q += 2;
4532 bo = -1;
4533 }
4534 else if (bom == 0xFFFE) {
4535 q += 2;
4536 bo = 1;
4537 }
Tim Petersced69f82003-09-16 20:30:58 +00004538#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 if (bom == 0xFEFF) {
4540 q += 2;
4541 bo = 1;
4542 }
4543 else if (bom == 0xFFFE) {
4544 q += 2;
4545 bo = -1;
4546 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004547#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
Tim Peters772747b2001-08-09 22:21:55 +00004551 if (bo == -1) {
4552 /* force LE */
4553 ihi = 1;
4554 ilo = 0;
4555 }
4556 else if (bo == 1) {
4557 /* force BE */
4558 ihi = 0;
4559 ilo = 1;
4560 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4562 native_ordering = ilo < ihi;
4563#else
4564 native_ordering = ilo > ihi;
4565#endif
Tim Peters772747b2001-08-09 22:21:55 +00004566
Antoine Pitrouab868312009-01-10 15:40:25 +00004567 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004568 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004570 /* First check for possible aligned read of a C 'long'. Unaligned
4571 reads are more expensive, better to defer to another iteration. */
4572 if (!((size_t) q & LONG_PTR_MASK)) {
4573 /* Fast path for runs of non-surrogate chars. */
4574 register const unsigned char *_q = q;
4575 Py_UNICODE *_p = p;
4576 if (native_ordering) {
4577 /* Native ordering is simple: as long as the input cannot
4578 possibly contain a surrogate char, do an unrolled copy
4579 of several 16-bit code points to the target object.
4580 The non-surrogate check is done on several input bytes
4581 at a time (as many as a C 'long' can contain). */
4582 while (_q < aligned_end) {
4583 unsigned long data = * (unsigned long *) _q;
4584 if (data & FAST_CHAR_MASK)
4585 break;
4586 _p[0] = ((unsigned short *) _q)[0];
4587 _p[1] = ((unsigned short *) _q)[1];
4588#if (SIZEOF_LONG == 8)
4589 _p[2] = ((unsigned short *) _q)[2];
4590 _p[3] = ((unsigned short *) _q)[3];
4591#endif
4592 _q += SIZEOF_LONG;
4593 _p += SIZEOF_LONG / 2;
4594 }
4595 }
4596 else {
4597 /* Byteswapped ordering is similar, but we must decompose
4598 the copy bytewise, and take care of zero'ing out the
4599 upper bytes if the target object is in 32-bit units
4600 (that is, in UCS-4 builds). */
4601 while (_q < aligned_end) {
4602 unsigned long data = * (unsigned long *) _q;
4603 if (data & SWAPPED_FAST_CHAR_MASK)
4604 break;
4605 /* Zero upper bytes in UCS-4 builds */
4606#if (Py_UNICODE_SIZE > 2)
4607 _p[0] = 0;
4608 _p[1] = 0;
4609#if (SIZEOF_LONG == 8)
4610 _p[2] = 0;
4611 _p[3] = 0;
4612#endif
4613#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004614 /* Issue #4916; UCS-4 builds on big endian machines must
4615 fill the two last bytes of each 4-byte unit. */
4616#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4617# define OFF 2
4618#else
4619# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004620#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004621 ((unsigned char *) _p)[OFF + 1] = _q[0];
4622 ((unsigned char *) _p)[OFF + 0] = _q[1];
4623 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4624 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4625#if (SIZEOF_LONG == 8)
4626 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4627 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4628 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4629 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4630#endif
4631#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004632 _q += SIZEOF_LONG;
4633 _p += SIZEOF_LONG / 2;
4634 }
4635 }
4636 p = _p;
4637 q = _q;
4638 if (q >= e)
4639 break;
4640 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642
Benjamin Peterson14339b62009-01-31 16:36:08 +00004643 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004644
4645 if (ch < 0xD800 || ch > 0xDFFF) {
4646 *p++ = ch;
4647 continue;
4648 }
4649
4650 /* UTF-16 code pair: */
4651 if (q > e) {
4652 errmsg = "unexpected end of data";
4653 startinpos = (((const char *)q) - 2) - starts;
4654 endinpos = ((const char *)e) + 1 - starts;
4655 goto utf16Error;
4656 }
4657 if (0xD800 <= ch && ch <= 0xDBFF) {
4658 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4659 q += 2;
4660 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004661#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 *p++ = ch;
4663 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004664#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004666#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 continue;
4668 }
4669 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004670 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 startinpos = (((const char *)q)-4)-starts;
4672 endinpos = startinpos+2;
4673 goto utf16Error;
4674 }
4675
Benjamin Peterson14339b62009-01-31 16:36:08 +00004676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 errmsg = "illegal encoding";
4678 startinpos = (((const char *)q)-2)-starts;
4679 endinpos = startinpos+2;
4680 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004681
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 utf16Error:
4683 outpos = p - PyUnicode_AS_UNICODE(unicode);
4684 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004685 errors,
4686 &errorHandler,
4687 "utf16", errmsg,
4688 &starts,
4689 (const char **)&e,
4690 &startinpos,
4691 &endinpos,
4692 &exc,
4693 (const char **)&q,
4694 &unicode,
4695 &outpos,
4696 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004699 /* remaining byte at the end? (size should be even) */
4700 if (e == q) {
4701 if (!consumed) {
4702 errmsg = "truncated data";
4703 startinpos = ((const char *)q) - starts;
4704 endinpos = ((const char *)e) + 1 - starts;
4705 outpos = p - PyUnicode_AS_UNICODE(unicode);
4706 if (unicode_decode_call_errorhandler(
4707 errors,
4708 &errorHandler,
4709 "utf16", errmsg,
4710 &starts,
4711 (const char **)&e,
4712 &startinpos,
4713 &endinpos,
4714 &exc,
4715 (const char **)&q,
4716 &unicode,
4717 &outpos,
4718 &p))
4719 goto onError;
4720 /* The remaining input chars are ignored if the callback
4721 chooses to skip the input */
4722 }
4723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724
4725 if (byteorder)
4726 *byteorder = bo;
4727
Walter Dörwald69652032004-09-07 20:24:22 +00004728 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 goto onError;
4734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(errorHandler);
4736 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004737 if (PyUnicode_READY(unicode) == -1) {
4738 Py_DECREF(unicode);
4739 return NULL;
4740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 return (PyObject *)unicode;
4742
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 Py_XDECREF(errorHandler);
4746 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 return NULL;
4748}
4749
Antoine Pitrouab868312009-01-10 15:40:25 +00004750#undef FAST_CHAR_MASK
4751#undef SWAPPED_FAST_CHAR_MASK
4752
Tim Peters772747b2001-08-09 22:21:55 +00004753PyObject *
4754PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 Py_ssize_t size,
4756 const char *errors,
4757 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004759 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004760 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004761 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004762#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004763 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004764#else
4765 const int pairs = 0;
4766#endif
Tim Peters772747b2001-08-09 22:21:55 +00004767 /* Offsets from p for storing byte pairs in the right order. */
4768#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4769 int ihi = 1, ilo = 0;
4770#else
4771 int ihi = 0, ilo = 1;
4772#endif
4773
Benjamin Peterson29060642009-01-31 22:14:21 +00004774#define STORECHAR(CH) \
4775 do { \
4776 p[ihi] = ((CH) >> 8) & 0xff; \
4777 p[ilo] = (CH) & 0xff; \
4778 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004779 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004781#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004782 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 if (s[i] >= 0x10000)
4784 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004785#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004786 /* 2 * (size + pairs + (byteorder == 0)) */
4787 if (size > PY_SSIZE_T_MAX ||
4788 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004790 nsize = size + pairs + (byteorder == 0);
4791 bytesize = nsize * 2;
4792 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004794 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 if (v == NULL)
4796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004798 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004801 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004802 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004803
4804 if (byteorder == -1) {
4805 /* force LE */
4806 ihi = 1;
4807 ilo = 0;
4808 }
4809 else if (byteorder == 1) {
4810 /* force BE */
4811 ihi = 0;
4812 ilo = 1;
4813 }
4814
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004815 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 Py_UNICODE ch = *s++;
4817 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004818#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 if (ch >= 0x10000) {
4820 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4821 ch = 0xD800 | ((ch-0x10000) >> 10);
4822 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004823#endif
Tim Peters772747b2001-08-09 22:21:55 +00004824 STORECHAR(ch);
4825 if (ch2)
4826 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004827 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004828
4829 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004830 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004831#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832}
4833
Alexander Belopolsky40018472011-02-26 01:02:56 +00004834PyObject *
4835PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836{
4837 if (!PyUnicode_Check(unicode)) {
4838 PyErr_BadArgument();
4839 return NULL;
4840 }
4841 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 PyUnicode_GET_SIZE(unicode),
4843 NULL,
4844 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
4847/* --- Unicode Escape Codec ----------------------------------------------- */
4848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4850 if all the escapes in the string make it still a valid ASCII string.
4851 Returns -1 if any escapes were found which cause the string to
4852 pop out of ASCII range. Otherwise returns the length of the
4853 required buffer to hold the string.
4854 */
4855Py_ssize_t
4856length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4857{
4858 const unsigned char *p = (const unsigned char *)s;
4859 const unsigned char *end = p + size;
4860 Py_ssize_t length = 0;
4861
4862 if (size < 0)
4863 return -1;
4864
4865 for (; p < end; ++p) {
4866 if (*p > 127) {
4867 /* Non-ASCII */
4868 return -1;
4869 }
4870 else if (*p != '\\') {
4871 /* Normal character */
4872 ++length;
4873 }
4874 else {
4875 /* Backslash-escape, check next char */
4876 ++p;
4877 /* Escape sequence reaches till end of string or
4878 non-ASCII follow-up. */
4879 if (p >= end || *p > 127)
4880 return -1;
4881 switch (*p) {
4882 case '\n':
4883 /* backslash + \n result in zero characters */
4884 break;
4885 case '\\': case '\'': case '\"':
4886 case 'b': case 'f': case 't':
4887 case 'n': case 'r': case 'v': case 'a':
4888 ++length;
4889 break;
4890 case '0': case '1': case '2': case '3':
4891 case '4': case '5': case '6': case '7':
4892 case 'x': case 'u': case 'U': case 'N':
4893 /* these do not guarantee ASCII characters */
4894 return -1;
4895 default:
4896 /* count the backslash + the other character */
4897 length += 2;
4898 }
4899 }
4900 }
4901 return length;
4902}
4903
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: for PyUnicode_WCHAR_KIND the value is stored
   as a Py_UNICODE unit, for every other kind as a single byte. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store a Py_UNICODE unit into a wstr buffer.  Relies on a variable named
   `kind` being in scope at the point of use (it is only asserted on, not a
   macro parameter).  The expansion is fully parenthesized so that it is a
   single expression regardless of the surrounding context. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     (((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)))

4917
4918
Fredrik Lundh06d12682001-01-24 07:59:11 +00004919static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004920
Alexander Belopolsky40018472011-02-26 01:02:56 +00004921PyObject *
4922PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004923 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004927 Py_ssize_t startinpos;
4928 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004929 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004931 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004933 char* message;
4934 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935 PyObject *errorHandler = NULL;
4936 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004937 Py_ssize_t ascii_length;
4938 Py_ssize_t i;
4939 int kind;
4940 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004942 ascii_length = length_of_escaped_ascii_string(s, size);
4943
4944 /* After length_of_escaped_ascii_string() there are two alternatives,
4945 either the string is pure ASCII with named escapes like \n, etc.
4946 and we determined it's exact size (common case)
4947 or it contains \x, \u, ... escape sequences. then we create a
4948 legacy wchar string and resize it at the end of this function. */
4949 if (ascii_length >= 0) {
4950 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4951 if (!v)
4952 goto onError;
4953 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4954 kind = PyUnicode_1BYTE_KIND;
4955 data = PyUnicode_DATA(v);
4956 }
4957 else {
4958 /* Escaped strings will always be longer than the resulting
4959 Unicode string, so we start with size here and then reduce the
4960 length after conversion to the true value.
4961 (but if the error callback returns a long replacement string
4962 we'll have to allocate more space) */
4963 v = _PyUnicode_New(size);
4964 if (!v)
4965 goto onError;
4966 kind = PyUnicode_WCHAR_KIND;
4967 data = PyUnicode_AS_UNICODE(v);
4968 }
4969
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 if (size == 0)
4971 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004972 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004974
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 while (s < end) {
4976 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004977 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004980 if (kind == PyUnicode_WCHAR_KIND) {
4981 assert(i < _PyUnicode_WSTR_LENGTH(v));
4982 }
4983 else {
4984 /* The only case in which i == ascii_length is a backslash
4985 followed by a newline. */
4986 assert(i <= ascii_length);
4987 }
4988
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 /* Non-escape characters are interpreted as Unicode ordinals */
4990 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004991 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 continue;
4993 }
4994
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 /* \ - Escapes */
4997 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004998 c = *s++;
4999 if (s > end)
5000 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005001
5002 if (kind == PyUnicode_WCHAR_KIND) {
5003 assert(i < _PyUnicode_WSTR_LENGTH(v));
5004 }
5005 else {
5006 /* The only case in which i == ascii_length is a backslash
5007 followed by a newline. */
5008 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5009 }
5010
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005011 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005015 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5016 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5017 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5018 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5019 /* FF */
5020 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5021 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5022 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5023 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5024 /* VT */
5025 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5026 /* BEL, not classic C */
5027 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 case '0': case '1': case '2': case '3':
5031 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005032 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005033 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005034 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005035 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005036 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005038 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 break;
5040
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 /* hex escapes */
5042 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005044 digits = 2;
5045 message = "truncated \\xXX escape";
5046 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005050 digits = 4;
5051 message = "truncated \\uXXXX escape";
5052 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005055 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005056 digits = 8;
5057 message = "truncated \\UXXXXXXXX escape";
5058 hexescape:
5059 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 if (s+digits>end) {
5062 endinpos = size;
5063 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 errors, &errorHandler,
5065 "unicodeescape", "end of string in escape sequence",
5066 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005070 goto nextByte;
5071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 for (j = 0; j < digits; ++j) {
5073 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005074 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005075 endinpos = (s+j+1)-starts;
5076 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005077 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 errors, &errorHandler,
5079 "unicodeescape", message,
5080 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005081 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005082 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005085 }
5086 chr = (chr<<4) & ~0xF;
5087 if (c >= '0' && c <= '9')
5088 chr += c - '0';
5089 else if (c >= 'a' && c <= 'f')
5090 chr += 10 + c - 'a';
5091 else
5092 chr += 10 + c - 'A';
5093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005094 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005095 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096 /* _decoding_error will have already written into the
5097 target buffer. */
5098 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005099 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005100 /* when we get here, chr is a 32-bit unicode character */
5101 if (chr <= 0xffff)
5102 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005103 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005104 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005105 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005106 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005107#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005108 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005109#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005110 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5112 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005113#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005114 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005116 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 errors, &errorHandler,
5119 "unicodeescape", "illegal Unicode character",
5120 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005122 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005124 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005125 break;
5126
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005128 case 'N':
5129 message = "malformed \\N character escape";
5130 if (ucnhash_CAPI == NULL) {
5131 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005132 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5133 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005134 if (ucnhash_CAPI == NULL)
5135 goto ucnhashError;
5136 }
5137 if (*s == '{') {
5138 const char *start = s+1;
5139 /* look for the closing brace */
5140 while (*s != '}' && s < end)
5141 s++;
5142 if (s > start && s < end && *s == '}') {
5143 /* found a name. look it up in the unicode database */
5144 message = "unknown Unicode character name";
5145 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005146 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5147 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005148 goto store;
5149 }
5150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005152 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 errors, &errorHandler,
5155 "unicodeescape", message,
5156 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005157 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005158 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005159 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005160 break;
5161
5162 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005163 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 message = "\\ at end of string";
5166 s--;
5167 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005169 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 errors, &errorHandler,
5171 "unicodeescape", message,
5172 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005173 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005174 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005176 }
5177 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5179 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005180 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005181 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005184 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005186 /* Ensure the length prediction worked in case of ASCII strings */
5187 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5188
5189 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5190 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005192 Py_XDECREF(errorHandler);
5193 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005195
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005197 PyErr_SetString(
5198 PyExc_UnicodeError,
5199 "\\N escapes not supported (can't load unicodedata module)"
5200 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005201 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 Py_XDECREF(errorHandler);
5203 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005204 return NULL;
5205
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 Py_XDECREF(errorHandler);
5209 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return NULL;
5211}
5212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005213#undef WRITE_ASCII_OR_WSTR
5214#undef WRITE_WSTR
5215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216/* Return a Unicode-Escape string version of the Unicode object.
5217
5218 If quotes is true, the string is enclosed in u"" or u'' quotes as
5219 appropriate.
5220
5221*/
5222
Walter Dörwald79e913e2007-05-12 11:08:06 +00005223static const char *hexdigits = "0123456789abcdef";
5224
Alexander Belopolsky40018472011-02-26 01:02:56 +00005225PyObject *
5226PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005232#ifdef Py_UNICODE_WIDE
5233 const Py_ssize_t expandsize = 10;
5234#else
5235 const Py_ssize_t expandsize = 6;
5236#endif
5237
Thomas Wouters89f507f2006-12-13 04:49:30 +00005238 /* XXX(nnorwitz): rather than over-allocating, it would be
5239 better to choose a different scheme. Perhaps scan the
5240 first N-chars of the string and allocate based on that size.
5241 */
5242 /* Initial allocation is based on the longest-possible unichr
5243 escape.
5244
5245 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5246 unichr, so in this case it's the longest unichr escape. In
5247 narrow (UTF-16) builds this is five chars per source unichr
5248 since there are two unichrs in the surrogate pair, so in narrow
5249 (UTF-16) builds it's not the longest unichr escape.
5250
5251 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5252 so in the narrow (UTF-16) build case it's the longest unichr
5253 escape.
5254 */
5255
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005256 if (size == 0)
5257 return PyBytes_FromStringAndSize(NULL, 0);
5258
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005261
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005262 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 2
5264 + expandsize*size
5265 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 if (repr == NULL)
5267 return NULL;
5268
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 while (size-- > 0) {
5272 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005273
Walter Dörwald79e913e2007-05-12 11:08:06 +00005274 /* Escape backslashes */
5275 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 *p++ = '\\';
5277 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005278 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005279 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005280
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005281#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005282 /* Map 21-bit characters to '\U00xxxxxx' */
5283 else if (ch >= 0x10000) {
5284 *p++ = '\\';
5285 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005286 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5287 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5288 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5289 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5290 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5291 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5292 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5293 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005295 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005296#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5298 else if (ch >= 0xD800 && ch < 0xDC00) {
5299 Py_UNICODE ch2;
5300 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005301
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 ch2 = *s++;
5303 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005304 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5306 *p++ = '\\';
5307 *p++ = 'U';
5308 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5309 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5310 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5311 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5312 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5313 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5314 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5315 *p++ = hexdigits[ucs & 0x0000000F];
5316 continue;
5317 }
5318 /* Fall through: isolated surrogates are copied as-is */
5319 s--;
5320 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005321 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005322#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005325 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 *p++ = '\\';
5327 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005328 *p++ = hexdigits[(ch >> 12) & 0x000F];
5329 *p++ = hexdigits[(ch >> 8) & 0x000F];
5330 *p++ = hexdigits[(ch >> 4) & 0x000F];
5331 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005333
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005334 /* Map special whitespace to '\t', \n', '\r' */
5335 else if (ch == '\t') {
5336 *p++ = '\\';
5337 *p++ = 't';
5338 }
5339 else if (ch == '\n') {
5340 *p++ = '\\';
5341 *p++ = 'n';
5342 }
5343 else if (ch == '\r') {
5344 *p++ = '\\';
5345 *p++ = 'r';
5346 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005347
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005348 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005349 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005351 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005352 *p++ = hexdigits[(ch >> 4) & 0x000F];
5353 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005354 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005355
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 /* Copy everything else as-is */
5357 else
5358 *p++ = (char) ch;
5359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005361 assert(p - PyBytes_AS_STRING(repr) > 0);
5362 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5363 return NULL;
5364 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365}
5366
Alexander Belopolsky40018472011-02-26 01:02:56 +00005367PyObject *
5368PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005370 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (!PyUnicode_Check(unicode)) {
5372 PyErr_BadArgument();
5373 return NULL;
5374 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005375 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5376 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005377 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378}
5379
5380/* --- Raw Unicode Escape Codec ------------------------------------------- */
5381
Alexander Belopolsky40018472011-02-26 01:02:56 +00005382PyObject *
5383PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005384 Py_ssize_t size,
5385 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005388 Py_ssize_t startinpos;
5389 Py_ssize_t endinpos;
5390 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 const char *end;
5394 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 PyObject *errorHandler = NULL;
5396 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005397
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 /* Escaped strings will always be longer than the resulting
5399 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 length after conversion to the true value. (But decoding error
5401 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 v = _PyUnicode_New(size);
5403 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 end = s + size;
5409 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 unsigned char c;
5411 Py_UCS4 x;
5412 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005413 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 /* Non-escape characters are interpreted as Unicode ordinals */
5416 if (*s != '\\') {
5417 *p++ = (unsigned char)*s++;
5418 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005419 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 startinpos = s-starts;
5421
5422 /* \u-escapes are only interpreted iff the number of leading
5423 backslashes if odd */
5424 bs = s;
5425 for (;s < end;) {
5426 if (*s != '\\')
5427 break;
5428 *p++ = (unsigned char)*s++;
5429 }
5430 if (((s - bs) & 1) == 0 ||
5431 s >= end ||
5432 (*s != 'u' && *s != 'U')) {
5433 continue;
5434 }
5435 p--;
5436 count = *s=='u' ? 4 : 8;
5437 s++;
5438
5439 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5440 outpos = p-PyUnicode_AS_UNICODE(v);
5441 for (x = 0, i = 0; i < count; ++i, ++s) {
5442 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005443 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 endinpos = s-starts;
5445 if (unicode_decode_call_errorhandler(
5446 errors, &errorHandler,
5447 "rawunicodeescape", "truncated \\uXXXX",
5448 &starts, &end, &startinpos, &endinpos, &exc, &s,
5449 &v, &outpos, &p))
5450 goto onError;
5451 goto nextByte;
5452 }
5453 x = (x<<4) & ~0xF;
5454 if (c >= '0' && c <= '9')
5455 x += c - '0';
5456 else if (c >= 'a' && c <= 'f')
5457 x += 10 + c - 'a';
5458 else
5459 x += 10 + c - 'A';
5460 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005461 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 /* UCS-2 character */
5463 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005464 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 /* UCS-4 character. Either store directly, or as
5466 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005467#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005469#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 x -= 0x10000L;
5471 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5472 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005473#endif
5474 } else {
5475 endinpos = s-starts;
5476 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005477 if (unicode_decode_call_errorhandler(
5478 errors, &errorHandler,
5479 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 &starts, &end, &startinpos, &endinpos, &exc, &s,
5481 &v, &outpos, &p))
5482 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 nextByte:
5485 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005487 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 Py_XDECREF(errorHandler);
5490 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005491 if (PyUnicode_READY(v) == -1) {
5492 Py_DECREF(v);
5493 return NULL;
5494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005496
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 return NULL;
5502}
5503
Alexander Belopolsky40018472011-02-26 01:02:56 +00005504PyObject *
5505PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005506 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005508 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 char *p;
5510 char *q;
5511
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005512#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005513 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005514#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005515 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005516#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005517
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005518 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 if (repr == NULL)
5523 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005524 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005525 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005527 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 while (size-- > 0) {
5529 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005530#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 /* Map 32-bit characters to '\Uxxxxxxxx' */
5532 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005533 *p++ = '\\';
5534 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005535 *p++ = hexdigits[(ch >> 28) & 0xf];
5536 *p++ = hexdigits[(ch >> 24) & 0xf];
5537 *p++ = hexdigits[(ch >> 20) & 0xf];
5538 *p++ = hexdigits[(ch >> 16) & 0xf];
5539 *p++ = hexdigits[(ch >> 12) & 0xf];
5540 *p++ = hexdigits[(ch >> 8) & 0xf];
5541 *p++ = hexdigits[(ch >> 4) & 0xf];
5542 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005543 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005544 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005545#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5547 if (ch >= 0xD800 && ch < 0xDC00) {
5548 Py_UNICODE ch2;
5549 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005550
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 ch2 = *s++;
5552 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005553 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5555 *p++ = '\\';
5556 *p++ = 'U';
5557 *p++ = hexdigits[(ucs >> 28) & 0xf];
5558 *p++ = hexdigits[(ucs >> 24) & 0xf];
5559 *p++ = hexdigits[(ucs >> 20) & 0xf];
5560 *p++ = hexdigits[(ucs >> 16) & 0xf];
5561 *p++ = hexdigits[(ucs >> 12) & 0xf];
5562 *p++ = hexdigits[(ucs >> 8) & 0xf];
5563 *p++ = hexdigits[(ucs >> 4) & 0xf];
5564 *p++ = hexdigits[ucs & 0xf];
5565 continue;
5566 }
5567 /* Fall through: isolated surrogates are copied as-is */
5568 s--;
5569 size++;
5570 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005571#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 /* Map 16-bit characters to '\uxxxx' */
5573 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 *p++ = '\\';
5575 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005576 *p++ = hexdigits[(ch >> 12) & 0xf];
5577 *p++ = hexdigits[(ch >> 8) & 0xf];
5578 *p++ = hexdigits[(ch >> 4) & 0xf];
5579 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 /* Copy everything else as-is */
5582 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 *p++ = (char) ch;
5584 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005585 size = p - q;
5586
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005587 assert(size > 0);
5588 if (_PyBytes_Resize(&repr, size) < 0)
5589 return NULL;
5590 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591}
5592
Alexander Belopolsky40018472011-02-26 01:02:56 +00005593PyObject *
5594PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005596 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005598 PyErr_BadArgument();
5599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005601 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5602 PyUnicode_GET_SIZE(unicode));
5603
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005604 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605}
5606
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005607/* --- Unicode Internal Codec ------------------------------------------- */
5608
Alexander Belopolsky40018472011-02-26 01:02:56 +00005609PyObject *
5610_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005611 Py_ssize_t size,
5612 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005613{
5614 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005615 Py_ssize_t startinpos;
5616 Py_ssize_t endinpos;
5617 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005618 PyUnicodeObject *v;
5619 Py_UNICODE *p;
5620 const char *end;
5621 const char *reason;
5622 PyObject *errorHandler = NULL;
5623 PyObject *exc = NULL;
5624
Neal Norwitzd43069c2006-01-08 01:12:10 +00005625#ifdef Py_UNICODE_WIDE
5626 Py_UNICODE unimax = PyUnicode_GetMax();
5627#endif
5628
Thomas Wouters89f507f2006-12-13 04:49:30 +00005629 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005630 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5631 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5634 as string was created with the old API. */
5635 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005637 p = PyUnicode_AS_UNICODE(v);
5638 end = s + size;
5639
5640 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005641 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005642 /* We have to sanity check the raw data, otherwise doom looms for
5643 some malformed UCS-4 data. */
5644 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005645#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005646 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005647#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005648 end-s < Py_UNICODE_SIZE
5649 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005651 startinpos = s - starts;
5652 if (end-s < Py_UNICODE_SIZE) {
5653 endinpos = end-starts;
5654 reason = "truncated input";
5655 }
5656 else {
5657 endinpos = s - starts + Py_UNICODE_SIZE;
5658 reason = "illegal code point (> 0x10FFFF)";
5659 }
5660 outpos = p - PyUnicode_AS_UNICODE(v);
5661 if (unicode_decode_call_errorhandler(
5662 errors, &errorHandler,
5663 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005664 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005665 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005666 goto onError;
5667 }
5668 }
5669 else {
5670 p++;
5671 s += Py_UNICODE_SIZE;
5672 }
5673 }
5674
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005675 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005676 goto onError;
5677 Py_XDECREF(errorHandler);
5678 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 if (PyUnicode_READY(v) == -1) {
5680 Py_DECREF(v);
5681 return NULL;
5682 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005683 return (PyObject *)v;
5684
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005686 Py_XDECREF(v);
5687 Py_XDECREF(errorHandler);
5688 Py_XDECREF(exc);
5689 return NULL;
5690}
5691
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692/* --- Latin-1 Codec ------------------------------------------------------ */
5693
Alexander Belopolsky40018472011-02-26 01:02:56 +00005694PyObject *
5695PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005696 Py_ssize_t size,
5697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005700 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701}
5702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005704static void
5705make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005706 const char *encoding,
5707 const Py_UNICODE *unicode, Py_ssize_t size,
5708 Py_ssize_t startpos, Py_ssize_t endpos,
5709 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 *exceptionObject = PyUnicodeEncodeError_Create(
5713 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 }
5715 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5717 goto onError;
5718 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5719 goto onError;
5720 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5721 goto onError;
5722 return;
5723 onError:
5724 Py_DECREF(*exceptionObject);
5725 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
5727}
5728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005730static void
5731raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005732 const char *encoding,
5733 const Py_UNICODE *unicode, Py_ssize_t size,
5734 Py_ssize_t startpos, Py_ssize_t endpos,
5735 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736{
5737 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741}
5742
5743/* error handling callback helper:
5744 build arguments, call the callback and check the arguments,
5745 put the result into newpos and return the replacement string, which
5746 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747static PyObject *
5748unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005749 PyObject **errorHandler,
5750 const char *encoding, const char *reason,
5751 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5752 Py_ssize_t startpos, Py_ssize_t endpos,
5753 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005755 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756
5757 PyObject *restuple;
5758 PyObject *resunicode;
5759
5760 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 }
5765
5766 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770
5771 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005776 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 Py_DECREF(restuple);
5778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005780 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 &resunicode, newpos)) {
5782 Py_DECREF(restuple);
5783 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005785 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5786 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5787 Py_DECREF(restuple);
5788 return NULL;
5789 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005792 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5794 Py_DECREF(restuple);
5795 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005796 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 Py_INCREF(resunicode);
5798 Py_DECREF(restuple);
5799 return resunicode;
5800}
5801
Alexander Belopolsky40018472011-02-26 01:02:56 +00005802static PyObject *
5803unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005804 Py_ssize_t size,
5805 const char *errors,
5806 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807{
5808 /* output object */
5809 PyObject *res;
5810 /* pointers to the beginning and end+1 of input */
5811 const Py_UNICODE *startp = p;
5812 const Py_UNICODE *endp = p + size;
5813 /* pointer to the beginning of the unencodable characters */
5814 /* const Py_UNICODE *badp = NULL; */
5815 /* pointer into the output */
5816 char *str;
5817 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005819 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5820 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 PyObject *errorHandler = NULL;
5822 PyObject *exc = NULL;
5823 /* the following variable is used for caching string comparisons
5824 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5825 int known_errorHandler = -1;
5826
5827 /* allocate enough for a simple encoding without
5828 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005829 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005830 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005833 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005834 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 ressize = size;
5836
5837 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* can we encode this? */
5841 if (c<limit) {
5842 /* no overflow check, because we know that the space is enough */
5843 *str++ = (char)c;
5844 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005845 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 else {
5847 Py_ssize_t unicodepos = p-startp;
5848 Py_ssize_t requiredsize;
5849 PyObject *repunicode;
5850 Py_ssize_t repsize;
5851 Py_ssize_t newpos;
5852 Py_ssize_t respos;
5853 Py_UNICODE *uni2;
5854 /* startpos for collecting unencodable chars */
5855 const Py_UNICODE *collstart = p;
5856 const Py_UNICODE *collend = p;
5857 /* find all unecodable characters */
5858 while ((collend < endp) && ((*collend)>=limit))
5859 ++collend;
5860 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5861 if (known_errorHandler==-1) {
5862 if ((errors==NULL) || (!strcmp(errors, "strict")))
5863 known_errorHandler = 1;
5864 else if (!strcmp(errors, "replace"))
5865 known_errorHandler = 2;
5866 else if (!strcmp(errors, "ignore"))
5867 known_errorHandler = 3;
5868 else if (!strcmp(errors, "xmlcharrefreplace"))
5869 known_errorHandler = 4;
5870 else
5871 known_errorHandler = 0;
5872 }
5873 switch (known_errorHandler) {
5874 case 1: /* strict */
5875 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5876 goto onError;
5877 case 2: /* replace */
5878 while (collstart++<collend)
5879 *str++ = '?'; /* fall through */
5880 case 3: /* ignore */
5881 p = collend;
5882 break;
5883 case 4: /* xmlcharrefreplace */
5884 respos = str - PyBytes_AS_STRING(res);
5885 /* determine replacement size (temporarily (mis)uses p) */
5886 for (p = collstart, repsize = 0; p < collend; ++p) {
5887 if (*p<10)
5888 repsize += 2+1+1;
5889 else if (*p<100)
5890 repsize += 2+2+1;
5891 else if (*p<1000)
5892 repsize += 2+3+1;
5893 else if (*p<10000)
5894 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005895#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 else
5897 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005898#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 else if (*p<100000)
5900 repsize += 2+5+1;
5901 else if (*p<1000000)
5902 repsize += 2+6+1;
5903 else
5904 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 }
5907 requiredsize = respos+repsize+(endp-collend);
5908 if (requiredsize > ressize) {
5909 if (requiredsize<2*ressize)
5910 requiredsize = 2*ressize;
5911 if (_PyBytes_Resize(&res, requiredsize))
5912 goto onError;
5913 str = PyBytes_AS_STRING(res) + respos;
5914 ressize = requiredsize;
5915 }
5916 /* generate replacement (temporarily (mis)uses p) */
5917 for (p = collstart; p < collend; ++p) {
5918 str += sprintf(str, "&#%d;", (int)*p);
5919 }
5920 p = collend;
5921 break;
5922 default:
5923 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5924 encoding, reason, startp, size, &exc,
5925 collstart-startp, collend-startp, &newpos);
5926 if (repunicode == NULL)
5927 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005928 if (PyBytes_Check(repunicode)) {
5929 /* Directly copy bytes result to output. */
5930 repsize = PyBytes_Size(repunicode);
5931 if (repsize > 1) {
5932 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005933 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005934 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5935 Py_DECREF(repunicode);
5936 goto onError;
5937 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005938 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005939 ressize += repsize-1;
5940 }
5941 memcpy(str, PyBytes_AsString(repunicode), repsize);
5942 str += repsize;
5943 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005944 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005945 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005946 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 /* need more space? (at least enough for what we
5948 have+the replacement+the rest of the string, so
5949 we won't have to check space for encodable characters) */
5950 respos = str - PyBytes_AS_STRING(res);
5951 repsize = PyUnicode_GET_SIZE(repunicode);
5952 requiredsize = respos+repsize+(endp-collend);
5953 if (requiredsize > ressize) {
5954 if (requiredsize<2*ressize)
5955 requiredsize = 2*ressize;
5956 if (_PyBytes_Resize(&res, requiredsize)) {
5957 Py_DECREF(repunicode);
5958 goto onError;
5959 }
5960 str = PyBytes_AS_STRING(res) + respos;
5961 ressize = requiredsize;
5962 }
5963 /* check if there is anything unencodable in the replacement
5964 and copy it to the output */
5965 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5966 c = *uni2;
5967 if (c >= limit) {
5968 raise_encode_exception(&exc, encoding, startp, size,
5969 unicodepos, unicodepos+1, reason);
5970 Py_DECREF(repunicode);
5971 goto onError;
5972 }
5973 *str = (char)c;
5974 }
5975 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005976 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005978 }
5979 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005980 /* Resize if we allocated to much */
5981 size = str - PyBytes_AS_STRING(res);
5982 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005983 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 if (_PyBytes_Resize(&res, size) < 0)
5985 goto onError;
5986 }
5987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 Py_XDECREF(errorHandler);
5989 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005990 return res;
5991
5992 onError:
5993 Py_XDECREF(res);
5994 Py_XDECREF(errorHandler);
5995 Py_XDECREF(exc);
5996 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997}
5998
Alexander Belopolsky40018472011-02-26 01:02:56 +00005999PyObject *
6000PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006001 Py_ssize_t size,
6002 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005}
6006
Alexander Belopolsky40018472011-02-26 01:02:56 +00006007PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006008_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
6010 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 PyErr_BadArgument();
6012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006014 if (PyUnicode_READY(unicode) == -1)
6015 return NULL;
6016 /* Fast path: if it is a one-byte string, construct
6017 bytes object directly. */
6018 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6019 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6020 PyUnicode_GET_LENGTH(unicode));
6021 /* Non-Latin-1 characters present. Defer to above function to
6022 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006025 errors);
6026}
6027
6028PyObject*
6029PyUnicode_AsLatin1String(PyObject *unicode)
6030{
6031 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
6034/* --- 7-bit ASCII Codec -------------------------------------------------- */
6035
Alexander Belopolsky40018472011-02-26 01:02:56 +00006036PyObject *
6037PyUnicode_DecodeASCII(const char *s,
6038 Py_ssize_t size,
6039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006041 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 PyUnicodeObject *v;
6043 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t startinpos;
6045 Py_ssize_t endinpos;
6046 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006048 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 PyObject *errorHandler = NULL;
6050 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006051 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006054 if (size == 1 && *(unsigned char*)s < 128)
6055 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6056
6057 /* Fast path. Assume the input actually *is* ASCII, and allocate
6058 a single-block Unicode object with that assumption. If there is
6059 an error, drop the object and start over. */
6060 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6061 if (v == NULL)
6062 goto onError;
6063 d = PyUnicode_1BYTE_DATA(v);
6064 for (i = 0; i < size; i++) {
6065 unsigned char ch = ((unsigned char*)s)[i];
6066 if (ch < 128)
6067 d[i] = ch;
6068 else
6069 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006071 if (i == size)
6072 return (PyObject*)v;
6073 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 v = _PyUnicode_New(size);
6076 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 e = s + size;
6082 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 register unsigned char c = (unsigned char)*s;
6084 if (c < 128) {
6085 *p++ = c;
6086 ++s;
6087 }
6088 else {
6089 startinpos = s-starts;
6090 endinpos = startinpos + 1;
6091 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6092 if (unicode_decode_call_errorhandler(
6093 errors, &errorHandler,
6094 "ascii", "ordinal not in range(128)",
6095 &starts, &e, &startinpos, &endinpos, &exc, &s,
6096 &v, &outpos, &p))
6097 goto onError;
6098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006100 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 Py_XDECREF(errorHandler);
6104 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006105 if (PyUnicode_READY(v) == -1) {
6106 Py_DECREF(v);
6107 return NULL;
6108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006110
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 Py_XDECREF(errorHandler);
6114 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 return NULL;
6116}
6117
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyObject *
6119PyUnicode_EncodeASCII(const Py_UNICODE *p,
6120 Py_ssize_t size,
6121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124}
6125
Alexander Belopolsky40018472011-02-26 01:02:56 +00006126PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006127_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
6129 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 PyErr_BadArgument();
6131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006133 if (PyUnicode_READY(unicode) == -1)
6134 return NULL;
6135 /* Fast path: if it is an ASCII-only string, construct bytes object
6136 directly. Else defer to above function to raise the exception. */
6137 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6138 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6139 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006142 errors);
6143}
6144
6145PyObject *
6146PyUnicode_AsASCIIString(PyObject *unicode)
6147{
6148 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149}
6150
Victor Stinner99b95382011-07-04 14:23:54 +02006151#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006152
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006153/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006154
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006155#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006156#define NEED_RETRY
6157#endif
6158
6159/* XXX This code is limited to "true" double-byte encodings, as
6160 a) it assumes an incomplete character consists of a single byte, and
6161 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006163
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, so the previous character (as
   located by CharPrev()) is inspected as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;                   /* CharPrev() did not move back */
    if (!IsDBCSLeadByte(*prev))
        return 1;                   /* previous character is not a lead byte */
    return curr - prev == 2;        /* previous character spans two bytes */
}
6175
6176/*
6177 * Decode MBCS string into unicode object. If 'final' is set, converts
6178 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6179 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006180static int
6181decode_mbcs(PyUnicodeObject **v,
6182 const char *s, /* MBCS string */
6183 int size, /* sizeof MBCS string */
6184 int final,
6185 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186{
6187 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006188 Py_ssize_t n;
6189 DWORD usize;
6190 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006191
6192 assert(size >= 0);
6193
Victor Stinner554f3f02010-06-16 23:33:54 +00006194 /* check and handle 'errors' arg */
6195 if (errors==NULL || strcmp(errors, "strict")==0)
6196 flags = MB_ERR_INVALID_CHARS;
6197 else if (strcmp(errors, "ignore")==0)
6198 flags = 0;
6199 else {
6200 PyErr_Format(PyExc_ValueError,
6201 "mbcs encoding does not support errors='%s'",
6202 errors);
6203 return -1;
6204 }
6205
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006206 /* Skip trailing lead-byte unless 'final' is set */
6207 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006209
6210 /* First get the size of the result */
6211 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006212 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6213 if (usize==0)
6214 goto mbcs_decode_error;
6215 } else
6216 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006217
6218 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 /* Create unicode object */
6220 *v = _PyUnicode_New(usize);
6221 if (*v == NULL)
6222 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006223 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006224 }
6225 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* Extend unicode object */
6227 n = PyUnicode_GET_SIZE(*v);
6228 if (_PyUnicode_Resize(v, n + usize) < 0)
6229 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006230 }
6231
6232 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006233 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006235 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6236 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006238 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006239 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006240
6241mbcs_decode_error:
6242 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6243 we raise a UnicodeDecodeError - else it is a 'generic'
6244 windows error
6245 */
6246 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6247 /* Ideally, we should get reason from FormatMessage - this
6248 is the Windows 2000 English version of the message
6249 */
6250 PyObject *exc = NULL;
6251 const char *reason = "No mapping for the Unicode character exists "
6252 "in the target multi-byte code page.";
6253 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6254 if (exc != NULL) {
6255 PyCodec_StrictErrors(exc);
6256 Py_DECREF(exc);
6257 }
6258 } else {
6259 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6260 }
6261 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006262}
6263
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264PyObject *
6265PyUnicode_DecodeMBCSStateful(const char *s,
6266 Py_ssize_t size,
6267 const char *errors,
6268 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006269{
6270 PyUnicodeObject *v = NULL;
6271 int done;
6272
6273 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006275
6276#ifdef NEED_RETRY
6277 retry:
6278 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006279 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006280 else
6281#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006282 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006283
6284 if (done < 0) {
6285 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006287 }
6288
6289 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006291
6292#ifdef NEED_RETRY
6293 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 s += done;
6295 size -= done;
6296 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006297 }
6298#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006299 if (PyUnicode_READY(v) == -1) {
6300 Py_DECREF(v);
6301 return NULL;
6302 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006303 return (PyObject *)v;
6304}
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
6307PyUnicode_DecodeMBCS(const char *s,
6308 Py_ssize_t size,
6309 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006310{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006311 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6312}
6313
6314/*
6315 * Convert unicode into string object (MBCS).
6316 * Returns 0 if succeed, -1 otherwise.
6317 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006318static int
6319encode_mbcs(PyObject **repr,
6320 const Py_UNICODE *p, /* unicode */
6321 int size, /* size of unicode */
6322 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006323{
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 BOOL usedDefaultChar = FALSE;
6325 BOOL *pusedDefaultChar;
6326 int mbcssize;
6327 Py_ssize_t n;
6328 PyObject *exc = NULL;
6329 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006330
6331 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006332
Victor Stinner554f3f02010-06-16 23:33:54 +00006333 /* check and handle 'errors' arg */
6334 if (errors==NULL || strcmp(errors, "strict")==0) {
6335 flags = WC_NO_BEST_FIT_CHARS;
6336 pusedDefaultChar = &usedDefaultChar;
6337 } else if (strcmp(errors, "replace")==0) {
6338 flags = 0;
6339 pusedDefaultChar = NULL;
6340 } else {
6341 PyErr_Format(PyExc_ValueError,
6342 "mbcs encoding does not support errors='%s'",
6343 errors);
6344 return -1;
6345 }
6346
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006347 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006348 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006349 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6350 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 if (mbcssize == 0) {
6352 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6353 return -1;
6354 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006355 /* If we used a default char, then we failed! */
6356 if (pusedDefaultChar && *pusedDefaultChar)
6357 goto mbcs_encode_error;
6358 } else {
6359 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006360 }
6361
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006362 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 /* Create string object */
6364 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6365 if (*repr == NULL)
6366 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006367 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006368 }
6369 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 /* Extend string object */
6371 n = PyBytes_Size(*repr);
6372 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6373 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006374 }
6375
6376 /* Do the conversion */
6377 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006379 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6380 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6382 return -1;
6383 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006384 if (pusedDefaultChar && *pusedDefaultChar)
6385 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006387 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006388
6389mbcs_encode_error:
6390 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6391 Py_XDECREF(exc);
6392 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006393}
6394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
6396PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6397 Py_ssize_t size,
6398 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006399{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006400 PyObject *repr = NULL;
6401 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006402
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006403#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006405 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006406 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006407 else
6408#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006409 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006410
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006411 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 Py_XDECREF(repr);
6413 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006414 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006415
6416#ifdef NEED_RETRY
6417 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 p += INT_MAX;
6419 size -= INT_MAX;
6420 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006421 }
6422#endif
6423
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006424 return repr;
6425}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427PyObject *
6428PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006429{
6430 if (!PyUnicode_Check(unicode)) {
6431 PyErr_BadArgument();
6432 return NULL;
6433 }
6434 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 PyUnicode_GET_SIZE(unicode),
6436 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006437}
6438
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006439#undef NEED_RETRY
6440
Victor Stinner99b95382011-07-04 14:23:54 +02006441#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006442
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443/* --- Character Mapping Codec -------------------------------------------- */
6444
Alexander Belopolsky40018472011-02-26 01:02:56 +00006445PyObject *
6446PyUnicode_DecodeCharmap(const char *s,
6447 Py_ssize_t size,
6448 PyObject *mapping,
6449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 Py_ssize_t startinpos;
6453 Py_ssize_t endinpos;
6454 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 PyUnicodeObject *v;
6457 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006458 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 PyObject *errorHandler = NULL;
6460 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006461 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006463
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 /* Default to Latin-1 */
6465 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
6468 v = _PyUnicode_New(size);
6469 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006475 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 mapstring = PyUnicode_AS_UNICODE(mapping);
6477 maplen = PyUnicode_GET_SIZE(mapping);
6478 while (s < e) {
6479 unsigned char ch = *s;
6480 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 if (ch < maplen)
6483 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 if (x == 0xfffe) {
6486 /* undefined mapping */
6487 outpos = p-PyUnicode_AS_UNICODE(v);
6488 startinpos = s-starts;
6489 endinpos = startinpos+1;
6490 if (unicode_decode_call_errorhandler(
6491 errors, &errorHandler,
6492 "charmap", "character maps to <undefined>",
6493 &starts, &e, &startinpos, &endinpos, &exc, &s,
6494 &v, &outpos, &p)) {
6495 goto onError;
6496 }
6497 continue;
6498 }
6499 *p++ = x;
6500 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006502 }
6503 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 while (s < e) {
6505 unsigned char ch = *s;
6506 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006507
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6509 w = PyLong_FromLong((long)ch);
6510 if (w == NULL)
6511 goto onError;
6512 x = PyObject_GetItem(mapping, w);
6513 Py_DECREF(w);
6514 if (x == NULL) {
6515 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6516 /* No mapping found means: mapping is undefined. */
6517 PyErr_Clear();
6518 x = Py_None;
6519 Py_INCREF(x);
6520 } else
6521 goto onError;
6522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006523
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 /* Apply mapping */
6525 if (PyLong_Check(x)) {
6526 long value = PyLong_AS_LONG(x);
6527 if (value < 0 || value > 65535) {
6528 PyErr_SetString(PyExc_TypeError,
6529 "character mapping must be in range(65536)");
6530 Py_DECREF(x);
6531 goto onError;
6532 }
6533 *p++ = (Py_UNICODE)value;
6534 }
6535 else if (x == Py_None) {
6536 /* undefined mapping */
6537 outpos = p-PyUnicode_AS_UNICODE(v);
6538 startinpos = s-starts;
6539 endinpos = startinpos+1;
6540 if (unicode_decode_call_errorhandler(
6541 errors, &errorHandler,
6542 "charmap", "character maps to <undefined>",
6543 &starts, &e, &startinpos, &endinpos, &exc, &s,
6544 &v, &outpos, &p)) {
6545 Py_DECREF(x);
6546 goto onError;
6547 }
6548 Py_DECREF(x);
6549 continue;
6550 }
6551 else if (PyUnicode_Check(x)) {
6552 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006553
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 if (targetsize == 1)
6555 /* 1-1 mapping */
6556 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006557
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 else if (targetsize > 1) {
6559 /* 1-n mapping */
6560 if (targetsize > extrachars) {
6561 /* resize first */
6562 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6563 Py_ssize_t needed = (targetsize - extrachars) + \
6564 (targetsize << 2);
6565 extrachars += needed;
6566 /* XXX overflow detection missing */
6567 if (_PyUnicode_Resize(&v,
6568 PyUnicode_GET_SIZE(v) + needed) < 0) {
6569 Py_DECREF(x);
6570 goto onError;
6571 }
6572 p = PyUnicode_AS_UNICODE(v) + oldpos;
6573 }
6574 Py_UNICODE_COPY(p,
6575 PyUnicode_AS_UNICODE(x),
6576 targetsize);
6577 p += targetsize;
6578 extrachars -= targetsize;
6579 }
6580 /* 1-0 mapping: skip the character */
6581 }
6582 else {
6583 /* wrong return value */
6584 PyErr_SetString(PyExc_TypeError,
6585 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006586 Py_DECREF(x);
6587 goto onError;
6588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 Py_DECREF(x);
6590 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 }
6593 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6595 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 Py_XDECREF(errorHandler);
6597 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006598 if (PyUnicode_READY(v) == -1) {
6599 Py_DECREF(v);
6600 return NULL;
6601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006603
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006605 Py_XDECREF(errorHandler);
6606 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 Py_XDECREF(v);
6608 return NULL;
6609}
6610
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006611/* Charmap encoding: the lookup table */
6612
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 PyObject_HEAD
6615 unsigned char level1[32];
6616 int count2, count3;
6617 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006618};
6619
6620static PyObject*
6621encoding_map_size(PyObject *obj, PyObject* args)
6622{
6623 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006624 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006626}
6627
6628static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006629 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 PyDoc_STR("Return the size (in bytes) of this object") },
6631 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006632};
6633
6634static void
6635encoding_map_dealloc(PyObject* o)
6636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006637 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006638}
6639
6640static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006641 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 "EncodingMap", /*tp_name*/
6643 sizeof(struct encoding_map), /*tp_basicsize*/
6644 0, /*tp_itemsize*/
6645 /* methods */
6646 encoding_map_dealloc, /*tp_dealloc*/
6647 0, /*tp_print*/
6648 0, /*tp_getattr*/
6649 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006650 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 0, /*tp_repr*/
6652 0, /*tp_as_number*/
6653 0, /*tp_as_sequence*/
6654 0, /*tp_as_mapping*/
6655 0, /*tp_hash*/
6656 0, /*tp_call*/
6657 0, /*tp_str*/
6658 0, /*tp_getattro*/
6659 0, /*tp_setattro*/
6660 0, /*tp_as_buffer*/
6661 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6662 0, /*tp_doc*/
6663 0, /*tp_traverse*/
6664 0, /*tp_clear*/
6665 0, /*tp_richcompare*/
6666 0, /*tp_weaklistoffset*/
6667 0, /*tp_iter*/
6668 0, /*tp_iternext*/
6669 encoding_map_methods, /*tp_methods*/
6670 0, /*tp_members*/
6671 0, /*tp_getset*/
6672 0, /*tp_base*/
6673 0, /*tp_dict*/
6674 0, /*tp_descr_get*/
6675 0, /*tp_descr_set*/
6676 0, /*tp_dictoffset*/
6677 0, /*tp_init*/
6678 0, /*tp_alloc*/
6679 0, /*tp_new*/
6680 0, /*tp_free*/
6681 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006682};
6683
6684PyObject*
6685PyUnicode_BuildEncodingMap(PyObject* string)
6686{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006687 PyObject *result;
6688 struct encoding_map *mresult;
6689 int i;
6690 int need_dict = 0;
6691 unsigned char level1[32];
6692 unsigned char level2[512];
6693 unsigned char *mlevel1, *mlevel2, *mlevel3;
6694 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006695 int kind;
6696 void *data;
6697 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006699 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006700 PyErr_BadArgument();
6701 return NULL;
6702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006703 kind = PyUnicode_KIND(string);
6704 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006705 memset(level1, 0xFF, sizeof level1);
6706 memset(level2, 0xFF, sizeof level2);
6707
6708 /* If there isn't a one-to-one mapping of NULL to \0,
6709 or if there are non-BMP characters, we need to use
6710 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006711 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006712 need_dict = 1;
6713 for (i = 1; i < 256; i++) {
6714 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006715 ch = PyUnicode_READ(kind, data, i);
6716 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006717 need_dict = 1;
6718 break;
6719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006721 /* unmapped character */
6722 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723 l1 = ch >> 11;
6724 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006725 if (level1[l1] == 0xFF)
6726 level1[l1] = count2++;
6727 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006728 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006729 }
6730
6731 if (count2 >= 0xFF || count3 >= 0xFF)
6732 need_dict = 1;
6733
6734 if (need_dict) {
6735 PyObject *result = PyDict_New();
6736 PyObject *key, *value;
6737 if (!result)
6738 return NULL;
6739 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006741 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006742 if (!key || !value)
6743 goto failed1;
6744 if (PyDict_SetItem(result, key, value) == -1)
6745 goto failed1;
6746 Py_DECREF(key);
6747 Py_DECREF(value);
6748 }
6749 return result;
6750 failed1:
6751 Py_XDECREF(key);
6752 Py_XDECREF(value);
6753 Py_DECREF(result);
6754 return NULL;
6755 }
6756
6757 /* Create a three-level trie */
6758 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6759 16*count2 + 128*count3 - 1);
6760 if (!result)
6761 return PyErr_NoMemory();
6762 PyObject_Init(result, &EncodingMapType);
6763 mresult = (struct encoding_map*)result;
6764 mresult->count2 = count2;
6765 mresult->count3 = count3;
6766 mlevel1 = mresult->level1;
6767 mlevel2 = mresult->level23;
6768 mlevel3 = mresult->level23 + 16*count2;
6769 memcpy(mlevel1, level1, 32);
6770 memset(mlevel2, 0xFF, 16*count2);
6771 memset(mlevel3, 0, 128*count3);
6772 count3 = 0;
6773 for (i = 1; i < 256; i++) {
6774 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006776 /* unmapped character */
6777 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778 o1 = PyUnicode_READ(kind, data, i)>>11;
6779 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006780 i2 = 16*mlevel1[o1] + o2;
6781 if (mlevel2[i2] == 0xFF)
6782 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006784 i3 = 128*mlevel2[i2] + o3;
6785 mlevel3[i3] = i;
6786 }
6787 return result;
6788}
6789
6790static int
6791encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6792{
6793 struct encoding_map *map = (struct encoding_map*)mapping;
6794 int l1 = c>>11;
6795 int l2 = (c>>7) & 0xF;
6796 int l3 = c & 0x7F;
6797 int i;
6798
6799#ifdef Py_UNICODE_WIDE
6800 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006802 }
6803#endif
6804 if (c == 0)
6805 return 0;
6806 /* level 1*/
6807 i = map->level1[l1];
6808 if (i == 0xFF) {
6809 return -1;
6810 }
6811 /* level 2*/
6812 i = map->level23[16*i+l2];
6813 if (i == 0xFF) {
6814 return -1;
6815 }
6816 /* level 3 */
6817 i = map->level23[16*map->count2 + 128*i + l3];
6818 if (i == 0) {
6819 return -1;
6820 }
6821 return i;
6822}
6823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824/* Lookup the character ch in the mapping. If the character
6825 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006826 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006827static PyObject *
6828charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829{
Christian Heimes217cfd12007-12-02 14:31:20 +00006830 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831 PyObject *x;
6832
6833 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 x = PyObject_GetItem(mapping, w);
6836 Py_DECREF(w);
6837 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6839 /* No mapping found means: mapping is undefined. */
6840 PyErr_Clear();
6841 x = Py_None;
6842 Py_INCREF(x);
6843 return x;
6844 } else
6845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006847 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006849 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 long value = PyLong_AS_LONG(x);
6851 if (value < 0 || value > 255) {
6852 PyErr_SetString(PyExc_TypeError,
6853 "character mapping must be in range(256)");
6854 Py_DECREF(x);
6855 return NULL;
6856 }
6857 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006859 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 /* wrong return value */
6863 PyErr_Format(PyExc_TypeError,
6864 "character mapping must return integer, bytes or None, not %.400s",
6865 x->ob_type->tp_name);
6866 Py_DECREF(x);
6867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 }
6869}
6870
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006871static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006872charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006874 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6875 /* exponentially overallocate to minimize reallocations */
6876 if (requiredsize < 2*outsize)
6877 requiredsize = 2*outsize;
6878 if (_PyBytes_Resize(outobj, requiredsize))
6879 return -1;
6880 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006881}
6882
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006887 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 space is available. Return a new reference to the object that
6889 was put in the output buffer, or Py_None, if the mapping was undefined
6890 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006891 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006892static charmapencode_result
6893charmapencode_output(Py_UNICODE c, PyObject *mapping,
6894 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006896 PyObject *rep;
6897 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006898 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899
Christian Heimes90aa7642007-12-19 02:45:37 +00006900 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006901 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006903 if (res == -1)
6904 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 if (outsize<requiredsize)
6906 if (charmapencode_resize(outobj, outpos, requiredsize))
6907 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006908 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 outstart[(*outpos)++] = (char)res;
6910 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006911 }
6912
6913 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006916 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 Py_DECREF(rep);
6918 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006919 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 if (PyLong_Check(rep)) {
6921 Py_ssize_t requiredsize = *outpos+1;
6922 if (outsize<requiredsize)
6923 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6924 Py_DECREF(rep);
6925 return enc_EXCEPTION;
6926 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006927 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 else {
6931 const char *repchars = PyBytes_AS_STRING(rep);
6932 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6933 Py_ssize_t requiredsize = *outpos+repsize;
6934 if (outsize<requiredsize)
6935 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6936 Py_DECREF(rep);
6937 return enc_EXCEPTION;
6938 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006939 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 memcpy(outstart + *outpos, repchars, repsize);
6941 *outpos += repsize;
6942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006944 Py_DECREF(rep);
6945 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946}
6947
6948/* handle an error in PyUnicode_EncodeCharmap
6949 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950static int
6951charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006952 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006953 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006954 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006955 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006956{
6957 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006958 Py_ssize_t repsize;
6959 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 Py_UNICODE *uni2;
6961 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006962 Py_ssize_t collstartpos = *inpos;
6963 Py_ssize_t collendpos = *inpos+1;
6964 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006965 char *encoding = "charmap";
6966 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006967 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 /* find all unencodable characters */
6970 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006971 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006972 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 int res = encoding_map_lookup(p[collendpos], mapping);
6974 if (res != -1)
6975 break;
6976 ++collendpos;
6977 continue;
6978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 rep = charmapencode_lookup(p[collendpos], mapping);
6981 if (rep==NULL)
6982 return -1;
6983 else if (rep!=Py_None) {
6984 Py_DECREF(rep);
6985 break;
6986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006987 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 }
6990 /* cache callback name lookup
6991 * (if not done yet, i.e. it's the first error) */
6992 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 if ((errors==NULL) || (!strcmp(errors, "strict")))
6994 *known_errorHandler = 1;
6995 else if (!strcmp(errors, "replace"))
6996 *known_errorHandler = 2;
6997 else if (!strcmp(errors, "ignore"))
6998 *known_errorHandler = 3;
6999 else if (!strcmp(errors, "xmlcharrefreplace"))
7000 *known_errorHandler = 4;
7001 else
7002 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 }
7004 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005 case 1: /* strict */
7006 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7007 return -1;
7008 case 2: /* replace */
7009 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 x = charmapencode_output('?', mapping, res, respos);
7011 if (x==enc_EXCEPTION) {
7012 return -1;
7013 }
7014 else if (x==enc_FAILED) {
7015 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7016 return -1;
7017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007018 }
7019 /* fall through */
7020 case 3: /* ignore */
7021 *inpos = collendpos;
7022 break;
7023 case 4: /* xmlcharrefreplace */
7024 /* generate replacement (temporarily (mis)uses p) */
7025 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 char buffer[2+29+1+1];
7027 char *cp;
7028 sprintf(buffer, "&#%d;", (int)p[collpos]);
7029 for (cp = buffer; *cp; ++cp) {
7030 x = charmapencode_output(*cp, mapping, res, respos);
7031 if (x==enc_EXCEPTION)
7032 return -1;
7033 else if (x==enc_FAILED) {
7034 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7035 return -1;
7036 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007037 }
7038 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007039 *inpos = collendpos;
7040 break;
7041 default:
7042 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 encoding, reason, p, size, exceptionObject,
7044 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007045 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007047 if (PyBytes_Check(repunicode)) {
7048 /* Directly copy bytes result to output. */
7049 Py_ssize_t outsize = PyBytes_Size(*res);
7050 Py_ssize_t requiredsize;
7051 repsize = PyBytes_Size(repunicode);
7052 requiredsize = *respos + repsize;
7053 if (requiredsize > outsize)
7054 /* Make room for all additional bytes. */
7055 if (charmapencode_resize(res, respos, requiredsize)) {
7056 Py_DECREF(repunicode);
7057 return -1;
7058 }
7059 memcpy(PyBytes_AsString(*res) + *respos,
7060 PyBytes_AsString(repunicode), repsize);
7061 *respos += repsize;
7062 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007063 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007064 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007065 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007066 /* generate replacement */
7067 repsize = PyUnicode_GET_SIZE(repunicode);
7068 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 x = charmapencode_output(*uni2, mapping, res, respos);
7070 if (x==enc_EXCEPTION) {
7071 return -1;
7072 }
7073 else if (x==enc_FAILED) {
7074 Py_DECREF(repunicode);
7075 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7076 return -1;
7077 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007078 }
7079 *inpos = newpos;
7080 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081 }
7082 return 0;
7083}
7084
Alexander Belopolsky40018472011-02-26 01:02:56 +00007085PyObject *
7086PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7087 Py_ssize_t size,
7088 PyObject *mapping,
7089 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 /* output object */
7092 PyObject *res = NULL;
7093 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007094 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007096 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097 PyObject *errorHandler = NULL;
7098 PyObject *exc = NULL;
7099 /* the following variable is used for caching string comparisons
7100 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7101 * 3=ignore, 4=xmlcharrefreplace */
7102 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103
7104 /* Default to Latin-1 */
7105 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 /* allocate enough for a simple encoding without
7109 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007110 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 if (res == NULL)
7112 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007113 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 /* try to encode it */
7118 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7119 if (x==enc_EXCEPTION) /* error */
7120 goto onError;
7121 if (x==enc_FAILED) { /* unencodable character */
7122 if (charmap_encoding_error(p, size, &inpos, mapping,
7123 &exc,
7124 &known_errorHandler, &errorHandler, errors,
7125 &res, &respos)) {
7126 goto onError;
7127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 else
7130 /* done with this character => adjust input position */
7131 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007135 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007136 if (_PyBytes_Resize(&res, respos) < 0)
7137 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 Py_XDECREF(exc);
7140 Py_XDECREF(errorHandler);
7141 return res;
7142
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007144 Py_XDECREF(res);
7145 Py_XDECREF(exc);
7146 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 return NULL;
7148}
7149
Alexander Belopolsky40018472011-02-26 01:02:56 +00007150PyObject *
7151PyUnicode_AsCharmapString(PyObject *unicode,
7152 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153{
7154 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 PyErr_BadArgument();
7156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 }
7158 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 PyUnicode_GET_SIZE(unicode),
7160 mapping,
7161 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165static void
7166make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007167 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007168 Py_ssize_t startpos, Py_ssize_t endpos,
7169 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007171 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 *exceptionObject = _PyUnicodeTranslateError_Create(
7173 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 }
7175 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7177 goto onError;
7178 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7179 goto onError;
7180 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7181 goto onError;
7182 return;
7183 onError:
7184 Py_DECREF(*exceptionObject);
7185 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 }
7187}
7188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007190static void
7191raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007193 Py_ssize_t startpos, Py_ssize_t endpos,
7194 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195{
7196 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007197 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200}
7201
7202/* error handling callback helper:
7203 build arguments, call the callback and check the arguments,
7204 put the result into newpos and return the replacement string, which
7205 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007206static PyObject *
7207unicode_translate_call_errorhandler(const char *errors,
7208 PyObject **errorHandler,
7209 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007210 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007211 Py_ssize_t startpos, Py_ssize_t endpos,
7212 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007214 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007216 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 PyObject *restuple;
7218 PyObject *resunicode;
7219
7220 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007222 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007224 }
7225
7226 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007227 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007228 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007230
7231 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007236 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 Py_DECREF(restuple);
7238 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239 }
7240 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 &resunicode, &i_newpos)) {
7242 Py_DECREF(restuple);
7243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007244 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007245 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007246 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007247 else
7248 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007249 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7251 Py_DECREF(restuple);
7252 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007253 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007254 Py_INCREF(resunicode);
7255 Py_DECREF(restuple);
7256 return resunicode;
7257}
7258
7259/* Lookup the character ch in the mapping and put the result in result,
7260 which must be decrefed by the caller.
7261 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007262static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007263charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264{
Christian Heimes217cfd12007-12-02 14:31:20 +00007265 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007266 PyObject *x;
7267
7268 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007270 x = PyObject_GetItem(mapping, w);
7271 Py_DECREF(w);
7272 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7274 /* No mapping found means: use 1:1 mapping. */
7275 PyErr_Clear();
7276 *result = NULL;
7277 return 0;
7278 } else
7279 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007280 }
7281 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 *result = x;
7283 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007285 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 long value = PyLong_AS_LONG(x);
7287 long max = PyUnicode_GetMax();
7288 if (value < 0 || value > max) {
7289 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007290 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 Py_DECREF(x);
7292 return -1;
7293 }
7294 *result = x;
7295 return 0;
7296 }
7297 else if (PyUnicode_Check(x)) {
7298 *result = x;
7299 return 0;
7300 }
7301 else {
7302 /* wrong return value */
7303 PyErr_SetString(PyExc_TypeError,
7304 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007305 Py_DECREF(x);
7306 return -1;
7307 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308}
7309/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 if not reallocate and adjust various state variables.
7311 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007312static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007313charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007315{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007316 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007317 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 /* exponentially overallocate to minimize reallocations */
7319 if (requiredsize < 2 * oldsize)
7320 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007321 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7322 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007324 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007325 }
7326 return 0;
7327}
7328/* lookup the character, put the result in the output string and adjust
7329 various state variables. Return a new reference to the object that
7330 was put in the output buffer in *result, or Py_None, if the mapping was
7331 undefined (in which case no character was written).
7332 The called must decref result.
7333 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007334static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007335charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7336 PyObject *mapping, Py_UCS4 **output,
7337 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007338 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007340 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7341 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007343 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007345 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007346 }
7347 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007349 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352 }
7353 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007354 Py_ssize_t repsize;
7355 if (PyUnicode_READY(*res) == -1)
7356 return -1;
7357 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 if (repsize==1) {
7359 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007360 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 }
7362 else if (repsize!=0) {
7363 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364 Py_ssize_t requiredsize = *opos +
7365 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367 Py_ssize_t i;
7368 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007370 for(i = 0; i < repsize; i++)
7371 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 }
7374 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007376 return 0;
7377}
7378
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380_PyUnicode_TranslateCharmap(PyObject *input,
7381 PyObject *mapping,
7382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 /* input object */
7385 char *idata;
7386 Py_ssize_t size, i;
7387 int kind;
7388 /* output buffer */
7389 Py_UCS4 *output = NULL;
7390 Py_ssize_t osize;
7391 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007393 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 char *reason = "character maps to <undefined>";
7395 PyObject *errorHandler = NULL;
7396 PyObject *exc = NULL;
7397 /* the following variable is used for caching string comparisons
7398 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7399 * 3=ignore, 4=xmlcharrefreplace */
7400 int known_errorHandler = -1;
7401
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 PyErr_BadArgument();
7404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407 if (PyUnicode_READY(input) == -1)
7408 return NULL;
7409 idata = (char*)PyUnicode_DATA(input);
7410 kind = PyUnicode_KIND(input);
7411 size = PyUnicode_GET_LENGTH(input);
7412 i = 0;
7413
7414 if (size == 0) {
7415 Py_INCREF(input);
7416 return input;
7417 }
7418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419 /* allocate enough for a simple 1:1 translation without
7420 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007421 osize = size;
7422 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7423 opos = 0;
7424 if (output == NULL) {
7425 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 /* try to encode it */
7431 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007432 if (charmaptranslate_output(input, i, mapping,
7433 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 Py_XDECREF(x);
7435 goto onError;
7436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007437 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007439 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 else { /* untranslatable character */
7441 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7442 Py_ssize_t repsize;
7443 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007446 Py_ssize_t collstart = i;
7447 Py_ssize_t collend = i+1;
7448 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007451 while (collend < size) {
7452 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 goto onError;
7454 Py_XDECREF(x);
7455 if (x!=Py_None)
7456 break;
7457 ++collend;
7458 }
7459 /* cache callback name lookup
7460 * (if not done yet, i.e. it's the first error) */
7461 if (known_errorHandler==-1) {
7462 if ((errors==NULL) || (!strcmp(errors, "strict")))
7463 known_errorHandler = 1;
7464 else if (!strcmp(errors, "replace"))
7465 known_errorHandler = 2;
7466 else if (!strcmp(errors, "ignore"))
7467 known_errorHandler = 3;
7468 else if (!strcmp(errors, "xmlcharrefreplace"))
7469 known_errorHandler = 4;
7470 else
7471 known_errorHandler = 0;
7472 }
7473 switch (known_errorHandler) {
7474 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475 raise_translate_exception(&exc, input, collstart,
7476 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 case 2: /* replace */
7479 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007480 for (coll = collstart; coll<collend; coll++)
7481 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 /* fall through */
7483 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 break;
7486 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007487 /* generate replacement (temporarily (mis)uses i) */
7488 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 char buffer[2+29+1+1];
7490 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7492 if (charmaptranslate_makespace(&output, &osize,
7493 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 goto onError;
7495 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 break;
7500 default:
7501 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007502 reason, input, &exc,
7503 collstart, collend, &newpos);
7504 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 goto onError;
7506 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 repsize = PyUnicode_GET_LENGTH(repunicode);
7508 if (charmaptranslate_makespace(&output, &osize,
7509 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 Py_DECREF(repunicode);
7511 goto onError;
7512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007513 for (uni2 = 0; repsize-->0; ++uni2)
7514 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7515 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007517 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007518 }
7519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7521 if (!res)
7522 goto onError;
7523 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524 Py_XDECREF(exc);
7525 Py_XDECREF(errorHandler);
7526 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 Py_XDECREF(exc);
7531 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return NULL;
7533}
7534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007535/* Deprecated. Use PyUnicode_Translate instead. */
7536PyObject *
7537PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7538 Py_ssize_t size,
7539 PyObject *mapping,
7540 const char *errors)
7541{
7542 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7543 if (!unicode)
7544 return NULL;
7545 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7546}
7547
Alexander Belopolsky40018472011-02-26 01:02:56 +00007548PyObject *
7549PyUnicode_Translate(PyObject *str,
7550 PyObject *mapping,
7551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552{
7553 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007554
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 str = PyUnicode_FromObject(str);
7556 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007558 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 Py_DECREF(str);
7560 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007561
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 Py_XDECREF(str);
7564 return NULL;
7565}
Tim Petersced69f82003-09-16 20:30:58 +00007566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567static Py_UCS4
7568fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7569{
7570 /* No need to call PyUnicode_READY(self) because this function is only
7571 called as a callback from fixup() which does it already. */
7572 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7573 const int kind = PyUnicode_KIND(self);
7574 void *data = PyUnicode_DATA(self);
7575 Py_UCS4 maxchar = 0, ch, fixed;
7576 Py_ssize_t i;
7577
7578 for (i = 0; i < len; ++i) {
7579 ch = PyUnicode_READ(kind, data, i);
7580 fixed = 0;
7581 if (ch > 127) {
7582 if (Py_UNICODE_ISSPACE(ch))
7583 fixed = ' ';
7584 else {
7585 const int decimal = Py_UNICODE_TODECIMAL(ch);
7586 if (decimal >= 0)
7587 fixed = '0' + decimal;
7588 }
7589 if (fixed != 0) {
7590 if (fixed > maxchar)
7591 maxchar = fixed;
7592 PyUnicode_WRITE(kind, data, i, fixed);
7593 }
7594 else if (ch > maxchar)
7595 maxchar = ch;
7596 }
7597 else if (ch > maxchar)
7598 maxchar = ch;
7599 }
7600
7601 return maxchar;
7602}
7603
7604PyObject *
7605_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7606{
7607 if (!PyUnicode_Check(unicode)) {
7608 PyErr_BadInternalCall();
7609 return NULL;
7610 }
7611 if (PyUnicode_READY(unicode) == -1)
7612 return NULL;
7613 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7614 /* If the string is already ASCII, just return the same string */
7615 Py_INCREF(unicode);
7616 return unicode;
7617 }
7618 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7619}
7620
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007621PyObject *
7622PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7623 Py_ssize_t length)
7624{
7625 PyObject *result;
7626 Py_UNICODE *p; /* write pointer into result */
7627 Py_ssize_t i;
7628 /* Copy to a new string */
7629 result = (PyObject *)_PyUnicode_New(length);
7630 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7631 if (result == NULL)
7632 return result;
7633 p = PyUnicode_AS_UNICODE(result);
7634 /* Iterate over code points */
7635 for (i = 0; i < length; i++) {
7636 Py_UNICODE ch =s[i];
7637 if (ch > 127) {
7638 int decimal = Py_UNICODE_TODECIMAL(ch);
7639 if (decimal >= 0)
7640 p[i] = '0' + decimal;
7641 }
7642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007643 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7644 Py_DECREF(result);
7645 return NULL;
7646 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007647 return result;
7648}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007649/* --- Decimal Encoder ---------------------------------------------------- */
7650
Alexander Belopolsky40018472011-02-26 01:02:56 +00007651int
7652PyUnicode_EncodeDecimal(Py_UNICODE *s,
7653 Py_ssize_t length,
7654 char *output,
7655 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007656{
7657 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 PyObject *errorHandler = NULL;
7659 PyObject *exc = NULL;
7660 const char *encoding = "decimal";
7661 const char *reason = "invalid decimal Unicode string";
7662 /* the following variable is used for caching string comparisons
7663 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7664 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007665
7666 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 PyErr_BadArgument();
7668 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007669 }
7670
7671 p = s;
7672 end = s + length;
7673 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 register Py_UNICODE ch = *p;
7675 int decimal;
7676 PyObject *repunicode;
7677 Py_ssize_t repsize;
7678 Py_ssize_t newpos;
7679 Py_UNICODE *uni2;
7680 Py_UNICODE *collstart;
7681 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007682
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 ++p;
7686 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007687 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 decimal = Py_UNICODE_TODECIMAL(ch);
7689 if (decimal >= 0) {
7690 *output++ = '0' + decimal;
7691 ++p;
7692 continue;
7693 }
7694 if (0 < ch && ch < 256) {
7695 *output++ = (char)ch;
7696 ++p;
7697 continue;
7698 }
7699 /* All other characters are considered unencodable */
7700 collstart = p;
7701 collend = p+1;
7702 while (collend < end) {
7703 if ((0 < *collend && *collend < 256) ||
7704 !Py_UNICODE_ISSPACE(*collend) ||
7705 Py_UNICODE_TODECIMAL(*collend))
7706 break;
7707 }
7708 /* cache callback name lookup
7709 * (if not done yet, i.e. it's the first error) */
7710 if (known_errorHandler==-1) {
7711 if ((errors==NULL) || (!strcmp(errors, "strict")))
7712 known_errorHandler = 1;
7713 else if (!strcmp(errors, "replace"))
7714 known_errorHandler = 2;
7715 else if (!strcmp(errors, "ignore"))
7716 known_errorHandler = 3;
7717 else if (!strcmp(errors, "xmlcharrefreplace"))
7718 known_errorHandler = 4;
7719 else
7720 known_errorHandler = 0;
7721 }
7722 switch (known_errorHandler) {
7723 case 1: /* strict */
7724 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7725 goto onError;
7726 case 2: /* replace */
7727 for (p = collstart; p < collend; ++p)
7728 *output++ = '?';
7729 /* fall through */
7730 case 3: /* ignore */
7731 p = collend;
7732 break;
7733 case 4: /* xmlcharrefreplace */
7734 /* generate replacement (temporarily (mis)uses p) */
7735 for (p = collstart; p < collend; ++p)
7736 output += sprintf(output, "&#%d;", (int)*p);
7737 p = collend;
7738 break;
7739 default:
7740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7741 encoding, reason, s, length, &exc,
7742 collstart-s, collend-s, &newpos);
7743 if (repunicode == NULL)
7744 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007745 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007746 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007747 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7748 Py_DECREF(repunicode);
7749 goto onError;
7750 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 /* generate replacement */
7752 repsize = PyUnicode_GET_SIZE(repunicode);
7753 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7754 Py_UNICODE ch = *uni2;
7755 if (Py_UNICODE_ISSPACE(ch))
7756 *output++ = ' ';
7757 else {
7758 decimal = Py_UNICODE_TODECIMAL(ch);
7759 if (decimal >= 0)
7760 *output++ = '0' + decimal;
7761 else if (0 < ch && ch < 256)
7762 *output++ = (char)ch;
7763 else {
7764 Py_DECREF(repunicode);
7765 raise_encode_exception(&exc, encoding,
7766 s, length, collstart-s, collend-s, reason);
7767 goto onError;
7768 }
7769 }
7770 }
7771 p = s + newpos;
7772 Py_DECREF(repunicode);
7773 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007774 }
7775 /* 0-terminate the output string */
7776 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 Py_XDECREF(exc);
7778 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007779 return 0;
7780
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782 Py_XDECREF(exc);
7783 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007784 return -1;
7785}
7786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787/* --- Helpers ------------------------------------------------------------ */
7788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007789#include "stringlib/ucs1lib.h"
7790#include "stringlib/fastsearch.h"
7791#include "stringlib/partition.h"
7792#include "stringlib/split.h"
7793#include "stringlib/count.h"
7794#include "stringlib/find.h"
7795#include "stringlib/localeutil.h"
7796#include "stringlib/undef.h"
7797
7798#include "stringlib/ucs2lib.h"
7799#include "stringlib/fastsearch.h"
7800#include "stringlib/partition.h"
7801#include "stringlib/split.h"
7802#include "stringlib/count.h"
7803#include "stringlib/find.h"
7804#include "stringlib/localeutil.h"
7805#include "stringlib/undef.h"
7806
7807#include "stringlib/ucs4lib.h"
7808#include "stringlib/fastsearch.h"
7809#include "stringlib/partition.h"
7810#include "stringlib/split.h"
7811#include "stringlib/count.h"
7812#include "stringlib/find.h"
7813#include "stringlib/localeutil.h"
7814#include "stringlib/undef.h"
7815
7816static Py_ssize_t
7817any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7818 const Py_UCS1*, Py_ssize_t,
7819 Py_ssize_t, Py_ssize_t),
7820 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7821 const Py_UCS2*, Py_ssize_t,
7822 Py_ssize_t, Py_ssize_t),
7823 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7824 const Py_UCS4*, Py_ssize_t,
7825 Py_ssize_t, Py_ssize_t),
7826 PyObject* s1, PyObject* s2,
7827 Py_ssize_t start,
7828 Py_ssize_t end)
7829{
7830 int kind1, kind2, kind;
7831 void *buf1, *buf2;
7832 Py_ssize_t len1, len2, result;
7833
7834 kind1 = PyUnicode_KIND(s1);
7835 kind2 = PyUnicode_KIND(s2);
7836 kind = kind1 > kind2 ? kind1 : kind2;
7837 buf1 = PyUnicode_DATA(s1);
7838 buf2 = PyUnicode_DATA(s2);
7839 if (kind1 != kind)
7840 buf1 = _PyUnicode_AsKind(s1, kind);
7841 if (!buf1)
7842 return -2;
7843 if (kind2 != kind)
7844 buf2 = _PyUnicode_AsKind(s2, kind);
7845 if (!buf2) {
7846 if (kind1 != kind) PyMem_Free(buf1);
7847 return -2;
7848 }
7849 len1 = PyUnicode_GET_LENGTH(s1);
7850 len2 = PyUnicode_GET_LENGTH(s2);
7851
7852 switch(kind) {
7853 case PyUnicode_1BYTE_KIND:
7854 result = ucs1(buf1, len1, buf2, len2, start, end);
7855 break;
7856 case PyUnicode_2BYTE_KIND:
7857 result = ucs2(buf1, len1, buf2, len2, start, end);
7858 break;
7859 case PyUnicode_4BYTE_KIND:
7860 result = ucs4(buf1, len1, buf2, len2, start, end);
7861 break;
7862 default:
7863 assert(0); result = -2;
7864 }
7865
7866 if (kind1 != kind)
7867 PyMem_Free(buf1);
7868 if (kind2 != kind)
7869 PyMem_Free(buf2);
7870
7871 return result;
7872}
7873
7874Py_ssize_t
7875_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7876 Py_ssize_t n_buffer,
7877 void *digits, Py_ssize_t n_digits,
7878 Py_ssize_t min_width,
7879 const char *grouping,
7880 const char *thousands_sep)
7881{
7882 switch(kind) {
7883 case PyUnicode_1BYTE_KIND:
7884 return _PyUnicode_ucs1_InsertThousandsGrouping(
7885 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7886 min_width, grouping, thousands_sep);
7887 case PyUnicode_2BYTE_KIND:
7888 return _PyUnicode_ucs2_InsertThousandsGrouping(
7889 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7890 min_width, grouping, thousands_sep);
7891 case PyUnicode_4BYTE_KIND:
7892 return _PyUnicode_ucs4_InsertThousandsGrouping(
7893 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7894 min_width, grouping, thousands_sep);
7895 }
7896 assert(0);
7897 return -1;
7898}
7899
7900
Eric Smith8c663262007-08-25 02:26:07 +00007901#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007902#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007903
Thomas Wouters477c8d52006-05-27 19:21:47 +00007904#include "stringlib/count.h"
7905#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007906
/* helper macro to fixup start/end slice values.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added
   to it and is then floored at 0.  `start` is not clamped to `len`, and
   nothing here orders `start` relative to `end`.

   `start` and `end` are assigned to, so they must be lvalues.  The
   arguments are expanded unparenthesized and more than once, and the
   expansion is a bare sequence of two if-statements rather than a single
   statement, so the macro must not be used as the unbraced body of an
   if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007921
Alexander Belopolsky40018472011-02-26 01:02:56 +00007922Py_ssize_t
7923PyUnicode_Count(PyObject *str,
7924 PyObject *substr,
7925 Py_ssize_t start,
7926 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007928 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007929 PyUnicodeObject* str_obj;
7930 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931 int kind1, kind2, kind;
7932 void *buf1 = NULL, *buf2 = NULL;
7933 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007934
Thomas Wouters477c8d52006-05-27 19:21:47 +00007935 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007936 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007938 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 Py_DECREF(str_obj);
7941 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 }
Tim Petersced69f82003-09-16 20:30:58 +00007943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 kind1 = PyUnicode_KIND(str_obj);
7945 kind2 = PyUnicode_KIND(sub_obj);
7946 kind = kind1 > kind2 ? kind1 : kind2;
7947 buf1 = PyUnicode_DATA(str_obj);
7948 if (kind1 != kind)
7949 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7950 if (!buf1)
7951 goto onError;
7952 buf2 = PyUnicode_DATA(sub_obj);
7953 if (kind2 != kind)
7954 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7955 if (!buf2)
7956 goto onError;
7957 len1 = PyUnicode_GET_LENGTH(str_obj);
7958 len2 = PyUnicode_GET_LENGTH(sub_obj);
7959
7960 ADJUST_INDICES(start, end, len1);
7961 switch(kind) {
7962 case PyUnicode_1BYTE_KIND:
7963 result = ucs1lib_count(
7964 ((Py_UCS1*)buf1) + start, end - start,
7965 buf2, len2, PY_SSIZE_T_MAX
7966 );
7967 break;
7968 case PyUnicode_2BYTE_KIND:
7969 result = ucs2lib_count(
7970 ((Py_UCS2*)buf1) + start, end - start,
7971 buf2, len2, PY_SSIZE_T_MAX
7972 );
7973 break;
7974 case PyUnicode_4BYTE_KIND:
7975 result = ucs4lib_count(
7976 ((Py_UCS4*)buf1) + start, end - start,
7977 buf2, len2, PY_SSIZE_T_MAX
7978 );
7979 break;
7980 default:
7981 assert(0); result = 0;
7982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007983
7984 Py_DECREF(sub_obj);
7985 Py_DECREF(str_obj);
7986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007987 if (kind1 != kind)
7988 PyMem_Free(buf1);
7989 if (kind2 != kind)
7990 PyMem_Free(buf2);
7991
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 onError:
7994 Py_DECREF(sub_obj);
7995 Py_DECREF(str_obj);
7996 if (kind1 != kind && buf1)
7997 PyMem_Free(buf1);
7998 if (kind2 != kind && buf2)
7999 PyMem_Free(buf2);
8000 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001}
8002
Alexander Belopolsky40018472011-02-26 01:02:56 +00008003Py_ssize_t
8004PyUnicode_Find(PyObject *str,
8005 PyObject *sub,
8006 Py_ssize_t start,
8007 Py_ssize_t end,
8008 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008015 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008016 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_DECREF(str);
8018 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 }
Tim Petersced69f82003-09-16 20:30:58 +00008020
Thomas Wouters477c8d52006-05-27 19:21:47 +00008021 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 result = any_find_slice(
8023 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8024 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008025 );
8026 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 result = any_find_slice(
8028 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8029 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008030 );
8031
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008033 Py_DECREF(sub);
8034
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 return result;
8036}
8037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038Py_ssize_t
8039PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8040 Py_ssize_t start, Py_ssize_t end,
8041 int direction)
8042{
8043 char *result;
8044 int kind;
8045 if (PyUnicode_READY(str) == -1)
8046 return -2;
8047 if (end > PyUnicode_GET_LENGTH(str))
8048 end = PyUnicode_GET_LENGTH(str);
8049 kind = PyUnicode_KIND(str);
8050 result = findchar(PyUnicode_1BYTE_DATA(str)
8051 + PyUnicode_KIND_SIZE(kind, start),
8052 kind,
8053 end-start, ch, direction);
8054 if (!result)
8055 return -1;
8056 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8057}
8058
Alexander Belopolsky40018472011-02-26 01:02:56 +00008059static int
8060tailmatch(PyUnicodeObject *self,
8061 PyUnicodeObject *substring,
8062 Py_ssize_t start,
8063 Py_ssize_t end,
8064 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 int kind_self;
8067 int kind_sub;
8068 void *data_self;
8069 void *data_sub;
8070 Py_ssize_t offset;
8071 Py_ssize_t i;
8072 Py_ssize_t end_sub;
8073
8074 if (PyUnicode_READY(self) == -1 ||
8075 PyUnicode_READY(substring) == -1)
8076 return 0;
8077
8078 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 return 1;
8080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8082 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 kind_self = PyUnicode_KIND(self);
8087 data_self = PyUnicode_DATA(self);
8088 kind_sub = PyUnicode_KIND(substring);
8089 data_sub = PyUnicode_DATA(substring);
8090 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8091
8092 if (direction > 0)
8093 offset = end;
8094 else
8095 offset = start;
8096
8097 if (PyUnicode_READ(kind_self, data_self, offset) ==
8098 PyUnicode_READ(kind_sub, data_sub, 0) &&
8099 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8100 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8101 /* If both are of the same kind, memcmp is sufficient */
8102 if (kind_self == kind_sub) {
8103 return ! memcmp((char *)data_self +
8104 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8105 data_sub,
8106 PyUnicode_GET_LENGTH(substring) *
8107 PyUnicode_CHARACTER_SIZE(substring));
8108 }
8109 /* otherwise we have to compare each character by first accesing it */
8110 else {
8111 /* We do not need to compare 0 and len(substring)-1 because
8112 the if statement above ensured already that they are equal
8113 when we end up here. */
8114 // TODO: honor direction and do a forward or backwards search
8115 for (i = 1; i < end_sub; ++i) {
8116 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8117 PyUnicode_READ(kind_sub, data_sub, i))
8118 return 0;
8119 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 }
8123
8124 return 0;
8125}
8126
Alexander Belopolsky40018472011-02-26 01:02:56 +00008127Py_ssize_t
8128PyUnicode_Tailmatch(PyObject *str,
8129 PyObject *substr,
8130 Py_ssize_t start,
8131 Py_ssize_t end,
8132 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008134 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008135
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 str = PyUnicode_FromObject(str);
8137 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 substr = PyUnicode_FromObject(substr);
8140 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 Py_DECREF(str);
8142 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 }
Tim Petersced69f82003-09-16 20:30:58 +00008144
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 (PyUnicodeObject *)substr,
8147 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 Py_DECREF(str);
8149 Py_DECREF(substr);
8150 return result;
8151}
8152
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153/* Apply fixfct filter to the Unicode object self and return a
8154 reference to the modified object */
8155
Alexander Belopolsky40018472011-02-26 01:02:56 +00008156static PyObject *
8157fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 PyObject *u;
8161 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 if (PyUnicode_READY(self) == -1)
8164 return NULL;
8165 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8166 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8167 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8172 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 /* fix functions return the new maximum character in a string,
8175 if the kind of the resulting unicode object does not change,
8176 everything is fine. Otherwise we need to change the string kind
8177 and re-run the fix function. */
8178 maxchar_new = fixfct((PyUnicodeObject*)u);
8179 if (maxchar_new == 0)
8180 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8181 else if (maxchar_new <= 127)
8182 maxchar_new = 127;
8183 else if (maxchar_new <= 255)
8184 maxchar_new = 255;
8185 else if (maxchar_new <= 65535)
8186 maxchar_new = 65535;
8187 else
8188 maxchar_new = 1114111; /* 0x10ffff */
8189
8190 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 /* fixfct should return TRUE if it modified the buffer. If
8192 FALSE, return a reference to the original buffer instead
8193 (to save space, not time) */
8194 Py_INCREF(self);
8195 Py_DECREF(u);
8196 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 else if (maxchar_new == maxchar_old) {
8199 return u;
8200 }
8201 else {
8202 /* In case the maximum character changed, we need to
8203 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008204 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 if (v == NULL) {
8206 Py_DECREF(u);
8207 return NULL;
8208 }
8209 if (maxchar_new > maxchar_old) {
8210 /* If the maxchar increased so that the kind changed, not all
8211 characters are representable anymore and we need to fix the
8212 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008213 if (PyUnicode_CopyCharacters(v, 0,
8214 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008215 PyUnicode_GET_LENGTH(self)) < 0)
8216 {
8217 Py_DECREF(u);
8218 return NULL;
8219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 maxchar_old = fixfct((PyUnicodeObject*)v);
8221 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8222 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008223 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008224 if (PyUnicode_CopyCharacters(v, 0,
8225 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008226 PyUnicode_GET_LENGTH(self)) < 0)
8227 {
8228 Py_DECREF(u);
8229 return NULL;
8230 }
8231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232
8233 Py_DECREF(u);
8234 return v;
8235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236}
8237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008239fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 /* No need to call PyUnicode_READY(self) because this function is only
8242 called as a callback from fixup() which does it already. */
8243 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8244 const int kind = PyUnicode_KIND(self);
8245 void *data = PyUnicode_DATA(self);
8246 int touched = 0;
8247 Py_UCS4 maxchar = 0;
8248 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 for (i = 0; i < len; ++i) {
8251 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8252 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8253 if (up != ch) {
8254 if (up > maxchar)
8255 maxchar = up;
8256 PyUnicode_WRITE(kind, data, i, up);
8257 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 else if (ch > maxchar)
8260 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
8262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (touched)
8264 return maxchar;
8265 else
8266 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267}
8268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008269static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8273 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8274 const int kind = PyUnicode_KIND(self);
8275 void *data = PyUnicode_DATA(self);
8276 int touched = 0;
8277 Py_UCS4 maxchar = 0;
8278 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280 for(i = 0; i < len; ++i) {
8281 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8282 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8283 if (lo != ch) {
8284 if (lo > maxchar)
8285 maxchar = lo;
8286 PyUnicode_WRITE(kind, data, i, lo);
8287 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 else if (ch > maxchar)
8290 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 }
8292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 if (touched)
8294 return maxchar;
8295 else
8296 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297}
8298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8303 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8304 const int kind = PyUnicode_KIND(self);
8305 void *data = PyUnicode_DATA(self);
8306 int touched = 0;
8307 Py_UCS4 maxchar = 0;
8308 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 for(i = 0; i < len; ++i) {
8311 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8312 Py_UCS4 nu = 0;
8313
8314 if (Py_UNICODE_ISUPPER(ch))
8315 nu = Py_UNICODE_TOLOWER(ch);
8316 else if (Py_UNICODE_ISLOWER(ch))
8317 nu = Py_UNICODE_TOUPPER(ch);
8318
8319 if (nu != 0) {
8320 if (nu > maxchar)
8321 maxchar = nu;
8322 PyUnicode_WRITE(kind, data, i, nu);
8323 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 else if (ch > maxchar)
8326 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 }
8328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 if (touched)
8330 return maxchar;
8331 else
8332 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333}
8334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008336fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8339 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8340 const int kind = PyUnicode_KIND(self);
8341 void *data = PyUnicode_DATA(self);
8342 int touched = 0;
8343 Py_UCS4 maxchar = 0;
8344 Py_ssize_t i = 0;
8345 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008346
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008347 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349
8350 ch = PyUnicode_READ(kind, data, i);
8351 if (!Py_UNICODE_ISUPPER(ch)) {
8352 maxchar = Py_UNICODE_TOUPPER(ch);
8353 PyUnicode_WRITE(kind, data, i, maxchar);
8354 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 ++i;
8357 for(; i < len; ++i) {
8358 ch = PyUnicode_READ(kind, data, i);
8359 if (!Py_UNICODE_ISLOWER(ch)) {
8360 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8361 if (lo > maxchar)
8362 maxchar = lo;
8363 PyUnicode_WRITE(kind, data, i, lo);
8364 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 else if (ch > maxchar)
8367 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369
8370 if (touched)
8371 return maxchar;
8372 else
8373 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374}
8375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008377fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8380 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8381 const int kind = PyUnicode_KIND(self);
8382 void *data = PyUnicode_DATA(self);
8383 Py_UCS4 maxchar = 0;
8384 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 int previous_is_cased;
8386
8387 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 if (len == 1) {
8389 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8390 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8391 if (ti != ch) {
8392 PyUnicode_WRITE(kind, data, i, ti);
8393 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 }
8395 else
8396 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 for(; i < len; ++i) {
8400 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8401 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008402
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 nu = Py_UNICODE_TOTITLE(ch);
8407
8408 if (nu > maxchar)
8409 maxchar = nu;
8410 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008411
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 if (Py_UNICODE_ISLOWER(ch) ||
8413 Py_UNICODE_ISUPPER(ch) ||
8414 Py_UNICODE_ISTITLE(ch))
8415 previous_is_cased = 1;
8416 else
8417 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420}
8421
Tim Peters8ce9f162004-08-27 01:49:32 +00008422PyObject *
8423PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008426 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008428 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008429 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8430 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008431 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 Py_ssize_t sz, i, res_offset;
8433 Py_UCS4 maxchar = 0;
8434 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435
Tim Peters05eba1f2004-08-27 21:32:02 +00008436 fseq = PySequence_Fast(seq, "");
8437 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008439 }
8440
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008441 /* NOTE: the following code can't call back into Python code,
8442 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008443 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008444
Tim Peters05eba1f2004-08-27 21:32:02 +00008445 seqlen = PySequence_Fast_GET_SIZE(fseq);
8446 /* If empty sequence, return u"". */
8447 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008450 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008451 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008452 /* If singleton sequence with an exact Unicode, return that. */
8453 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 item = items[0];
8455 if (PyUnicode_CheckExact(item)) {
8456 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 goto Done;
8459 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008460 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008461 else {
8462 /* Set up sep and seplen */
8463 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 /* fall back to a blank space separator */
8465 sep = PyUnicode_FromOrdinal(' ');
8466 if (!sep || PyUnicode_READY(sep) == -1)
8467 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008468 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008469 else {
8470 if (!PyUnicode_Check(separator)) {
8471 PyErr_Format(PyExc_TypeError,
8472 "separator: expected str instance,"
8473 " %.80s found",
8474 Py_TYPE(separator)->tp_name);
8475 goto onError;
8476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 if (PyUnicode_READY(separator) == -1)
8478 goto onError;
8479 sep = separator;
8480 seplen = PyUnicode_GET_LENGTH(separator);
8481 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8482 /* inc refcount to keep this code path symetric with the
8483 above case of a blank separator */
8484 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008485 }
8486 }
8487
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008488 /* There are at least two things to join, or else we have a subclass
8489 * of str in the sequence.
8490 * Do a pre-pass to figure out the total amount of space we'll
8491 * need (sz), and see whether all argument are strings.
8492 */
8493 sz = 0;
8494 for (i = 0; i < seqlen; i++) {
8495 const Py_ssize_t old_sz = sz;
8496 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 if (!PyUnicode_Check(item)) {
8498 PyErr_Format(PyExc_TypeError,
8499 "sequence item %zd: expected str instance,"
8500 " %.80s found",
8501 i, Py_TYPE(item)->tp_name);
8502 goto onError;
8503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 if (PyUnicode_READY(item) == -1)
8505 goto onError;
8506 sz += PyUnicode_GET_LENGTH(item);
8507 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8508 if (item_maxchar > maxchar)
8509 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008510 if (i != 0)
8511 sz += seplen;
8512 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8513 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008515 goto onError;
8516 }
8517 }
Tim Petersced69f82003-09-16 20:30:58 +00008518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008520 if (res == NULL)
8521 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008522
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008523 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008525 Py_ssize_t itemlen;
8526 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 /* Copy item, and maybe the separator. */
8529 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008530 if (PyUnicode_CopyCharacters(res, res_offset,
8531 sep, 0, seplen) < 0)
8532 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008535 if (PyUnicode_CopyCharacters(res, res_offset,
8536 item, 0, itemlen) < 0)
8537 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008541
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008543 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 Py_XDECREF(sep);
8545 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008548 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008550 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 return NULL;
8552}
8553
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer
   `data`, starting at index `start`.  `kind` selects the code unit
   width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, anything else is
   treated as 4-byte units; PyUnicode_WCHAR_KIND is rejected by an
   assertion.  `value` is truncated to the code unit type.

   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed safely.  Note that `value` and `length` are
   still evaluated once per iteration in the 2- and 4-byte cases, so they
   must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8576
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577static PyUnicodeObject *
8578pad(PyUnicodeObject *self,
8579 Py_ssize_t left,
8580 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 PyObject *u;
8584 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008585 int kind;
8586 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
8588 if (left < 0)
8589 left = 0;
8590 if (right < 0)
8591 right = 0;
8592
Tim Peters7a29bd52001-09-12 03:03:31 +00008593 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 Py_INCREF(self);
8595 return self;
8596 }
8597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8599 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008600 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8601 return NULL;
8602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8604 if (fill > maxchar)
8605 maxchar = fill;
8606 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008607 if (!u)
8608 return NULL;
8609
8610 kind = PyUnicode_KIND(u);
8611 data = PyUnicode_DATA(u);
8612 if (left)
8613 FILL(kind, data, fill, 0, left);
8614 if (right)
8615 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008616 if (PyUnicode_CopyCharacters(u, left,
8617 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008618 _PyUnicode_LENGTH(self)) < 0)
8619 {
8620 Py_DECREF(u);
8621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
8623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628PyObject *
8629PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632
8633 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 switch(PyUnicode_KIND(string)) {
8638 case PyUnicode_1BYTE_KIND:
8639 list = ucs1lib_splitlines(
8640 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8641 PyUnicode_GET_LENGTH(string), keepends);
8642 break;
8643 case PyUnicode_2BYTE_KIND:
8644 list = ucs2lib_splitlines(
8645 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8646 PyUnicode_GET_LENGTH(string), keepends);
8647 break;
8648 case PyUnicode_4BYTE_KIND:
8649 list = ucs4lib_splitlines(
8650 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8651 PyUnicode_GET_LENGTH(string), keepends);
8652 break;
8653 default:
8654 assert(0);
8655 list = 0;
8656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 Py_DECREF(string);
8658 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659}
8660
Alexander Belopolsky40018472011-02-26 01:02:56 +00008661static PyObject *
8662split(PyUnicodeObject *self,
8663 PyUnicodeObject *substring,
8664 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 int kind1, kind2, kind;
8667 void *buf1, *buf2;
8668 Py_ssize_t len1, len2;
8669 PyObject* out;
8670
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008672 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 if (PyUnicode_READY(self) == -1)
8675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 if (substring == NULL)
8678 switch(PyUnicode_KIND(self)) {
8679 case PyUnicode_1BYTE_KIND:
8680 return ucs1lib_split_whitespace(
8681 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8682 PyUnicode_GET_LENGTH(self), maxcount
8683 );
8684 case PyUnicode_2BYTE_KIND:
8685 return ucs2lib_split_whitespace(
8686 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8687 PyUnicode_GET_LENGTH(self), maxcount
8688 );
8689 case PyUnicode_4BYTE_KIND:
8690 return ucs4lib_split_whitespace(
8691 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8692 PyUnicode_GET_LENGTH(self), maxcount
8693 );
8694 default:
8695 assert(0);
8696 return NULL;
8697 }
8698
8699 if (PyUnicode_READY(substring) == -1)
8700 return NULL;
8701
8702 kind1 = PyUnicode_KIND(self);
8703 kind2 = PyUnicode_KIND(substring);
8704 kind = kind1 > kind2 ? kind1 : kind2;
8705 buf1 = PyUnicode_DATA(self);
8706 buf2 = PyUnicode_DATA(substring);
8707 if (kind1 != kind)
8708 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8709 if (!buf1)
8710 return NULL;
8711 if (kind2 != kind)
8712 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8713 if (!buf2) {
8714 if (kind1 != kind) PyMem_Free(buf1);
8715 return NULL;
8716 }
8717 len1 = PyUnicode_GET_LENGTH(self);
8718 len2 = PyUnicode_GET_LENGTH(substring);
8719
8720 switch(kind) {
8721 case PyUnicode_1BYTE_KIND:
8722 out = ucs1lib_split(
8723 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8724 break;
8725 case PyUnicode_2BYTE_KIND:
8726 out = ucs2lib_split(
8727 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8728 break;
8729 case PyUnicode_4BYTE_KIND:
8730 out = ucs4lib_split(
8731 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8732 break;
8733 default:
8734 out = NULL;
8735 }
8736 if (kind1 != kind)
8737 PyMem_Free(buf1);
8738 if (kind2 != kind)
8739 PyMem_Free(buf2);
8740 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741}
8742
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743static PyObject *
8744rsplit(PyUnicodeObject *self,
8745 PyUnicodeObject *substring,
8746 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 int kind1, kind2, kind;
8749 void *buf1, *buf2;
8750 Py_ssize_t len1, len2;
8751 PyObject* out;
8752
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008753 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008754 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (PyUnicode_READY(self) == -1)
8757 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 if (substring == NULL)
8760 switch(PyUnicode_KIND(self)) {
8761 case PyUnicode_1BYTE_KIND:
8762 return ucs1lib_rsplit_whitespace(
8763 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8764 PyUnicode_GET_LENGTH(self), maxcount
8765 );
8766 case PyUnicode_2BYTE_KIND:
8767 return ucs2lib_rsplit_whitespace(
8768 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8769 PyUnicode_GET_LENGTH(self), maxcount
8770 );
8771 case PyUnicode_4BYTE_KIND:
8772 return ucs4lib_rsplit_whitespace(
8773 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8774 PyUnicode_GET_LENGTH(self), maxcount
8775 );
8776 default:
8777 assert(0);
8778 return NULL;
8779 }
8780
8781 if (PyUnicode_READY(substring) == -1)
8782 return NULL;
8783
8784 kind1 = PyUnicode_KIND(self);
8785 kind2 = PyUnicode_KIND(substring);
8786 kind = kind1 > kind2 ? kind1 : kind2;
8787 buf1 = PyUnicode_DATA(self);
8788 buf2 = PyUnicode_DATA(substring);
8789 if (kind1 != kind)
8790 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8791 if (!buf1)
8792 return NULL;
8793 if (kind2 != kind)
8794 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8795 if (!buf2) {
8796 if (kind1 != kind) PyMem_Free(buf1);
8797 return NULL;
8798 }
8799 len1 = PyUnicode_GET_LENGTH(self);
8800 len2 = PyUnicode_GET_LENGTH(substring);
8801
8802 switch(kind) {
8803 case PyUnicode_1BYTE_KIND:
8804 out = ucs1lib_rsplit(
8805 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8806 break;
8807 case PyUnicode_2BYTE_KIND:
8808 out = ucs2lib_rsplit(
8809 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8810 break;
8811 case PyUnicode_4BYTE_KIND:
8812 out = ucs4lib_rsplit(
8813 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8814 break;
8815 default:
8816 out = NULL;
8817 }
8818 if (kind1 != kind)
8819 PyMem_Free(buf1);
8820 if (kind2 != kind)
8821 PyMem_Free(buf2);
8822 return out;
8823}
8824
8825static Py_ssize_t
8826anylib_find(int kind, void *buf1, Py_ssize_t len1,
8827 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8828{
8829 switch(kind) {
8830 case PyUnicode_1BYTE_KIND:
8831 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8832 case PyUnicode_2BYTE_KIND:
8833 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8834 case PyUnicode_4BYTE_KIND:
8835 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8836 }
8837 assert(0);
8838 return -1;
8839}
8840
8841static Py_ssize_t
8842anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8843 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8844{
8845 switch(kind) {
8846 case PyUnicode_1BYTE_KIND:
8847 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8848 case PyUnicode_2BYTE_KIND:
8849 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8850 case PyUnicode_4BYTE_KIND:
8851 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8852 }
8853 assert(0);
8854 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008855}
8856
Alexander Belopolsky40018472011-02-26 01:02:56 +00008857static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858replace(PyObject *self, PyObject *str1,
8859 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 PyObject *u;
8862 char *sbuf = PyUnicode_DATA(self);
8863 char *buf1 = PyUnicode_DATA(str1);
8864 char *buf2 = PyUnicode_DATA(str2);
8865 int srelease = 0, release1 = 0, release2 = 0;
8866 int skind = PyUnicode_KIND(self);
8867 int kind1 = PyUnicode_KIND(str1);
8868 int kind2 = PyUnicode_KIND(str2);
8869 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8870 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8871 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872
8873 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008876 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 if (skind < kind1)
8879 /* substring too wide to be present */
8880 goto nothing;
8881
8882 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008883 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008884 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008886 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008888 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 Py_UCS4 u1, u2, maxchar;
8890 int mayshrink, rkind;
8891 u1 = PyUnicode_READ_CHAR(str1, 0);
8892 if (!findchar(sbuf, PyUnicode_KIND(self),
8893 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008894 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 u2 = PyUnicode_READ_CHAR(str2, 0);
8896 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8897 /* Replacing u1 with u2 may cause a maxchar reduction in the
8898 result string. */
8899 mayshrink = maxchar > 127;
8900 if (u2 > maxchar) {
8901 maxchar = u2;
8902 mayshrink = 0;
8903 }
8904 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008905 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008907 if (PyUnicode_CopyCharacters(u, 0,
8908 (PyObject*)self, 0, slen) < 0)
8909 {
8910 Py_DECREF(u);
8911 return NULL;
8912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 rkind = PyUnicode_KIND(u);
8914 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8915 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008916 if (--maxcount < 0)
8917 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 if (mayshrink) {
8921 PyObject *tmp = u;
8922 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8923 PyUnicode_GET_LENGTH(tmp));
8924 Py_DECREF(tmp);
8925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 int rkind = skind;
8928 char *res;
8929 if (kind1 < rkind) {
8930 /* widen substring */
8931 buf1 = _PyUnicode_AsKind(str1, rkind);
8932 if (!buf1) goto error;
8933 release1 = 1;
8934 }
8935 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008936 if (i < 0)
8937 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 if (rkind > kind2) {
8939 /* widen replacement */
8940 buf2 = _PyUnicode_AsKind(str2, rkind);
8941 if (!buf2) goto error;
8942 release2 = 1;
8943 }
8944 else if (rkind < kind2) {
8945 /* widen self and buf1 */
8946 rkind = kind2;
8947 if (release1) PyMem_Free(buf1);
8948 sbuf = _PyUnicode_AsKind(self, rkind);
8949 if (!sbuf) goto error;
8950 srelease = 1;
8951 buf1 = _PyUnicode_AsKind(str1, rkind);
8952 if (!buf1) goto error;
8953 release1 = 1;
8954 }
8955 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8956 if (!res) {
8957 PyErr_NoMemory();
8958 goto error;
8959 }
8960 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008961 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8963 buf2,
8964 PyUnicode_KIND_SIZE(rkind, len2));
8965 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008966
8967 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8969 slen-i,
8970 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008971 if (i == -1)
8972 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8974 buf2,
8975 PyUnicode_KIND_SIZE(rkind, len2));
8976 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978
8979 u = PyUnicode_FromKindAndData(rkind, res, slen);
8980 PyMem_Free(res);
8981 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 Py_ssize_t n, i, j, ires;
8986 Py_ssize_t product, new_size;
8987 int rkind = skind;
8988 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 if (kind1 < rkind) {
8991 buf1 = _PyUnicode_AsKind(str1, rkind);
8992 if (!buf1) goto error;
8993 release1 = 1;
8994 }
8995 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996 if (n == 0)
8997 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 if (kind2 < rkind) {
8999 buf2 = _PyUnicode_AsKind(str2, rkind);
9000 if (!buf2) goto error;
9001 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 else if (kind2 > rkind) {
9004 rkind = kind2;
9005 sbuf = _PyUnicode_AsKind(self, rkind);
9006 if (!sbuf) goto error;
9007 srelease = 1;
9008 if (release1) PyMem_Free(buf1);
9009 buf1 = _PyUnicode_AsKind(str1, rkind);
9010 if (!buf1) goto error;
9011 release1 = 1;
9012 }
9013 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9014 PyUnicode_GET_LENGTH(str1))); */
9015 product = n * (len2-len1);
9016 if ((product / (len2-len1)) != n) {
9017 PyErr_SetString(PyExc_OverflowError,
9018 "replace string is too long");
9019 goto error;
9020 }
9021 new_size = slen + product;
9022 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9023 PyErr_SetString(PyExc_OverflowError,
9024 "replace string is too long");
9025 goto error;
9026 }
9027 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9028 if (!res)
9029 goto error;
9030 ires = i = 0;
9031 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009032 while (n-- > 0) {
9033 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 j = anylib_find(rkind,
9035 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9036 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009037 if (j == -1)
9038 break;
9039 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9042 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9043 PyUnicode_KIND_SIZE(rkind, j-i));
9044 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009045 }
9046 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (len2 > 0) {
9048 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9049 buf2,
9050 PyUnicode_KIND_SIZE(rkind, len2));
9051 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009056 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9058 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9059 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009060 } else {
9061 /* interleave */
9062 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9064 buf2,
9065 PyUnicode_KIND_SIZE(rkind, len2));
9066 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009067 if (--n <= 0)
9068 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9070 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9071 PyUnicode_KIND_SIZE(rkind, 1));
9072 ires++;
9073 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9076 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9077 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 if (srelease)
9082 PyMem_FREE(sbuf);
9083 if (release1)
9084 PyMem_FREE(buf1);
9085 if (release2)
9086 PyMem_FREE(buf2);
9087 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009088
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009090 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 if (srelease)
9092 PyMem_FREE(sbuf);
9093 if (release1)
9094 PyMem_FREE(buf1);
9095 if (release2)
9096 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009097 if (PyUnicode_CheckExact(self)) {
9098 Py_INCREF(self);
9099 return (PyObject *) self;
9100 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009101 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 error:
9103 if (srelease && sbuf)
9104 PyMem_FREE(sbuf);
9105 if (release1 && buf1)
9106 PyMem_FREE(buf1);
9107 if (release2 && buf2)
9108 PyMem_FREE(buf2);
9109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110}
9111
9112/* --- Unicode Object Methods --------------------------------------------- */
9113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009114PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116\n\
9117Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009118characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119
9120static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009121unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 return fixup(self, fixtitle);
9124}
9125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009126PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128\n\
9129Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009130have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131
9132static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009133unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135 return fixup(self, fixcapitalize);
9136}
9137
9138#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009139PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141\n\
9142Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009143normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144
9145static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009146unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147{
9148 PyObject *list;
9149 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009150 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 /* Split into words */
9153 list = split(self, NULL, -1);
9154 if (!list)
9155 return NULL;
9156
9157 /* Capitalize each word */
9158 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9159 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 if (item == NULL)
9162 goto onError;
9163 Py_DECREF(PyList_GET_ITEM(list, i));
9164 PyList_SET_ITEM(list, i, item);
9165 }
9166
9167 /* Join the words to form a new string */
9168 item = PyUnicode_Join(NULL, list);
9169
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171 Py_DECREF(list);
9172 return (PyObject *)item;
9173}
9174#endif
9175
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009176/* Argument converter. Coerces to a single unicode character */
9177
9178static int
9179convert_uc(PyObject *obj, void *addr)
9180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009182 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009183
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 uniobj = PyUnicode_FromObject(obj);
9185 if (uniobj == NULL) {
9186 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009188 return 0;
9189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009193 Py_DECREF(uniobj);
9194 return 0;
9195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 if (PyUnicode_READY(uniobj)) {
9197 Py_DECREF(uniobj);
9198 return 0;
9199 }
9200 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009201 Py_DECREF(uniobj);
9202 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009203}
9204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009205PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009208Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009209done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210
9211static PyObject *
9212unicode_center(PyUnicodeObject *self, PyObject *args)
9213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009214 Py_ssize_t marg, left;
9215 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 Py_UCS4 fillchar = ' ';
9217
9218 if (PyUnicode_READY(self) == -1)
9219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220
Thomas Woutersde017742006-02-16 19:34:37 +00009221 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 return NULL;
9223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 Py_INCREF(self);
9226 return (PyObject*) self;
9227 }
9228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 left = marg / 2 + (marg & width & 1);
9231
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009232 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233}
9234
Marc-André Lemburge5034372000-08-08 08:04:29 +00009235#if 0
9236
9237/* This code should go into some future Unicode collation support
9238 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009239 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009240
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009241/* speedy UTF-16 code point order comparison */
9242/* gleaned from: */
9243/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9244
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009245static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009246{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009247 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009248 0, 0, 0, 0, 0, 0, 0, 0,
9249 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009250 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009251};
9252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253static int
9254unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009256 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 Py_UNICODE *s1 = str1->str;
9259 Py_UNICODE *s2 = str2->str;
9260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 len1 = str1->_base._base.length;
9262 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009263
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009265 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009266
9267 c1 = *s1++;
9268 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009269
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 if (c1 > (1<<11) * 26)
9271 c1 += utf16Fixup[c1>>11];
9272 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009273 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009274 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009275
9276 if (c1 != c2)
9277 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009278
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009279 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 }
9281
9282 return (len1 < len2) ? -1 : (len1 != len2);
9283}
9284
Marc-André Lemburge5034372000-08-08 08:04:29 +00009285#else
9286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287/* This function assumes that str1 and str2 are readied by the caller. */
9288
Marc-André Lemburge5034372000-08-08 08:04:29 +00009289static int
9290unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 int kind1, kind2;
9293 void *data1, *data2;
9294 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 kind1 = PyUnicode_KIND(str1);
9297 kind2 = PyUnicode_KIND(str2);
9298 data1 = PyUnicode_DATA(str1);
9299 data2 = PyUnicode_DATA(str2);
9300 len1 = PyUnicode_GET_LENGTH(str1);
9301 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 for (i = 0; i < len1 && i < len2; ++i) {
9304 Py_UCS4 c1, c2;
9305 c1 = PyUnicode_READ(kind1, data1, i);
9306 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009307
9308 if (c1 != c2)
9309 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009310 }
9311
9312 return (len1 < len2) ? -1 : (len1 != len2);
9313}
9314
9315#endif
9316
Alexander Belopolsky40018472011-02-26 01:02:56 +00009317int
9318PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9321 if (PyUnicode_READY(left) == -1 ||
9322 PyUnicode_READY(right) == -1)
9323 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009324 return unicode_compare((PyUnicodeObject *)left,
9325 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009327 PyErr_Format(PyExc_TypeError,
9328 "Can't compare %.100s and %.100s",
9329 left->ob_type->tp_name,
9330 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331 return -1;
9332}
9333
Martin v. Löwis5b222132007-06-10 09:51:05 +00009334int
9335PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 Py_ssize_t i;
9338 int kind;
9339 void *data;
9340 Py_UCS4 chr;
9341
Martin v. Löwis5b222132007-06-10 09:51:05 +00009342 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 if (PyUnicode_READY(uni) == -1)
9344 return -1;
9345 kind = PyUnicode_KIND(uni);
9346 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009347 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9349 if (chr != str[i])
9350 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009351 /* This check keeps Python strings that end in '\0' from comparing equal
9352 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009355 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009357 return 0;
9358}
9359
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009360
Benjamin Peterson29060642009-01-31 22:14:21 +00009361#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009362 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009363
Alexander Belopolsky40018472011-02-26 01:02:56 +00009364PyObject *
9365PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009366{
9367 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009369 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9370 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (PyUnicode_READY(left) == -1 ||
9372 PyUnicode_READY(right) == -1)
9373 return NULL;
9374 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9375 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009376 if (op == Py_EQ) {
9377 Py_INCREF(Py_False);
9378 return Py_False;
9379 }
9380 if (op == Py_NE) {
9381 Py_INCREF(Py_True);
9382 return Py_True;
9383 }
9384 }
9385 if (left == right)
9386 result = 0;
9387 else
9388 result = unicode_compare((PyUnicodeObject *)left,
9389 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009390
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009391 /* Convert the return value to a Boolean */
9392 switch (op) {
9393 case Py_EQ:
9394 v = TEST_COND(result == 0);
9395 break;
9396 case Py_NE:
9397 v = TEST_COND(result != 0);
9398 break;
9399 case Py_LE:
9400 v = TEST_COND(result <= 0);
9401 break;
9402 case Py_GE:
9403 v = TEST_COND(result >= 0);
9404 break;
9405 case Py_LT:
9406 v = TEST_COND(result == -1);
9407 break;
9408 case Py_GT:
9409 v = TEST_COND(result == 1);
9410 break;
9411 default:
9412 PyErr_BadArgument();
9413 return NULL;
9414 }
9415 Py_INCREF(v);
9416 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009418
Brian Curtindfc80e32011-08-10 20:28:54 -05009419 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009420}
9421
Alexander Belopolsky40018472011-02-26 01:02:56 +00009422int
9423PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009424{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 int kind1, kind2, kind;
9427 void *buf1, *buf2;
9428 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009429 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009430
9431 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432 sub = PyUnicode_FromObject(element);
9433 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 PyErr_Format(PyExc_TypeError,
9435 "'in <string>' requires string as left operand, not %s",
9436 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009437 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 if (PyUnicode_READY(sub) == -1)
9440 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009441
Thomas Wouters477c8d52006-05-27 19:21:47 +00009442 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009444 Py_DECREF(sub);
9445 return -1;
9446 }
9447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 kind1 = PyUnicode_KIND(str);
9449 kind2 = PyUnicode_KIND(sub);
9450 kind = kind1 > kind2 ? kind1 : kind2;
9451 buf1 = PyUnicode_DATA(str);
9452 buf2 = PyUnicode_DATA(sub);
9453 if (kind1 != kind)
9454 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9455 if (!buf1) {
9456 Py_DECREF(sub);
9457 return -1;
9458 }
9459 if (kind2 != kind)
9460 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9461 if (!buf2) {
9462 Py_DECREF(sub);
9463 if (kind1 != kind) PyMem_Free(buf1);
9464 return -1;
9465 }
9466 len1 = PyUnicode_GET_LENGTH(str);
9467 len2 = PyUnicode_GET_LENGTH(sub);
9468
9469 switch(kind) {
9470 case PyUnicode_1BYTE_KIND:
9471 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9472 break;
9473 case PyUnicode_2BYTE_KIND:
9474 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9475 break;
9476 case PyUnicode_4BYTE_KIND:
9477 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9478 break;
9479 default:
9480 result = -1;
9481 assert(0);
9482 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483
9484 Py_DECREF(str);
9485 Py_DECREF(sub);
9486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 if (kind1 != kind)
9488 PyMem_Free(buf1);
9489 if (kind2 != kind)
9490 PyMem_Free(buf2);
9491
Guido van Rossum403d68b2000-03-13 15:55:09 +00009492 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009493}
9494
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495/* Concat to string or Unicode object giving a new Unicode object. */
9496
Alexander Belopolsky40018472011-02-26 01:02:56 +00009497PyObject *
9498PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 PyObject *u = NULL, *v = NULL, *w;
9501 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502
9503 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510
9511 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 }
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9522 goto onError;
9523
9524 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009525 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 w = PyUnicode_New(
9529 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9530 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009533 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9534 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009535 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009536 v, 0,
9537 PyUnicode_GET_LENGTH(v)) < 0)
9538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 Py_DECREF(u);
9540 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 Py_XDECREF(u);
9545 Py_XDECREF(v);
9546 return NULL;
9547}
9548
Walter Dörwald1ab83302007-05-18 17:15:44 +00009549void
9550PyUnicode_Append(PyObject **pleft, PyObject *right)
9551{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009552 PyObject *new;
9553 if (*pleft == NULL)
9554 return;
9555 if (right == NULL || !PyUnicode_Check(*pleft)) {
9556 Py_DECREF(*pleft);
9557 *pleft = NULL;
9558 return;
9559 }
9560 new = PyUnicode_Concat(*pleft, right);
9561 Py_DECREF(*pleft);
9562 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009563}
9564
9565void
9566PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9567{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009568 PyUnicode_Append(pleft, right);
9569 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009570}
9571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009572PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009575Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009576string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009577interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578
9579static PyObject *
9580unicode_count(PyUnicodeObject *self, PyObject *args)
9581{
9582 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009583 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009584 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 int kind1, kind2, kind;
9587 void *buf1, *buf2;
9588 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589
Jesus Ceaac451502011-04-20 17:09:23 +02009590 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9591 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 kind1 = PyUnicode_KIND(self);
9595 kind2 = PyUnicode_KIND(substring);
9596 kind = kind1 > kind2 ? kind1 : kind2;
9597 buf1 = PyUnicode_DATA(self);
9598 buf2 = PyUnicode_DATA(substring);
9599 if (kind1 != kind)
9600 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9601 if (!buf1) {
9602 Py_DECREF(substring);
9603 return NULL;
9604 }
9605 if (kind2 != kind)
9606 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9607 if (!buf2) {
9608 Py_DECREF(substring);
9609 if (kind1 != kind) PyMem_Free(buf1);
9610 return NULL;
9611 }
9612 len1 = PyUnicode_GET_LENGTH(self);
9613 len2 = PyUnicode_GET_LENGTH(substring);
9614
9615 ADJUST_INDICES(start, end, len1);
9616 switch(kind) {
9617 case PyUnicode_1BYTE_KIND:
9618 iresult = ucs1lib_count(
9619 ((Py_UCS1*)buf1) + start, end - start,
9620 buf2, len2, PY_SSIZE_T_MAX
9621 );
9622 break;
9623 case PyUnicode_2BYTE_KIND:
9624 iresult = ucs2lib_count(
9625 ((Py_UCS2*)buf1) + start, end - start,
9626 buf2, len2, PY_SSIZE_T_MAX
9627 );
9628 break;
9629 case PyUnicode_4BYTE_KIND:
9630 iresult = ucs4lib_count(
9631 ((Py_UCS4*)buf1) + start, end - start,
9632 buf2, len2, PY_SSIZE_T_MAX
9633 );
9634 break;
9635 default:
9636 assert(0); iresult = 0;
9637 }
9638
9639 result = PyLong_FromSsize_t(iresult);
9640
9641 if (kind1 != kind)
9642 PyMem_Free(buf1);
9643 if (kind2 != kind)
9644 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645
9646 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009647
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 return result;
9649}
9650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009651PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009652 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009654Encode S using the codec registered for encoding. Default encoding\n\
9655is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009656handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009657a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9658'xmlcharrefreplace' as well as any other name registered with\n\
9659codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660
9661static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009662unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009664 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 char *encoding = NULL;
9666 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009667
Benjamin Peterson308d6372009-09-18 21:42:35 +00009668 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9669 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009671 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009672}
9673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009674PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676\n\
9677Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009678If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679
9680static PyObject*
9681unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9682{
9683 Py_UNICODE *e;
9684 Py_UNICODE *p;
9685 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009686 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 PyUnicodeObject *u;
9689 int tabsize = 8;
9690
9691 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9695 return NULL;
9696
Thomas Wouters7e474022000-07-16 12:04:32 +00009697 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009698 i = 0; /* chars up to and including most recent \n or \r */
9699 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9701 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 if (tabsize > 0) {
9704 incr = tabsize - (j % tabsize); /* cannot overflow */
9705 if (j > PY_SSIZE_T_MAX - incr)
9706 goto overflow1;
9707 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009708 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 if (j > PY_SSIZE_T_MAX - 1)
9712 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 j++;
9714 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 if (i > PY_SSIZE_T_MAX - j)
9716 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009718 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719 }
9720 }
9721
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009722 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009724
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 /* Second pass: create output string and fill it */
9726 u = _PyUnicode_New(i + j);
9727 if (!u)
9728 return NULL;
9729
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009730 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 q = _PyUnicode_WSTR(u); /* next output char */
9732 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 if (tabsize > 0) {
9737 i = tabsize - (j % tabsize);
9738 j += i;
9739 while (i--) {
9740 if (q >= qe)
9741 goto overflow2;
9742 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 else {
9747 if (q >= qe)
9748 goto overflow2;
9749 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009750 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 if (*p == '\n' || *p == '\r')
9752 j = 0;
9753 }
9754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 if (PyUnicode_READY(u) == -1) {
9756 Py_DECREF(u);
9757 return NULL;
9758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009760
9761 overflow2:
9762 Py_DECREF(u);
9763 overflow1:
9764 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009768PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770\n\
9771Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009772such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773arguments start and end are interpreted as in slice notation.\n\
9774\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009775Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
9777static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779{
Jesus Ceaac451502011-04-20 17:09:23 +02009780 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009781 Py_ssize_t start;
9782 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009783 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784
Jesus Ceaac451502011-04-20 17:09:23 +02009785 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9786 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (PyUnicode_READY(self) == -1)
9790 return NULL;
9791 if (PyUnicode_READY(substring) == -1)
9792 return NULL;
9793
9794 result = any_find_slice(
9795 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9796 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009797 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798
9799 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 if (result == -2)
9802 return NULL;
9803
Christian Heimes217cfd12007-12-02 14:31:20 +00009804 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805}
9806
9807static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009808unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 Py_UCS4 ch;
9811
9812 if (PyUnicode_READY(self) == -1)
9813 return NULL;
9814 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 PyErr_SetString(PyExc_IndexError, "string index out of range");
9816 return NULL;
9817 }
9818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9820 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821}
9822
Guido van Rossumc2504932007-09-18 19:42:40 +00009823/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009824 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009825static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009826unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827{
Guido van Rossumc2504932007-09-18 19:42:40 +00009828 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009829 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 if (_PyUnicode_HASH(self) != -1)
9832 return _PyUnicode_HASH(self);
9833 if (PyUnicode_READY(self) == -1)
9834 return -1;
9835 len = PyUnicode_GET_LENGTH(self);
9836
9837 /* The hash function as a macro, gets expanded three times below. */
9838#define HASH(P) \
9839 x = (Py_uhash_t)*P << 7; \
9840 while (--len >= 0) \
9841 x = (1000003*x) ^ (Py_uhash_t)*P++;
9842
9843 switch (PyUnicode_KIND(self)) {
9844 case PyUnicode_1BYTE_KIND: {
9845 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9846 HASH(c);
9847 break;
9848 }
9849 case PyUnicode_2BYTE_KIND: {
9850 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9851 HASH(s);
9852 break;
9853 }
9854 default: {
9855 Py_UCS4 *l;
9856 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9857 "Impossible switch case in unicode_hash");
9858 l = PyUnicode_4BYTE_DATA(self);
9859 HASH(l);
9860 break;
9861 }
9862 }
9863 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9864
Guido van Rossumc2504932007-09-18 19:42:40 +00009865 if (x == -1)
9866 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009868 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009872PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
9877static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009880 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009881 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009882 Py_ssize_t start;
9883 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884
Jesus Ceaac451502011-04-20 17:09:23 +02009885 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9886 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (PyUnicode_READY(self) == -1)
9890 return NULL;
9891 if (PyUnicode_READY(substring) == -1)
9892 return NULL;
9893
9894 result = any_find_slice(
9895 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9896 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009897 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898
9899 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (result == -2)
9902 return NULL;
9903
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 if (result < 0) {
9905 PyErr_SetString(PyExc_ValueError, "substring not found");
9906 return NULL;
9907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009908
Christian Heimes217cfd12007-12-02 14:31:20 +00009909 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910}
9911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009912PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009915Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009916at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917
9918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009919unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 Py_ssize_t i, length;
9922 int kind;
9923 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 int cased;
9925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 if (PyUnicode_READY(self) == -1)
9927 return NULL;
9928 length = PyUnicode_GET_LENGTH(self);
9929 kind = PyUnicode_KIND(self);
9930 data = PyUnicode_DATA(self);
9931
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 if (length == 1)
9934 return PyBool_FromLong(
9935 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009937 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009940
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 for (i = 0; i < length; i++) {
9943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009944
Benjamin Peterson29060642009-01-31 22:14:21 +00009945 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9946 return PyBool_FromLong(0);
9947 else if (!cased && Py_UNICODE_ISLOWER(ch))
9948 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009950 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951}
9952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009953PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009954 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009956Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009957at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958
9959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009960unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 Py_ssize_t i, length;
9963 int kind;
9964 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 int cased;
9966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (PyUnicode_READY(self) == -1)
9968 return NULL;
9969 length = PyUnicode_GET_LENGTH(self);
9970 kind = PyUnicode_KIND(self);
9971 data = PyUnicode_DATA(self);
9972
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 if (length == 1)
9975 return PyBool_FromLong(
9976 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009978 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009981
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 for (i = 0; i < length; i++) {
9984 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009985
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9987 return PyBool_FromLong(0);
9988 else if (!cased && Py_UNICODE_ISUPPER(ch))
9989 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009991 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992}
9993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009994PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009997Return True if S is a titlecased string and there is at least one\n\
9998character in S, i.e. upper- and titlecase characters may only\n\
9999follow uncased characters and lowercase characters only cased ones.\n\
10000Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
10002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010003unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 Py_ssize_t i, length;
10006 int kind;
10007 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 int cased, previous_is_cased;
10009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (PyUnicode_READY(self) == -1)
10011 return NULL;
10012 length = PyUnicode_GET_LENGTH(self);
10013 kind = PyUnicode_KIND(self);
10014 data = PyUnicode_DATA(self);
10015
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (length == 1) {
10018 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10019 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10020 (Py_UNICODE_ISUPPER(ch) != 0));
10021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010023 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010026
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 cased = 0;
10028 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 for (i = 0; i < length; i++) {
10030 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010031
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10033 if (previous_is_cased)
10034 return PyBool_FromLong(0);
10035 previous_is_cased = 1;
10036 cased = 1;
10037 }
10038 else if (Py_UNICODE_ISLOWER(ch)) {
10039 if (!previous_is_cased)
10040 return PyBool_FromLong(0);
10041 previous_is_cased = 1;
10042 cased = 1;
10043 }
10044 else
10045 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010047 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048}
10049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010050PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010053Return True if all characters in S are whitespace\n\
10054and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055
10056static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010057unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 Py_ssize_t i, length;
10060 int kind;
10061 void *data;
10062
10063 if (PyUnicode_READY(self) == -1)
10064 return NULL;
10065 length = PyUnicode_GET_LENGTH(self);
10066 kind = PyUnicode_KIND(self);
10067 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (length == 1)
10071 return PyBool_FromLong(
10072 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010074 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 for (i = 0; i < length; i++) {
10079 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010080 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010083 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084}
10085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010086PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010089Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010090and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010091
10092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010093unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 Py_ssize_t i, length;
10096 int kind;
10097 void *data;
10098
10099 if (PyUnicode_READY(self) == -1)
10100 return NULL;
10101 length = PyUnicode_GET_LENGTH(self);
10102 kind = PyUnicode_KIND(self);
10103 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010105 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (length == 1)
10107 return PyBool_FromLong(
10108 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010109
10110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 for (i = 0; i < length; i++) {
10115 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010118 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010119}
10120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010121PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010122 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010124Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010125and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126
10127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010128unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 int kind;
10131 void *data;
10132 Py_ssize_t len, i;
10133
10134 if (PyUnicode_READY(self) == -1)
10135 return NULL;
10136
10137 kind = PyUnicode_KIND(self);
10138 data = PyUnicode_DATA(self);
10139 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010140
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010141 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (len == 1) {
10143 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10144 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10145 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010146
10147 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010149 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 for (i = 0; i < len; i++) {
10152 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010153 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010154 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010155 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010156 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010157}
10158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010162Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010163False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010166unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t i, length;
10169 int kind;
10170 void *data;
10171
10172 if (PyUnicode_READY(self) == -1)
10173 return NULL;
10174 length = PyUnicode_GET_LENGTH(self);
10175 kind = PyUnicode_KIND(self);
10176 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (length == 1)
10180 return PyBool_FromLong(
10181 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010183 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 for (i = 0; i < length; i++) {
10188 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010191 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192}
10193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010194PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010197Return True if all characters in S are digits\n\
10198and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
10200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010201unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 Py_ssize_t i, length;
10204 int kind;
10205 void *data;
10206
10207 if (PyUnicode_READY(self) == -1)
10208 return NULL;
10209 length = PyUnicode_GET_LENGTH(self);
10210 kind = PyUnicode_KIND(self);
10211 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (length == 1) {
10215 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10216 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 for (i = 0; i < length; i++) {
10224 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010227 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010233Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010234False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
10236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010237unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 Py_ssize_t i, length;
10240 int kind;
10241 void *data;
10242
10243 if (PyUnicode_READY(self) == -1)
10244 return NULL;
10245 length = PyUnicode_GET_LENGTH(self);
10246 kind = PyUnicode_KIND(self);
10247 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (length == 1)
10251 return PyBool_FromLong(
10252 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010254 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 for (i = 0; i < length; i++) {
10259 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010262 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263}
10264
Martin v. Löwis47383402007-08-15 07:32:56 +000010265int
10266PyUnicode_IsIdentifier(PyObject *self)
10267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 int kind;
10269 void *data;
10270 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010271 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 if (PyUnicode_READY(self) == -1) {
10274 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010275 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 }
10277
10278 /* Special case for empty strings */
10279 if (PyUnicode_GET_LENGTH(self) == 0)
10280 return 0;
10281 kind = PyUnicode_KIND(self);
10282 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010283
10284 /* PEP 3131 says that the first character must be in
10285 XID_Start and subsequent characters in XID_Continue,
10286 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010287 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010288 letters, digits, underscore). However, given the current
10289 definition of XID_Start and XID_Continue, it is sufficient
10290 to check just for these, except that _ must be allowed
10291 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010293 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010294 return 0;
10295
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010296 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010299 return 1;
10300}
10301
10302PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010304\n\
10305Return True if S is a valid identifier according\n\
10306to the language definition.");
10307
10308static PyObject*
10309unicode_isidentifier(PyObject *self)
10310{
10311 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10312}
10313
Georg Brandl559e5d72008-06-11 18:37:52 +000010314PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010316\n\
10317Return True if all characters in S are considered\n\
10318printable in repr() or S is empty, False otherwise.");
10319
10320static PyObject*
10321unicode_isprintable(PyObject *self)
10322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 Py_ssize_t i, length;
10324 int kind;
10325 void *data;
10326
10327 if (PyUnicode_READY(self) == -1)
10328 return NULL;
10329 length = PyUnicode_GET_LENGTH(self);
10330 kind = PyUnicode_KIND(self);
10331 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010332
10333 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (length == 1)
10335 return PyBool_FromLong(
10336 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 for (i = 0; i < length; i++) {
10339 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010340 Py_RETURN_FALSE;
10341 }
10342 }
10343 Py_RETURN_TRUE;
10344}
10345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010347 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348\n\
10349Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010350iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
10352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010353unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010355 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
Martin v. Löwis18e16552006-02-15 17:27:45 +000010358static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359unicode_length(PyUnicodeObject *self)
10360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (PyUnicode_READY(self) == -1)
10362 return -1;
10363 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364}
10365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010366PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010369Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010370done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
10372static PyObject *
10373unicode_ljust(PyUnicodeObject *self, PyObject *args)
10374{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010375 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 Py_UCS4 fillchar = ' ';
10377
10378 if (PyUnicode_READY(self) == -1)
10379 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010380
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010381 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return NULL;
10383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 Py_INCREF(self);
10386 return (PyObject*) self;
10387 }
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390}
10391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010393 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010398unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400 return fixup(self, fixlower);
10401}
10402
/* Selectors for the strip family of methods.  Each value is also the
   index of the matching entry in stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10411
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010412/* externally visible for str.strip(unicode) */
10413PyObject *
10414_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 void *data;
10417 int kind;
10418 Py_ssize_t i, j, len;
10419 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10422 return NULL;
10423
10424 kind = PyUnicode_KIND(self);
10425 data = PyUnicode_DATA(self);
10426 len = PyUnicode_GET_LENGTH(self);
10427 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10428 PyUnicode_DATA(sepobj),
10429 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430
Benjamin Peterson14339b62009-01-31 16:36:08 +000010431 i = 0;
10432 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 while (i < len &&
10434 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 i++;
10436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010437 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010438
Benjamin Peterson14339b62009-01-31 16:36:08 +000010439 j = len;
10440 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010441 do {
10442 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 } while (j >= i &&
10444 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010446 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010447
Benjamin Peterson14339b62009-01-31 16:36:08 +000010448 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010449 Py_INCREF(self);
10450 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451 }
10452 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010454}
10455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456/* Assumes an already ready self string. */
10457
10458static PyObject *
10459substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10460{
10461 const int kind = PyUnicode_KIND(self);
10462 void *data = PyUnicode_DATA(self);
10463 Py_UCS4 maxchar = 0;
10464 Py_ssize_t i;
10465 PyObject *unicode;
10466
10467 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10468 PyErr_BadInternalCall();
10469 return NULL;
10470 }
10471
10472 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10473 Py_INCREF(self);
10474 return (PyObject*)self;
10475 }
10476
10477 for (i = 0; i < len; ++i) {
10478 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10479 if (ch > maxchar)
10480 maxchar = ch;
10481 }
10482
10483 unicode = PyUnicode_New(len, maxchar);
10484 if (unicode == NULL)
10485 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010486 if (PyUnicode_CopyCharacters(unicode, 0,
10487 (PyObject*)self, start, len) < 0)
10488 {
10489 Py_DECREF(unicode);
10490 return NULL;
10491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 return unicode;
10493}
10494
10495PyObject*
10496PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10497{
10498 unsigned char *data;
10499 int kind;
10500
10501 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10502 && PyUnicode_CheckExact(self))
10503 {
10504 Py_INCREF(self);
10505 return (PyObject *)self;
10506 }
10507
10508 if ((end - start) == 1)
10509 return unicode_getitem((PyUnicodeObject*)self, start);
10510
10511 if (PyUnicode_READY(self) == -1)
10512 return NULL;
10513 kind = PyUnicode_KIND(self);
10514 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010515 return PyUnicode_FromKindAndData(kind,
10516 data + PyUnicode_KIND_SIZE(kind, start),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 end-start);
10518}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519
10520static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010521do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 int kind;
10524 void *data;
10525 Py_ssize_t len, i, j;
10526
10527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
10529
10530 kind = PyUnicode_KIND(self);
10531 data = PyUnicode_DATA(self);
10532 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010533
Benjamin Peterson14339b62009-01-31 16:36:08 +000010534 i = 0;
10535 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010537 i++;
10538 }
10539 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 j = len;
10542 if (striptype != LEFTSTRIP) {
10543 do {
10544 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 j++;
10547 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010548
Benjamin Peterson14339b62009-01-31 16:36:08 +000010549 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10550 Py_INCREF(self);
10551 return (PyObject*)self;
10552 }
10553 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555}
10556
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557
10558static PyObject *
10559do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10560{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10564 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565
Benjamin Peterson14339b62009-01-31 16:36:08 +000010566 if (sep != NULL && sep != Py_None) {
10567 if (PyUnicode_Check(sep))
10568 return _PyUnicode_XStrip(self, striptype, sep);
10569 else {
10570 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 "%s arg must be None or str",
10572 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 return NULL;
10574 }
10575 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010576
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578}
10579
10580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010581PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010583\n\
10584Return a copy of the string S with leading and trailing\n\
10585whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010586If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010587
10588static PyObject *
10589unicode_strip(PyUnicodeObject *self, PyObject *args)
10590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010591 if (PyTuple_GET_SIZE(args) == 0)
10592 return do_strip(self, BOTHSTRIP); /* Common case */
10593 else
10594 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010595}
10596
10597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010598PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010600\n\
10601Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010602If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010603
10604static PyObject *
10605unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10606{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010607 if (PyTuple_GET_SIZE(args) == 0)
10608 return do_strip(self, LEFTSTRIP); /* Common case */
10609 else
10610 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010611}
10612
10613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010614PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010616\n\
10617Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010618If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010619
10620static PyObject *
10621unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10622{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010623 if (PyTuple_GET_SIZE(args) == 0)
10624 return do_strip(self, RIGHTSTRIP); /* Common case */
10625 else
10626 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010627}
10628
10629
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010631unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632{
10633 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 Py_ssize_t nchars, n;
10635 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636
Georg Brandl222de0f2009-04-12 12:01:50 +000010637 if (len < 1) {
10638 Py_INCREF(unicode_empty);
10639 return (PyObject *)unicode_empty;
10640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Tim Peters7a29bd52001-09-12 03:03:31 +000010642 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* no repeat, return original string */
10644 Py_INCREF(str);
10645 return (PyObject*) str;
10646 }
Tim Peters8f422462000-09-09 06:13:41 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_READY(str) == -1)
10649 return NULL;
10650
Tim Peters8f422462000-09-09 06:13:41 +000010651 /* ensure # of chars needed doesn't overflow int and # of bytes
10652 * needed doesn't overflow size_t
10653 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 nchars = len * PyUnicode_GET_LENGTH(str);
10655 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010656 PyErr_SetString(PyExc_OverflowError,
10657 "repeated string is too long");
10658 return NULL;
10659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 char_size = PyUnicode_CHARACTER_SIZE(str);
10661 nbytes = (nchars + 1) * char_size;
10662 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010663 PyErr_SetString(PyExc_OverflowError,
10664 "repeated string is too long");
10665 return NULL;
10666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 if (!u)
10669 return NULL;
10670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (PyUnicode_GET_LENGTH(str) == 1) {
10672 const int kind = PyUnicode_KIND(str);
10673 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10674 void *to = PyUnicode_DATA(u);
10675 for (n = 0; n < len; ++n)
10676 PyUnicode_WRITE(kind, to, n, fill_char);
10677 }
10678 else {
10679 /* number of characters copied this far */
10680 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10681 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10682 char *to = (char *) PyUnicode_DATA(u);
10683 Py_MEMCPY(to, PyUnicode_DATA(str),
10684 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 n = (done <= nchars-done) ? done : nchars-done;
10687 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 }
10691
10692 return (PyObject*) u;
10693}
10694
Alexander Belopolsky40018472011-02-26 01:02:56 +000010695PyObject *
10696PyUnicode_Replace(PyObject *obj,
10697 PyObject *subobj,
10698 PyObject *replobj,
10699 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700{
10701 PyObject *self;
10702 PyObject *str1;
10703 PyObject *str2;
10704 PyObject *result;
10705
10706 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 Py_DECREF(self);
10712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 }
10714 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 Py_DECREF(self);
10717 Py_DECREF(str1);
10718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 Py_DECREF(self);
10722 Py_DECREF(str1);
10723 Py_DECREF(str2);
10724 return result;
10725}
10726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010727PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010728 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729\n\
10730Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010731old replaced by new. If the optional argument count is\n\
10732given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
10734static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 PyObject *str1;
10738 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010739 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 PyObject *result;
10741
Martin v. Löwis18e16552006-02-15 17:27:45 +000010742 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 str1 = PyUnicode_FromObject(str1);
10747 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10748 return NULL;
10749 str2 = PyUnicode_FromObject(str2);
10750 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 Py_DECREF(str1);
10752 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754
10755 result = replace(self, str1, str2, maxcount);
10756
10757 Py_DECREF(str1);
10758 Py_DECREF(str2);
10759 return result;
10760}
10761
Alexander Belopolsky40018472011-02-26 01:02:56 +000010762static PyObject *
10763unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010765 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 Py_ssize_t isize;
10767 Py_ssize_t osize, squote, dquote, i, o;
10768 Py_UCS4 max, quote;
10769 int ikind, okind;
10770 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010773 return NULL;
10774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 isize = PyUnicode_GET_LENGTH(unicode);
10776 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 /* Compute length of output, quote characters, and
10779 maximum character */
10780 osize = 2; /* quotes */
10781 max = 127;
10782 squote = dquote = 0;
10783 ikind = PyUnicode_KIND(unicode);
10784 for (i = 0; i < isize; i++) {
10785 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10786 switch (ch) {
10787 case '\'': squote++; osize++; break;
10788 case '"': dquote++; osize++; break;
10789 case '\\': case '\t': case '\r': case '\n':
10790 osize += 2; break;
10791 default:
10792 /* Fast-path ASCII */
10793 if (ch < ' ' || ch == 0x7f)
10794 osize += 4; /* \xHH */
10795 else if (ch < 0x7f)
10796 osize++;
10797 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10798 osize++;
10799 max = ch > max ? ch : max;
10800 }
10801 else if (ch < 0x100)
10802 osize += 4; /* \xHH */
10803 else if (ch < 0x10000)
10804 osize += 6; /* \uHHHH */
10805 else
10806 osize += 10; /* \uHHHHHHHH */
10807 }
10808 }
10809
10810 quote = '\'';
10811 if (squote) {
10812 if (dquote)
10813 /* Both squote and dquote present. Use squote,
10814 and escape them */
10815 osize += squote;
10816 else
10817 quote = '"';
10818 }
10819
10820 repr = PyUnicode_New(osize, max);
10821 if (repr == NULL)
10822 return NULL;
10823 okind = PyUnicode_KIND(repr);
10824 odata = PyUnicode_DATA(repr);
10825
10826 PyUnicode_WRITE(okind, odata, 0, quote);
10827 PyUnicode_WRITE(okind, odata, osize-1, quote);
10828
10829 for (i = 0, o = 1; i < isize; i++) {
10830 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010831
10832 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if ((ch == quote) || (ch == '\\')) {
10834 PyUnicode_WRITE(okind, odata, o++, '\\');
10835 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010836 continue;
10837 }
10838
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010840 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 PyUnicode_WRITE(okind, odata, o++, '\\');
10842 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010843 }
10844 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 PyUnicode_WRITE(okind, odata, o++, '\\');
10846 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010847 }
10848 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 PyUnicode_WRITE(okind, odata, o++, '\\');
10850 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010851 }
10852
10853 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010854 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 PyUnicode_WRITE(okind, odata, o++, '\\');
10856 PyUnicode_WRITE(okind, odata, o++, 'x');
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010859 }
10860
Georg Brandl559e5d72008-06-11 18:37:52 +000010861 /* Copy ASCII characters as-is */
10862 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010864 }
10865
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010867 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010868 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010869 (categories Z* and C* except ASCII space)
10870 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010872 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (ch <= 0xff) {
10874 PyUnicode_WRITE(okind, odata, o++, '\\');
10875 PyUnicode_WRITE(okind, odata, o++, 'x');
10876 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10877 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010878 }
10879 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 else if (ch >= 0x10000) {
10881 PyUnicode_WRITE(okind, odata, o++, '\\');
10882 PyUnicode_WRITE(okind, odata, o++, 'U');
10883 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10884 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10885 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10886 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10887 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10888 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10889 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10890 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010891 }
10892 /* Map 16-bit characters to '\uxxxx' */
10893 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 PyUnicode_WRITE(okind, odata, o++, '\\');
10895 PyUnicode_WRITE(okind, odata, o++, 'u');
10896 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10897 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10898 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10899 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010900 }
10901 }
10902 /* Copy characters as-is */
10903 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010905 }
10906 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010909 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910}
10911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914\n\
10915Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010916such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917arguments start and end are interpreted as in slice notation.\n\
10918\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010919Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920
10921static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923{
Jesus Ceaac451502011-04-20 17:09:23 +020010924 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010925 Py_ssize_t start;
10926 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
Jesus Ceaac451502011-04-20 17:09:23 +020010929 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10930 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (PyUnicode_READY(self) == -1)
10934 return NULL;
10935 if (PyUnicode_READY(substring) == -1)
10936 return NULL;
10937
10938 result = any_find_slice(
10939 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10940 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
10943 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 if (result == -2)
10946 return NULL;
10947
Christian Heimes217cfd12007-12-02 14:31:20 +000010948 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949}
10950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010951PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010954Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958{
Jesus Ceaac451502011-04-20 17:09:23 +020010959 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010960 Py_ssize_t start;
10961 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010962 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
Jesus Ceaac451502011-04-20 17:09:23 +020010964 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10965 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970 if (PyUnicode_READY(substring) == -1)
10971 return NULL;
10972
10973 result = any_find_slice(
10974 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10975 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010976 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (result == -2)
10981 return NULL;
10982
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983 if (result < 0) {
10984 PyErr_SetString(PyExc_ValueError, "substring not found");
10985 return NULL;
10986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987
Christian Heimes217cfd12007-12-02 14:31:20 +000010988 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989}
10990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010991PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010994Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010995done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
10997static PyObject *
10998unicode_rjust(PyUnicodeObject *self, PyObject *args)
10999{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011000 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 Py_UCS4 fillchar = ' ';
11002
11003 if (PyUnicode_READY(self) == -1)
11004 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011005
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011006 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 return NULL;
11008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 Py_INCREF(self);
11011 return (PyObject*) self;
11012 }
11013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015}
11016
Alexander Belopolsky40018472011-02-26 01:02:56 +000011017PyObject *
11018PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019{
11020 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011021
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 s = PyUnicode_FromObject(s);
11023 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011024 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 if (sep != NULL) {
11026 sep = PyUnicode_FromObject(sep);
11027 if (sep == NULL) {
11028 Py_DECREF(s);
11029 return NULL;
11030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 }
11032
11033 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11034
11035 Py_DECREF(s);
11036 Py_XDECREF(sep);
11037 return result;
11038}
11039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011040PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011041 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042\n\
11043Return a list of the words in S, using sep as the\n\
11044delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011045splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011046whitespace string is a separator and empty strings are\n\
11047removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049static PyObject*
11050unicode_split(PyUnicodeObject *self, PyObject *args)
11051{
11052 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011053 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Martin v. Löwis18e16552006-02-15 17:27:45 +000011055 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 return NULL;
11057
11058 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064}
11065
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066PyObject *
11067PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11068{
11069 PyObject* str_obj;
11070 PyObject* sep_obj;
11071 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 int kind1, kind2, kind;
11073 void *buf1 = NULL, *buf2 = NULL;
11074 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011075
11076 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011079 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081 Py_DECREF(str_obj);
11082 return NULL;
11083 }
11084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 kind1 = PyUnicode_KIND(str_in);
11086 kind2 = PyUnicode_KIND(sep_obj);
11087 kind = kind1 > kind2 ? kind1 : kind2;
11088 buf1 = PyUnicode_DATA(str_in);
11089 if (kind1 != kind)
11090 buf1 = _PyUnicode_AsKind(str_in, kind);
11091 if (!buf1)
11092 goto onError;
11093 buf2 = PyUnicode_DATA(sep_obj);
11094 if (kind2 != kind)
11095 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11096 if (!buf2)
11097 goto onError;
11098 len1 = PyUnicode_GET_LENGTH(str_obj);
11099 len2 = PyUnicode_GET_LENGTH(sep_obj);
11100
11101 switch(PyUnicode_KIND(str_in)) {
11102 case PyUnicode_1BYTE_KIND:
11103 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11104 break;
11105 case PyUnicode_2BYTE_KIND:
11106 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11107 break;
11108 case PyUnicode_4BYTE_KIND:
11109 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11110 break;
11111 default:
11112 assert(0);
11113 out = 0;
11114 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011115
11116 Py_DECREF(sep_obj);
11117 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (kind1 != kind)
11119 PyMem_Free(buf1);
11120 if (kind2 != kind)
11121 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122
11123 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 onError:
11125 Py_DECREF(sep_obj);
11126 Py_DECREF(str_obj);
11127 if (kind1 != kind && buf1)
11128 PyMem_Free(buf1);
11129 if (kind2 != kind && buf2)
11130 PyMem_Free(buf2);
11131 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132}
11133
11134
11135PyObject *
11136PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11137{
11138 PyObject* str_obj;
11139 PyObject* sep_obj;
11140 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 int kind1, kind2, kind;
11142 void *buf1 = NULL, *buf2 = NULL;
11143 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011144
11145 str_obj = PyUnicode_FromObject(str_in);
11146 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011148 sep_obj = PyUnicode_FromObject(sep_in);
11149 if (!sep_obj) {
11150 Py_DECREF(str_obj);
11151 return NULL;
11152 }
11153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 kind1 = PyUnicode_KIND(str_in);
11155 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011156 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 buf1 = PyUnicode_DATA(str_in);
11158 if (kind1 != kind)
11159 buf1 = _PyUnicode_AsKind(str_in, kind);
11160 if (!buf1)
11161 goto onError;
11162 buf2 = PyUnicode_DATA(sep_obj);
11163 if (kind2 != kind)
11164 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11165 if (!buf2)
11166 goto onError;
11167 len1 = PyUnicode_GET_LENGTH(str_obj);
11168 len2 = PyUnicode_GET_LENGTH(sep_obj);
11169
11170 switch(PyUnicode_KIND(str_in)) {
11171 case PyUnicode_1BYTE_KIND:
11172 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11173 break;
11174 case PyUnicode_2BYTE_KIND:
11175 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11176 break;
11177 case PyUnicode_4BYTE_KIND:
11178 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11179 break;
11180 default:
11181 assert(0);
11182 out = 0;
11183 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184
11185 Py_DECREF(sep_obj);
11186 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (kind1 != kind)
11188 PyMem_Free(buf1);
11189 if (kind2 != kind)
11190 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011191
11192 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 onError:
11194 Py_DECREF(sep_obj);
11195 Py_DECREF(str_obj);
11196 if (kind1 != kind && buf1)
11197 PyMem_Free(buf1);
11198 if (kind2 != kind && buf2)
11199 PyMem_Free(buf2);
11200 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011201}
11202
11203PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011205\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011206Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011207the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011208found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209
11210static PyObject*
11211unicode_partition(PyUnicodeObject *self, PyObject *separator)
11212{
11213 return PyUnicode_Partition((PyObject *)self, separator);
11214}
11215
11216PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011217 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011218\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011219Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011220the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011221separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011222
11223static PyObject*
11224unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11225{
11226 return PyUnicode_RPartition((PyObject *)self, separator);
11227}
11228
Alexander Belopolsky40018472011-02-26 01:02:56 +000011229PyObject *
11230PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011231{
11232 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011234 s = PyUnicode_FromObject(s);
11235 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011236 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 if (sep != NULL) {
11238 sep = PyUnicode_FromObject(sep);
11239 if (sep == NULL) {
11240 Py_DECREF(s);
11241 return NULL;
11242 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011243 }
11244
11245 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11246
11247 Py_DECREF(s);
11248 Py_XDECREF(sep);
11249 return result;
11250}
11251
11252PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011254\n\
11255Return a list of the words in S, using sep as the\n\
11256delimiter string, starting at the end of the string and\n\
11257working to the front. If maxsplit is given, at most maxsplit\n\
11258splits are done. If sep is not specified, any whitespace string\n\
11259is a separator.");
11260
11261static PyObject*
11262unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11263{
11264 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011265 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011266
Martin v. Löwis18e16552006-02-15 17:27:45 +000011267 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011268 return NULL;
11269
11270 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011272 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011274 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011276}
11277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280\n\
11281Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011282Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011283is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
11285static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011286unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011288 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011289 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011291 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11292 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 return NULL;
11294
Guido van Rossum86662912000-04-11 15:38:46 +000011295 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296}
11297
11298static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011299PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300{
Walter Dörwald346737f2007-05-31 10:44:43 +000011301 if (PyUnicode_CheckExact(self)) {
11302 Py_INCREF(self);
11303 return self;
11304 } else
11305 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011306 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307}
11308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311\n\
11312Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011316unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 return fixup(self, fixswapcase);
11319}
11320
Georg Brandlceee0772007-11-27 23:48:05 +000011321PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011323\n\
11324Return a translation table usable for str.translate().\n\
11325If there is only one argument, it must be a dictionary mapping Unicode\n\
11326ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011327Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011328If there are two arguments, they must be strings of equal length, and\n\
11329in the resulting dictionary, each character in x will be mapped to the\n\
11330character at the same position in y. If there is a third argument, it\n\
11331must be a string, whose characters will be mapped to None in the result.");
11332
11333static PyObject*
11334unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11335{
11336 PyObject *x, *y = NULL, *z = NULL;
11337 PyObject *new = NULL, *key, *value;
11338 Py_ssize_t i = 0;
11339 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011340
Georg Brandlceee0772007-11-27 23:48:05 +000011341 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11342 return NULL;
11343 new = PyDict_New();
11344 if (!new)
11345 return NULL;
11346 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 int x_kind, y_kind, z_kind;
11348 void *x_data, *y_data, *z_data;
11349
Georg Brandlceee0772007-11-27 23:48:05 +000011350 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011351 if (!PyUnicode_Check(x)) {
11352 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11353 "be a string if there is a second argument");
11354 goto err;
11355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011357 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11358 "arguments must have equal length");
11359 goto err;
11360 }
11361 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 x_kind = PyUnicode_KIND(x);
11363 y_kind = PyUnicode_KIND(y);
11364 x_data = PyUnicode_DATA(x);
11365 y_data = PyUnicode_DATA(y);
11366 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11367 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11368 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011369 if (!key || !value)
11370 goto err;
11371 res = PyDict_SetItem(new, key, value);
11372 Py_DECREF(key);
11373 Py_DECREF(value);
11374 if (res < 0)
11375 goto err;
11376 }
11377 /* create entries for deleting chars in z */
11378 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 z_kind = PyUnicode_KIND(z);
11380 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011381 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011383 if (!key)
11384 goto err;
11385 res = PyDict_SetItem(new, key, Py_None);
11386 Py_DECREF(key);
11387 if (res < 0)
11388 goto err;
11389 }
11390 }
11391 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 int kind;
11393 void *data;
11394
Georg Brandlceee0772007-11-27 23:48:05 +000011395 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011396 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011397 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11398 "to maketrans it must be a dict");
11399 goto err;
11400 }
11401 /* copy entries into the new dict, converting string keys to int keys */
11402 while (PyDict_Next(x, &i, &key, &value)) {
11403 if (PyUnicode_Check(key)) {
11404 /* convert string keys to integer keys */
11405 PyObject *newkey;
11406 if (PyUnicode_GET_SIZE(key) != 1) {
11407 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11408 "table must be of length 1");
11409 goto err;
11410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 kind = PyUnicode_KIND(key);
11412 data = PyUnicode_DATA(key);
11413 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011414 if (!newkey)
11415 goto err;
11416 res = PyDict_SetItem(new, newkey, value);
11417 Py_DECREF(newkey);
11418 if (res < 0)
11419 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011420 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011421 /* just keep integer keys */
11422 if (PyDict_SetItem(new, key, value) < 0)
11423 goto err;
11424 } else {
11425 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11426 "be strings or integers");
11427 goto err;
11428 }
11429 }
11430 }
11431 return new;
11432 err:
11433 Py_DECREF(new);
11434 return NULL;
11435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
11440Return a copy of the string S, where all characters have been mapped\n\
11441through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011442Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011443Unmapped characters are left untouched. Characters mapped to None\n\
11444are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
11446static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450}
11451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011452PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
11457static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011458unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 return fixup(self, fixupper);
11461}
11462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011463PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011466Pad a numeric string S with zeros on the left, to fill a field\n\
11467of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
11469static PyObject *
11470unicode_zfill(PyUnicodeObject *self, PyObject *args)
11471{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011472 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011474 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 int kind;
11476 void *data;
11477 Py_UCS4 chr;
11478
11479 if (PyUnicode_READY(self) == -1)
11480 return NULL;
11481
Martin v. Löwis18e16552006-02-15 17:27:45 +000011482 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 return NULL;
11484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011486 if (PyUnicode_CheckExact(self)) {
11487 Py_INCREF(self);
11488 return (PyObject*) self;
11489 }
11490 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011491 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 }
11493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496 u = pad(self, fill, 0, '0');
11497
Walter Dörwald068325e2002-04-15 13:36:47 +000011498 if (u == NULL)
11499 return NULL;
11500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 kind = PyUnicode_KIND(u);
11502 data = PyUnicode_DATA(u);
11503 chr = PyUnicode_READ(kind, data, fill);
11504
11505 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 PyUnicode_WRITE(kind, data, 0, chr);
11508 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 }
11510
11511 return (PyObject*) u;
11512}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
#if 0
/* Disabled by the enclosing "#if 0": a wrapper that would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011522PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011525Return True if S starts with the specified prefix, False otherwise.\n\
11526With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011527With optional end, stop comparing S at that position.\n\
11528prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
11530static PyObject *
11531unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011534 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011536 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011537 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011538 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Jesus Ceaac451502011-04-20 17:09:23 +020011540 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 if (PyTuple_Check(subobj)) {
11543 Py_ssize_t i;
11544 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11545 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011547 if (substring == NULL)
11548 return NULL;
11549 result = tailmatch(self, substring, start, end, -1);
11550 Py_DECREF(substring);
11551 if (result) {
11552 Py_RETURN_TRUE;
11553 }
11554 }
11555 /* nothing matched */
11556 Py_RETURN_FALSE;
11557 }
11558 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011559 if (substring == NULL) {
11560 if (PyErr_ExceptionMatches(PyExc_TypeError))
11561 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11562 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011564 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011565 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011567 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568}
11569
11570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011571PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011574Return True if S ends with the specified suffix, False otherwise.\n\
11575With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011576With optional end, stop comparing S at that position.\n\
11577suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
11579static PyObject *
11580unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011583 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011585 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011586 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011587 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
Jesus Ceaac451502011-04-20 17:09:23 +020011589 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011591 if (PyTuple_Check(subobj)) {
11592 Py_ssize_t i;
11593 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11594 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011596 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011598 result = tailmatch(self, substring, start, end, +1);
11599 Py_DECREF(substring);
11600 if (result) {
11601 Py_RETURN_TRUE;
11602 }
11603 }
11604 Py_RETURN_FALSE;
11605 }
11606 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011607 if (substring == NULL) {
11608 if (PyErr_ExceptionMatches(PyExc_TypeError))
11609 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11610 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011612 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011613 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011615 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011619
11620PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011622\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011623Return a formatted version of S, using substitutions from args and kwargs.\n\
11624The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011625
Eric Smith27bbca62010-11-04 17:06:58 +000011626PyDoc_STRVAR(format_map__doc__,
11627 "S.format_map(mapping) -> str\n\
11628\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011629Return a formatted version of S, using substitutions from mapping.\n\
11630The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011631
Eric Smith4a7d76d2008-05-30 18:10:19 +000011632static PyObject *
11633unicode__format__(PyObject* self, PyObject* args)
11634{
11635 PyObject *format_spec;
11636
11637 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11638 return NULL;
11639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11641 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011642}
11643
Eric Smith8c663262007-08-25 02:26:07 +000011644PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011646\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011647Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011648
11649static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011650unicode__sizeof__(PyUnicodeObject *v)
11651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 Py_ssize_t size;
11653
11654 /* If it's a compact object, account for base structure +
11655 character data. */
11656 if (PyUnicode_IS_COMPACT_ASCII(v))
11657 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11658 else if (PyUnicode_IS_COMPACT(v))
11659 size = sizeof(PyCompactUnicodeObject) +
11660 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11661 else {
11662 /* If it is a two-block object, account for base object, and
11663 for character block if present. */
11664 size = sizeof(PyUnicodeObject);
11665 if (v->data.any)
11666 size += (PyUnicode_GET_LENGTH(v) + 1) *
11667 PyUnicode_CHARACTER_SIZE(v);
11668 }
11669 /* If the wstr pointer is present, account for it unless it is shared
11670 with the data pointer. Since PyUnicode_DATA will crash if the object
11671 is not ready, check whether it's either not ready (in which case the
11672 data is entirely in wstr) or if the data is not shared. */
11673 if (_PyUnicode_WSTR(v) &&
11674 (!PyUnicode_IS_READY(v) ||
11675 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11676 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11677 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11678 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11679
11680 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011681}
11682
11683PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011685
11686static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011687unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011688{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011689 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (!copy)
11691 return NULL;
11692 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011693}
11694
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695static PyMethodDef unicode_methods[] = {
11696
11697 /* Order is according to common usage: often used methods should
11698 appear first, since lookup is done sequentially. */
11699
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011700 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011701 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11702 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011703 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011704 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11705 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11706 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11707 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11708 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11709 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11710 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011711 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011712 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11713 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11714 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011716 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11717 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11718 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011720 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011721 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011722 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011723 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11724 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11725 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11726 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11727 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11728 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11729 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11730 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11731 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11732 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11733 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11734 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11735 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11736 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011737 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011738 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011739 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011740 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011741 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011742 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011743 {"maketrans", (PyCFunction) unicode_maketrans,
11744 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011745 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011746#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011747 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748#endif
11749
11750#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011751 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011752 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753#endif
11754
Benjamin Peterson14339b62009-01-31 16:36:08 +000011755 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 {NULL, NULL}
11757};
11758
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011759static PyObject *
11760unicode_mod(PyObject *v, PyObject *w)
11761{
Brian Curtindfc80e32011-08-10 20:28:54 -050011762 if (!PyUnicode_Check(v))
11763 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011765}
11766
11767static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 0, /*nb_add*/
11769 0, /*nb_subtract*/
11770 0, /*nb_multiply*/
11771 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011772};
11773
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 (lenfunc) unicode_length, /* sq_length */
11776 PyUnicode_Concat, /* sq_concat */
11777 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11778 (ssizeargfunc) unicode_getitem, /* sq_item */
11779 0, /* sq_slice */
11780 0, /* sq_ass_item */
11781 0, /* sq_ass_slice */
11782 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783};
11784
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011785static PyObject*
11786unicode_subscript(PyUnicodeObject* self, PyObject* item)
11787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (PyUnicode_READY(self) == -1)
11789 return NULL;
11790
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011791 if (PyIndex_Check(item)) {
11792 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793 if (i == -1 && PyErr_Occurred())
11794 return NULL;
11795 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011797 return unicode_getitem(self, i);
11798 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011799 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011801 Py_UNICODE* result_buf;
11802 PyObject* result;
11803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011806 return NULL;
11807 }
11808
11809 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 return PyUnicode_New(0, 0);
11811 } else if (start == 0 && step == 1 &&
11812 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011813 PyUnicode_CheckExact(self)) {
11814 Py_INCREF(self);
11815 return (PyObject *)self;
11816 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011818 } else {
11819 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011820 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11821 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 if (result_buf == NULL)
11824 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011825
11826 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11827 result_buf[i] = source_buf[cur];
11828 }
Tim Petersced69f82003-09-16 20:30:58 +000011829
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011830 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011831 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011832 return result;
11833 }
11834 } else {
11835 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11836 return NULL;
11837 }
11838}
11839
11840static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 (lenfunc)unicode_length, /* mp_length */
11842 (binaryfunc)unicode_subscript, /* mp_subscript */
11843 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011844};
11845
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847/* Helpers for PyUnicode_Format() */
11848
11849static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011850getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011852 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 (*p_argidx)++;
11855 if (arglen < 0)
11856 return args;
11857 else
11858 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 }
11860 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862 return NULL;
11863}
11864
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011865/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011867static PyObject *
11868formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011870 char *p;
11871 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011873
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 x = PyFloat_AsDouble(v);
11875 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011876 return NULL;
11877
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011880
Eric Smith0923d1d2009-04-16 20:16:10 +000011881 p = PyOS_double_to_string(x, type, prec,
11882 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011883 if (p == NULL)
11884 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011886 PyMem_Free(p);
11887 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888}
11889
Tim Peters38fd5b62000-09-21 05:43:11 +000011890static PyObject*
11891formatlong(PyObject *val, int flags, int prec, int type)
11892{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011893 char *buf;
11894 int len;
11895 PyObject *str; /* temporary string object. */
11896 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011897
Benjamin Peterson14339b62009-01-31 16:36:08 +000011898 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11899 if (!str)
11900 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011902 Py_DECREF(str);
11903 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011904}
11905
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011908 size_t buflen,
11909 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011911 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011912 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (PyUnicode_GET_LENGTH(v) == 1) {
11914 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 buf[1] = '\0';
11916 return 1;
11917 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 goto onError;
11919 }
11920 else {
11921 /* Integer input truncated to a character */
11922 long x;
11923 x = PyLong_AsLong(v);
11924 if (x == -1 && PyErr_Occurred())
11925 goto onError;
11926
11927 if (x < 0 || x > 0x10ffff) {
11928 PyErr_SetString(PyExc_OverflowError,
11929 "%c arg not in range(0x110000)");
11930 return -1;
11931 }
11932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011934 buf[1] = '\0';
11935 return 1;
11936 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011937
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011939 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011941 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942}
11943
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011948
Alexander Belopolsky40018472011-02-26 01:02:56 +000011949PyObject *
11950PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 void *fmt;
11953 int fmtkind;
11954 PyObject *result;
11955 Py_UCS4 *res, *res0;
11956 Py_UCS4 max;
11957 int kind;
11958 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011962
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 PyErr_BadInternalCall();
11965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11968 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 fmt = PyUnicode_DATA(uformat);
11971 fmtkind = PyUnicode_KIND(uformat);
11972 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11973 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
11975 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11977 if (res0 == NULL) {
11978 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 arglen = PyTuple_Size(args);
11984 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 }
11986 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 arglen = -1;
11988 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011990 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011991 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
11994 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 if (--rescnt < 0) {
11997 rescnt = fmtcnt + 100;
11998 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12000 if (res0 == NULL){
12001 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 }
12004 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012008 }
12009 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 /* Got a format specifier */
12011 int flags = 0;
12012 Py_ssize_t width = -1;
12013 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 Py_UCS4 c = '\0';
12015 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012016 int isnumok;
12017 PyObject *v = NULL;
12018 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 void *pbuf;
12020 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 Py_ssize_t len, len1;
12023 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 fmtpos++;
12026 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12027 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 Py_ssize_t keylen;
12029 PyObject *key;
12030 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012031
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 if (dict == NULL) {
12033 PyErr_SetString(PyExc_TypeError,
12034 "format requires a mapping");
12035 goto onError;
12036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 /* Skip over balanced parentheses */
12041 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012049 if (fmtcnt < 0 || pcount > 0) {
12050 PyErr_SetString(PyExc_ValueError,
12051 "incomplete format key");
12052 goto onError;
12053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 if (key == NULL)
12056 goto onError;
12057 if (args_owned) {
12058 Py_DECREF(args);
12059 args_owned = 0;
12060 }
12061 args = PyObject_GetItem(dict, key);
12062 Py_DECREF(key);
12063 if (args == NULL) {
12064 goto onError;
12065 }
12066 args_owned = 1;
12067 arglen = -1;
12068 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012069 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 case '-': flags |= F_LJUST; continue;
12073 case '+': flags |= F_SIGN; continue;
12074 case ' ': flags |= F_BLANK; continue;
12075 case '#': flags |= F_ALT; continue;
12076 case '0': flags |= F_ZERO; continue;
12077 }
12078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 if (c == '*') {
12081 v = getnextarg(args, arglen, &argidx);
12082 if (v == NULL)
12083 goto onError;
12084 if (!PyLong_Check(v)) {
12085 PyErr_SetString(PyExc_TypeError,
12086 "* wants int");
12087 goto onError;
12088 }
12089 width = PyLong_AsLong(v);
12090 if (width == -1 && PyErr_Occurred())
12091 goto onError;
12092 if (width < 0) {
12093 flags |= F_LJUST;
12094 width = -width;
12095 }
12096 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 }
12099 else if (c >= '0' && c <= '9') {
12100 width = c - '0';
12101 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 if (c < '0' || c > '9')
12104 break;
12105 if ((width*10) / 10 != width) {
12106 PyErr_SetString(PyExc_ValueError,
12107 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 }
12110 width = width*10 + (c - '0');
12111 }
12112 }
12113 if (c == '.') {
12114 prec = 0;
12115 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 if (c == '*') {
12118 v = getnextarg(args, arglen, &argidx);
12119 if (v == NULL)
12120 goto onError;
12121 if (!PyLong_Check(v)) {
12122 PyErr_SetString(PyExc_TypeError,
12123 "* wants int");
12124 goto onError;
12125 }
12126 prec = PyLong_AsLong(v);
12127 if (prec == -1 && PyErr_Occurred())
12128 goto onError;
12129 if (prec < 0)
12130 prec = 0;
12131 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 }
12134 else if (c >= '0' && c <= '9') {
12135 prec = c - '0';
12136 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 if (c < '0' || c > '9')
12139 break;
12140 if ((prec*10) / 10 != prec) {
12141 PyErr_SetString(PyExc_ValueError,
12142 "prec too big");
12143 goto onError;
12144 }
12145 prec = prec*10 + (c - '0');
12146 }
12147 }
12148 } /* prec */
12149 if (fmtcnt >= 0) {
12150 if (c == 'h' || c == 'l' || c == 'L') {
12151 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 }
12154 }
12155 if (fmtcnt < 0) {
12156 PyErr_SetString(PyExc_ValueError,
12157 "incomplete format");
12158 goto onError;
12159 }
12160 if (c != '%') {
12161 v = getnextarg(args, arglen, &argidx);
12162 if (v == NULL)
12163 goto onError;
12164 }
12165 sign = 0;
12166 fill = ' ';
12167 switch (c) {
12168
12169 case '%':
12170 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 len = 1;
12175 break;
12176
12177 case 's':
12178 case 'r':
12179 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012180 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012181 temp = v;
12182 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012183 }
12184 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 if (c == 's')
12186 temp = PyObject_Str(v);
12187 else if (c == 'r')
12188 temp = PyObject_Repr(v);
12189 else
12190 temp = PyObject_ASCII(v);
12191 if (temp == NULL)
12192 goto onError;
12193 if (PyUnicode_Check(temp))
12194 /* nothing to do */;
12195 else {
12196 Py_DECREF(temp);
12197 PyErr_SetString(PyExc_TypeError,
12198 "%s argument has non-string str()");
12199 goto onError;
12200 }
12201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (PyUnicode_READY(temp) == -1) {
12203 Py_CLEAR(temp);
12204 goto onError;
12205 }
12206 pbuf = PyUnicode_DATA(temp);
12207 kind = PyUnicode_KIND(temp);
12208 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 if (prec >= 0 && len > prec)
12210 len = prec;
12211 break;
12212
12213 case 'i':
12214 case 'd':
12215 case 'u':
12216 case 'o':
12217 case 'x':
12218 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 isnumok = 0;
12220 if (PyNumber_Check(v)) {
12221 PyObject *iobj=NULL;
12222
12223 if (PyLong_Check(v)) {
12224 iobj = v;
12225 Py_INCREF(iobj);
12226 }
12227 else {
12228 iobj = PyNumber_Long(v);
12229 }
12230 if (iobj!=NULL) {
12231 if (PyLong_Check(iobj)) {
12232 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012233 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 Py_DECREF(iobj);
12235 if (!temp)
12236 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_READY(temp) == -1) {
12238 Py_CLEAR(temp);
12239 goto onError;
12240 }
12241 pbuf = PyUnicode_DATA(temp);
12242 kind = PyUnicode_KIND(temp);
12243 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 sign = 1;
12245 }
12246 else {
12247 Py_DECREF(iobj);
12248 }
12249 }
12250 }
12251 if (!isnumok) {
12252 PyErr_Format(PyExc_TypeError,
12253 "%%%c format: a number is required, "
12254 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12255 goto onError;
12256 }
12257 if (flags & F_ZERO)
12258 fill = '0';
12259 break;
12260
12261 case 'e':
12262 case 'E':
12263 case 'f':
12264 case 'F':
12265 case 'g':
12266 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012267 temp = formatfloat(v, flags, prec, c);
12268 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (PyUnicode_READY(temp) == -1) {
12271 Py_CLEAR(temp);
12272 goto onError;
12273 }
12274 pbuf = PyUnicode_DATA(temp);
12275 kind = PyUnicode_KIND(temp);
12276 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 sign = 1;
12278 if (flags & F_ZERO)
12279 fill = '0';
12280 break;
12281
12282 case 'c':
12283 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012285 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 if (len < 0)
12287 goto onError;
12288 break;
12289
12290 default:
12291 PyErr_Format(PyExc_ValueError,
12292 "unsupported format character '%c' (0x%x) "
12293 "at index %zd",
12294 (31<=c && c<=126) ? (char)c : '?',
12295 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 goto onError;
12298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 /* pbuf is initialized here. */
12300 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12303 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12304 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 len--;
12306 }
12307 else if (flags & F_SIGN)
12308 sign = '+';
12309 else if (flags & F_BLANK)
12310 sign = ' ';
12311 else
12312 sign = 0;
12313 }
12314 if (width < len)
12315 width = len;
12316 if (rescnt - (sign != 0) < width) {
12317 reslen -= rescnt;
12318 rescnt = width + fmtcnt + 100;
12319 reslen += rescnt;
12320 if (reslen < 0) {
12321 Py_XDECREF(temp);
12322 PyErr_NoMemory();
12323 goto onError;
12324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12326 if (res0 == 0) {
12327 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 Py_XDECREF(temp);
12329 goto onError;
12330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 }
12333 if (sign) {
12334 if (fill != ' ')
12335 *res++ = sign;
12336 rescnt--;
12337 if (width > len)
12338 width--;
12339 }
12340 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12342 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12345 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 }
12347 rescnt -= 2;
12348 width -= 2;
12349 if (width < 0)
12350 width = 0;
12351 len -= 2;
12352 }
12353 if (width > len && !(flags & F_LJUST)) {
12354 do {
12355 --rescnt;
12356 *res++ = fill;
12357 } while (--width > len);
12358 }
12359 if (fill == ' ') {
12360 if (sign)
12361 *res++ = sign;
12362 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12364 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12365 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12366 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012367 }
12368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 /* Copy all characters, preserving len */
12370 len1 = len;
12371 while (len1--) {
12372 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12373 rescnt--;
12374 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 while (--width >= len) {
12376 --rescnt;
12377 *res++ = ' ';
12378 }
12379 if (dict && (argidx < arglen) && c != '%') {
12380 PyErr_SetString(PyExc_TypeError,
12381 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012382 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 goto onError;
12384 }
12385 Py_XDECREF(temp);
12386 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387 } /* until end */
12388 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 PyErr_SetString(PyExc_TypeError,
12390 "not all arguments converted during string formatting");
12391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 }
12393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394
12395 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12396 if (*res > max)
12397 max = *res;
12398 result = PyUnicode_New(reslen - rescnt, max);
12399 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 kind = PyUnicode_KIND(result);
12402 for (res = res0; res < res0+reslen-rescnt; res++)
12403 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12404 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407 }
12408 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409 return (PyObject *)result;
12410
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413 Py_DECREF(uformat);
12414 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 }
12417 return NULL;
12418}
12419
Jeremy Hylton938ace62002-07-17 16:30:39 +000012420static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012421unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12422
Tim Peters6d6c1a32001-08-02 04:15:00 +000012423static PyObject *
12424unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12425{
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012427 static char *kwlist[] = {"object", "encoding", "errors", 0};
12428 char *encoding = NULL;
12429 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012430
Benjamin Peterson14339b62009-01-31 16:36:08 +000012431 if (type != &PyUnicode_Type)
12432 return unicode_subtype_new(type, args, kwds);
12433 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 return NULL;
12436 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012438 if (encoding == NULL && errors == NULL)
12439 return PyObject_Str(x);
12440 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012442}
12443
Guido van Rossume023fe02001-08-30 03:12:59 +000012444static PyObject *
12445unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 PyUnicodeObject *tmp, *pnew;
12448 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012450
Benjamin Peterson14339b62009-01-31 16:36:08 +000012451 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12452 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12453 if (tmp == NULL)
12454 return NULL;
12455 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12457 // it seems kind of strange that tp_alloc gets passed the size
12458 // of the unicode string because there will follow another
12459 // malloc.
12460 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12461 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 if (pnew == NULL) {
12463 Py_DECREF(tmp);
12464 return NULL;
12465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12467 if (_PyUnicode_WSTR(pnew) == NULL) {
12468 err = PyErr_NoMemory();
12469 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12472 _PyUnicode_WSTR_LENGTH(pnew) = n;
12473 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12474 _PyUnicode_STATE(pnew).interned = 0;
12475 _PyUnicode_STATE(pnew).kind = 0;
12476 _PyUnicode_STATE(pnew).compact = 0;
12477 _PyUnicode_STATE(pnew).ready = 0;
12478 _PyUnicode_STATE(pnew).ascii = 0;
12479 pnew->data.any = NULL;
12480 _PyUnicode_LENGTH(pnew) = 0;
12481 pnew->_base.utf8 = NULL;
12482 pnew->_base.utf8_length = 0;
12483
12484 if (PyUnicode_READY(pnew) == -1) {
12485 PyObject_FREE(_PyUnicode_WSTR(pnew));
12486 goto onError;
12487 }
12488
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 Py_DECREF(tmp);
12490 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491
12492 onError:
12493 _Py_ForgetReference((PyObject *)pnew);
12494 PyObject_Del(pnew);
12495 Py_DECREF(tmp);
12496 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012497}
12498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012501\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012502Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012503encoding defaults to the current default string encoding.\n\
12504errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012505
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012506static PyObject *unicode_iter(PyObject *seq);
12507
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012509 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 "str", /* tp_name */
12511 sizeof(PyUnicodeObject), /* tp_size */
12512 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012514 (destructor)unicode_dealloc, /* tp_dealloc */
12515 0, /* tp_print */
12516 0, /* tp_getattr */
12517 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012518 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012519 unicode_repr, /* tp_repr */
12520 &unicode_as_number, /* tp_as_number */
12521 &unicode_as_sequence, /* tp_as_sequence */
12522 &unicode_as_mapping, /* tp_as_mapping */
12523 (hashfunc) unicode_hash, /* tp_hash*/
12524 0, /* tp_call*/
12525 (reprfunc) unicode_str, /* tp_str */
12526 PyObject_GenericGetAttr, /* tp_getattro */
12527 0, /* tp_setattro */
12528 0, /* tp_as_buffer */
12529 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012531 unicode_doc, /* tp_doc */
12532 0, /* tp_traverse */
12533 0, /* tp_clear */
12534 PyUnicode_RichCompare, /* tp_richcompare */
12535 0, /* tp_weaklistoffset */
12536 unicode_iter, /* tp_iter */
12537 0, /* tp_iternext */
12538 unicode_methods, /* tp_methods */
12539 0, /* tp_members */
12540 0, /* tp_getset */
12541 &PyBaseObject_Type, /* tp_base */
12542 0, /* tp_dict */
12543 0, /* tp_descr_get */
12544 0, /* tp_descr_set */
12545 0, /* tp_dictoffset */
12546 0, /* tp_init */
12547 0, /* tp_alloc */
12548 unicode_new, /* tp_new */
12549 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550};
12551
12552/* Initialize the Unicode implementation */
12553
Thomas Wouters78890102000-07-22 19:25:51 +000012554void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012556 int i;
12557
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560 0x000A, /* LINE FEED */
12561 0x000D, /* CARRIAGE RETURN */
12562 0x001C, /* FILE SEPARATOR */
12563 0x001D, /* GROUP SEPARATOR */
12564 0x001E, /* RECORD SEPARATOR */
12565 0x0085, /* NEXT LINE */
12566 0x2028, /* LINE SEPARATOR */
12567 0x2029, /* PARAGRAPH SEPARATOR */
12568 };
12569
Fred Drakee4315f52000-05-09 19:53:39 +000012570 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012572 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012574
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012575 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012577 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579
12580 /* initialize the linebreak bloom filter */
12581 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012583 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012584
12585 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
12587
12588/* Finalize the Unicode implementation */
12589
/* Nothing is released by this implementation; the function
   unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12595
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596void
Thomas Wouters78890102000-07-22 19:25:51 +000012597_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012599 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012601 Py_XDECREF(unicode_empty);
12602 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012603
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012604 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 if (unicode_latin1[i]) {
12606 Py_DECREF(unicode_latin1[i]);
12607 unicode_latin1[i] = NULL;
12608 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012609 }
Christian Heimesa156e092008-02-16 07:38:31 +000012610 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012612
Walter Dörwald16807132007-05-25 13:52:07 +000012613void
12614PyUnicode_InternInPlace(PyObject **p)
12615{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012616 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12617 PyObject *t;
12618 if (s == NULL || !PyUnicode_Check(s))
12619 Py_FatalError(
12620 "PyUnicode_InternInPlace: unicode strings only please!");
12621 /* If it's a subclass, we don't really know what putting
12622 it in the interned dict might do. */
12623 if (!PyUnicode_CheckExact(s))
12624 return;
12625 if (PyUnicode_CHECK_INTERNED(s))
12626 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 if (PyUnicode_READY(s) == -1) {
12628 assert(0 && "ready fail in intern...");
12629 return;
12630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012631 if (interned == NULL) {
12632 interned = PyDict_New();
12633 if (interned == NULL) {
12634 PyErr_Clear(); /* Don't leave an exception */
12635 return;
12636 }
12637 }
12638 /* It might be that the GetItem call fails even
12639 though the key is present in the dictionary,
12640 namely when this happens during a stack overflow. */
12641 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012643 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012644
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 if (t) {
12646 Py_INCREF(t);
12647 Py_DECREF(*p);
12648 *p = t;
12649 return;
12650 }
Walter Dörwald16807132007-05-25 13:52:07 +000012651
Benjamin Peterson14339b62009-01-31 16:36:08 +000012652 PyThreadState_GET()->recursion_critical = 1;
12653 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12654 PyErr_Clear();
12655 PyThreadState_GET()->recursion_critical = 0;
12656 return;
12657 }
12658 PyThreadState_GET()->recursion_critical = 0;
12659 /* The two references in interned are not counted by refcnt.
12660 The deallocator will take care of this */
12661 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012663}
12664
12665void
12666PyUnicode_InternImmortal(PyObject **p)
12667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12669
Benjamin Peterson14339b62009-01-31 16:36:08 +000012670 PyUnicode_InternInPlace(p);
12671 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012673 Py_INCREF(*p);
12674 }
Walter Dörwald16807132007-05-25 13:52:07 +000012675}
12676
12677PyObject *
12678PyUnicode_InternFromString(const char *cp)
12679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 PyObject *s = PyUnicode_FromString(cp);
12681 if (s == NULL)
12682 return NULL;
12683 PyUnicode_InternInPlace(&s);
12684 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012685}
12686
Alexander Belopolsky40018472011-02-26 01:02:56 +000012687void
12688_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012689{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012690 PyObject *keys;
12691 PyUnicodeObject *s;
12692 Py_ssize_t i, n;
12693 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012694
Benjamin Peterson14339b62009-01-31 16:36:08 +000012695 if (interned == NULL || !PyDict_Check(interned))
12696 return;
12697 keys = PyDict_Keys(interned);
12698 if (keys == NULL || !PyList_Check(keys)) {
12699 PyErr_Clear();
12700 return;
12701 }
Walter Dörwald16807132007-05-25 13:52:07 +000012702
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12704 detector, interned unicode strings are not forcibly deallocated;
12705 rather, we give them their stolen references back, and then clear
12706 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012707
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 n = PyList_GET_SIZE(keys);
12709 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012710 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 for (i = 0; i < n; i++) {
12712 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 if (PyUnicode_READY(s) == -1)
12714 fprintf(stderr, "could not ready string\n");
12715 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012716 case SSTATE_NOT_INTERNED:
12717 /* XXX Shouldn't happen */
12718 break;
12719 case SSTATE_INTERNED_IMMORTAL:
12720 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012722 break;
12723 case SSTATE_INTERNED_MORTAL:
12724 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 break;
12727 default:
12728 Py_FatalError("Inconsistent interned string state.");
12729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 }
12732 fprintf(stderr, "total size of all interned strings: "
12733 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12734 "mortal/immortal\n", mortal_size, immortal_size);
12735 Py_DECREF(keys);
12736 PyDict_Clear(interned);
12737 Py_DECREF(interned);
12738 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012739}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012740
12741
12742/********************* Unicode Iterator **************************/
12743
12744typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 PyObject_HEAD
12746 Py_ssize_t it_index;
12747 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012748} unicodeiterobject;
12749
12750static void
12751unicodeiter_dealloc(unicodeiterobject *it)
12752{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012753 _PyObject_GC_UNTRACK(it);
12754 Py_XDECREF(it->it_seq);
12755 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012756}
12757
12758static int
12759unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12760{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 Py_VISIT(it->it_seq);
12762 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012763}
12764
12765static PyObject *
12766unicodeiter_next(unicodeiterobject *it)
12767{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012768 PyUnicodeObject *seq;
12769 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012770
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 assert(it != NULL);
12772 seq = it->it_seq;
12773 if (seq == NULL)
12774 return NULL;
12775 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12778 int kind = PyUnicode_KIND(seq);
12779 void *data = PyUnicode_DATA(seq);
12780 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12781 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012782 if (item != NULL)
12783 ++it->it_index;
12784 return item;
12785 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012786
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 Py_DECREF(seq);
12788 it->it_seq = NULL;
12789 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012790}
12791
12792static PyObject *
12793unicodeiter_len(unicodeiterobject *it)
12794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 Py_ssize_t len = 0;
12796 if (it->it_seq)
12797 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12798 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012799}
12800
12801PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12802
12803static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012807};
12808
12809PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12811 "str_iterator", /* tp_name */
12812 sizeof(unicodeiterobject), /* tp_basicsize */
12813 0, /* tp_itemsize */
12814 /* methods */
12815 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12816 0, /* tp_print */
12817 0, /* tp_getattr */
12818 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012819 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 0, /* tp_repr */
12821 0, /* tp_as_number */
12822 0, /* tp_as_sequence */
12823 0, /* tp_as_mapping */
12824 0, /* tp_hash */
12825 0, /* tp_call */
12826 0, /* tp_str */
12827 PyObject_GenericGetAttr, /* tp_getattro */
12828 0, /* tp_setattro */
12829 0, /* tp_as_buffer */
12830 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12831 0, /* tp_doc */
12832 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12833 0, /* tp_clear */
12834 0, /* tp_richcompare */
12835 0, /* tp_weaklistoffset */
12836 PyObject_SelfIter, /* tp_iter */
12837 (iternextfunc)unicodeiter_next, /* tp_iternext */
12838 unicodeiter_methods, /* tp_methods */
12839 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012840};
12841
12842static PyObject *
12843unicode_iter(PyObject *seq)
12844{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012846
Benjamin Peterson14339b62009-01-31 16:36:08 +000012847 if (!PyUnicode_Check(seq)) {
12848 PyErr_BadInternalCall();
12849 return NULL;
12850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 if (PyUnicode_READY(seq) == -1)
12852 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12854 if (it == NULL)
12855 return NULL;
12856 it->it_index = 0;
12857 Py_INCREF(seq);
12858 it->it_seq = (PyUnicodeObject *)seq;
12859 _PyObject_GC_TRACK(it);
12860 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012861}
12862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863#define UNIOP(x) Py_UNICODE_##x
12864#define UNIOP_t Py_UNICODE
12865#include "uniops.h"
12866#undef UNIOP
12867#undef UNIOP_t
12868#define UNIOP(x) Py_UCS4_##x
12869#define UNIOP_t Py_UCS4
12870#include "uniops.h"
12871#undef UNIOP
12872#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012873
Victor Stinner71133ff2010-09-01 23:43:53 +000012874Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012875PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012876{
12877 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12878 Py_UNICODE *copy;
12879 Py_ssize_t size;
12880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 if (!PyUnicode_Check(unicode)) {
12882 PyErr_BadArgument();
12883 return NULL;
12884 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012885 /* Ensure we won't overflow the size. */
12886 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12887 PyErr_NoMemory();
12888 return NULL;
12889 }
12890 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12891 size *= sizeof(Py_UNICODE);
12892 copy = PyMem_Malloc(size);
12893 if (copy == NULL) {
12894 PyErr_NoMemory();
12895 return NULL;
12896 }
12897 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12898 return copy;
12899}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012900
Georg Brandl66c221e2010-10-14 07:04:07 +000012901/* A _string module, to export formatter_parser and formatter_field_name_split
12902 to the string.Formatter class implemented in Python. */
12903
12904static PyMethodDef _string_methods[] = {
12905 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12906 METH_O, PyDoc_STR("split the argument as a field name")},
12907 {"formatter_parser", (PyCFunction) formatter_parser,
12908 METH_O, PyDoc_STR("parse the argument as a format string")},
12909 {NULL, NULL}
12910};
12911
12912static struct PyModuleDef _string_module = {
12913 PyModuleDef_HEAD_INIT,
12914 "_string",
12915 PyDoc_STR("string helper module"),
12916 0,
12917 _string_methods,
12918 NULL,
12919 NULL,
12920 NULL,
12921 NULL
12922};
12923
12924PyMODINIT_FUNC
12925PyInit__string(void)
12926{
12927 return PyModule_Create(&_string_module);
12928}
12929
12930
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012931#ifdef __cplusplus
12932}
12933#endif