blob: fe0d4a5b643ac48eb333587e1852a3523b758847 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Element-wise copy between character buffers of different widths.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and must be of type "from_type *".  to points to
   the destination buffer, which is accessed as "to_type *"; it must have
   room for (end - begin) elements.  Each source element is converted with
   a plain cast to to_type.  The end argument is re-evaluated on every
   iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        const from_type *src_;                                       \
        to_type *dst_;                                               \
        src_ = (begin);                                              \
        dst_ = (to_type *)(to);                                      \
        while (src_ < (end)) {                                       \
            *dst_ = (to_type)*src_;                                  \
            ++src_;                                                  \
            ++dst_;                                                  \
        }                                                            \
    } while (0)
107
/* Internal field accessors.  Each macro casts op to the struct that holds
   the field and expands to an lvalue/expression on that field; apart from
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH no type check is performed. */

/* UTF-8 buffer: for compact ASCII objects this is the memory immediately
   following the PyASCIIObject header, otherwise the utf8 field of
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 length: the character length for compact ASCII objects, otherwise
   the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
/* Note: wstr_length lives in PyCompactUnicodeObject, not PyASCIIObject. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Checked variants: assert that op is a unicode object before reading. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters.
   Lookup table with one entry per ASCII code point (128 entries); an
   entry is 1 for the characters named in the comments below, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
179
/* Forward declarations of the encoder error helpers.  Their definitions
   are not part of this section of the file. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Fast detection of line break characters in the ASCII range.
   Lookup table with one entry per ASCII code point (128 entries); an
   entry is 1 for the characters named in the comments below, 0 otherwise.
   Indexed by BLOOM_LINEBREAK() for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask of BLOOM_WIDTH bits
   stored in one unsigned long; the low bits of each character, i.e.
   (ch & (BLOOM_WIDTH - 1)), select the bit index.  A set bit means the
   character *may* be a member; a clear bit means it definitely is not. */

/* the linebreak mask is set up by Unicode_Init below */

/* Use the widest power-of-two mask that fits into an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Set / test the bit for ch in mask.  The mask argument is parenthesized
   so that any lvalue expression can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a table lookup for ASCII, otherwise the bloom mask is
   consulted first and only possible hits pay for the full database check.
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* Membership test of chr in the unicode object str: the bloom mask acts
   as a cheap pre-filter, and only possible hits are confirmed with a real
   forward search over the whole string.  chr and str are each evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))

/* --- Unicode Object ----------------------------------------------------- */

/* Forward declarations; the definitions are not part of this section of
   the file. */
static PyObject *
substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);

static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
375/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000376 Ux0000 terminated; some code (e.g. new_identifier)
377 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
382*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
/* Number of objects allocated through PyUnicode_New() (debug builds). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout-related pointers and flags of a unicode object to
   stdout, then return its PyUnicode_DATA() pointer. */
void *_PyUnicode_data(void *unicode){
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t sequence [begin, end) into the UCS4 buffer of
   unicode, combining each valid surrogate pair into a single code point.
   Lone surrogates are copied through unchanged.  The other conversions
   are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character.  Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (*src >= 0xD800 && *src <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* High surrogate followed by a low surrogate. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
629 assert(PyUnicode_Check(from));
630 assert(PyUnicode_Check(to));
631
632 if (PyUnicode_READY(from))
633 return -1;
634 if (PyUnicode_READY(to))
635 return -1;
636
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200637 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200638 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
639 PyErr_Format(PyExc_ValueError,
640 "Cannot write %zi characters at %zi "
641 "in a string of %zi characters",
642 how_many, to_start, PyUnicode_GET_LENGTH(to));
643 return -1;
644 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200645 if (how_many == 0)
646 return 0;
647
648 if (Py_REFCNT(to) != 1) {
649 PyErr_SetString(PyExc_ValueError,
650 "Cannot modify a string having more than 1 reference");
651 return -1;
652 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200653 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200656 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659
660 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200661 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200662 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(from_kind, from_start),
666 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200667 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200668 else if (from_kind == PyUnicode_1BYTE_KIND
669 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200670 {
671 _PyUnicode_CONVERT_BYTES(
672 Py_UCS1, Py_UCS2,
673 PyUnicode_1BYTE_DATA(from) + from_start,
674 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
675 PyUnicode_2BYTE_DATA(to) + to_start
676 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200677 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200678 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 && to_kind == PyUnicode_4BYTE_KIND)
680 {
681 _PyUnicode_CONVERT_BYTES(
682 Py_UCS1, Py_UCS4,
683 PyUnicode_1BYTE_DATA(from) + from_start,
684 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
685 PyUnicode_4BYTE_DATA(to) + to_start
686 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200687 }
688 else if (from_kind == PyUnicode_2BYTE_KIND
689 && to_kind == PyUnicode_4BYTE_KIND)
690 {
691 _PyUnicode_CONVERT_BYTES(
692 Py_UCS2, Py_UCS4,
693 PyUnicode_2BYTE_DATA(from) + from_start,
694 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
695 PyUnicode_4BYTE_DATA(to) + to_start
696 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200697 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200698 else {
699 int invalid_kinds;
700 if (from_kind > to_kind) {
701 /* slow path to check for character overflow */
702 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
703 Py_UCS4 ch, maxchar;
704 Py_ssize_t i;
705
706 maxchar = 0;
707 invalid_kinds = 0;
708 for (i=0; i < how_many; i++) {
709 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
710 if (ch > maxchar) {
711 maxchar = ch;
712 if (maxchar > to_maxchar) {
713 invalid_kinds = 1;
714 break;
715 }
716 }
717 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
718 }
719 }
720 else
721 invalid_kinds = 1;
722 if (invalid_kinds) {
723 PyErr_Format(PyExc_ValueError,
724 "Cannot copy UCS%u characters "
725 "into a string of UCS%u characters",
726 1 << (from_kind - 1),
727 1 << (to_kind -1));
728 return -1;
729 }
730 }
731 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732}
733
Victor Stinner17222162011-09-28 22:15:37 +0200734/* Find the maximum code point and count the number of surrogate pairs so a
735 correct string length can be computed before converting a string to UCS4.
736 This function counts single surrogates as a character and not as a pair.
737
738 Return 0 on success, or -1 on error. */
739static int
740find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
741 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742{
743 const wchar_t *iter;
744
745 if (num_surrogates == NULL || maxchar == NULL) {
746 PyErr_SetString(PyExc_SystemError,
747 "unexpected NULL arguments to "
748 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
749 return -1;
750 }
751
752 *num_surrogates = 0;
753 *maxchar = 0;
754
755 for (iter = begin; iter < end; ) {
756 if (*iter > *maxchar)
757 *maxchar = *iter;
758#if SIZEOF_WCHAR_T == 2
759 if (*iter >= 0xD800 && *iter <= 0xDBFF
760 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
761 {
762 Py_UCS4 surrogate_val;
763 surrogate_val = (((iter[0] & 0x3FF)<<10)
764 | (iter[1] & 0x3FF)) + 0x10000;
765 ++(*num_surrogates);
766 if (surrogate_val > *maxchar)
767 *maxchar = surrogate_val;
768 iter += 2;
769 }
770 else
771 iter++;
772#else
773 iter++;
774#endif
775 }
776 return 0;
777}
778
779#ifdef Py_DEBUG
780int unicode_ready_calls = 0;
781#endif
782
783int
784_PyUnicode_Ready(PyUnicodeObject *unicode)
785{
786 wchar_t *end;
787 Py_UCS4 maxchar = 0;
788 Py_ssize_t num_surrogates;
789#if SIZEOF_WCHAR_T == 2
790 Py_ssize_t length_wo_surrogates;
791#endif
792
793 assert(PyUnicode_Check(unicode));
794
795 if (unicode->data.any != NULL) {
796 assert(PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
797 return 0;
798 }
799
800 /* _PyUnicode_Ready() is only intented for old-style API usage where
801 * strings were created using _PyObject_New() and where no canonical
802 * representation (the str field) has been set yet aka strings
803 * which are not yet ready.
804 */
805 assert(_PyUnicode_WSTR(unicode) != NULL);
806 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
807 assert(!PyUnicode_IS_COMPACT(unicode));
808 assert(!PyUnicode_IS_READY(unicode));
809 /* Actually, it should neither be interned nor be anything else: */
810 assert(_PyUnicode_STATE(unicode).interned == 0);
811 assert(unicode->_base.utf8 == NULL);
812
813#ifdef Py_DEBUG
814 ++unicode_ready_calls;
815#endif
816
817 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200818 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819 &maxchar,
820 &num_surrogates) == -1) {
821 assert(0 && "PyUnicode_FindMaxCharAndNumSurrogatePairs failed");
822 return -1;
823 }
824
825 if (maxchar < 256) {
826 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
827 if (!unicode->data.any) {
828 PyErr_NoMemory();
829 return -1;
830 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200831 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 _PyUnicode_WSTR(unicode), end,
833 PyUnicode_1BYTE_DATA(unicode));
834 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
835 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
836 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
837 if (maxchar < 128) {
838 unicode->_base.utf8 = unicode->data.any;
839 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
840 }
841 else {
842 unicode->_base.utf8 = NULL;
843 unicode->_base.utf8_length = 0;
844 }
845 PyObject_FREE(_PyUnicode_WSTR(unicode));
846 _PyUnicode_WSTR(unicode) = NULL;
847 _PyUnicode_WSTR_LENGTH(unicode) = 0;
848 }
849 /* In this case we might have to convert down from 4-byte native
850 wchar_t to 2-byte unicode. */
851 else if (maxchar < 65536) {
852 assert(num_surrogates == 0 &&
853 "FindMaxCharAndNumSurrogatePairs() messed up");
854
Victor Stinner506f5922011-09-28 22:34:18 +0200855#if SIZEOF_WCHAR_T == 2
856 /* We can share representations and are done. */
857 unicode->data.any = _PyUnicode_WSTR(unicode);
858 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
859 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
860 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
861 unicode->_base.utf8 = NULL;
862 unicode->_base.utf8_length = 0;
863#else
864 /* sizeof(wchar_t) == 4 */
865 unicode->data.any = PyObject_MALLOC(
866 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
867 if (!unicode->data.any) {
868 PyErr_NoMemory();
869 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 }
Victor Stinner506f5922011-09-28 22:34:18 +0200871 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
872 _PyUnicode_WSTR(unicode), end,
873 PyUnicode_2BYTE_DATA(unicode));
874 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
875 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
876 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
877 unicode->_base.utf8 = NULL;
878 unicode->_base.utf8_length = 0;
879 PyObject_FREE(_PyUnicode_WSTR(unicode));
880 _PyUnicode_WSTR(unicode) = NULL;
881 _PyUnicode_WSTR_LENGTH(unicode) = 0;
882#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
885 else {
886#if SIZEOF_WCHAR_T == 2
887 /* in case the native representation is 2-bytes, we need to allocate a
888 new normalized 4-byte version. */
889 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
890 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
891 if (!unicode->data.any) {
892 PyErr_NoMemory();
893 return -1;
894 }
895 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
896 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
897 unicode->_base.utf8 = NULL;
898 unicode->_base.utf8_length = 0;
899 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
900 unicode) < 0) {
901 assert(0 && "ConvertWideCharToUCS4 failed");
902 return -1;
903 }
904 PyObject_FREE(_PyUnicode_WSTR(unicode));
905 _PyUnicode_WSTR(unicode) = NULL;
906 _PyUnicode_WSTR_LENGTH(unicode) = 0;
907#else
908 assert(num_surrogates == 0);
909
910 unicode->data.any = _PyUnicode_WSTR(unicode);
911 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
912 unicode->_base.utf8 = NULL;
913 unicode->_base.utf8_length = 0;
914 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
915#endif
916 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
917 }
918 _PyUnicode_STATE(unicode).ready = 1;
919 return 0;
920}
921
Alexander Belopolsky40018472011-02-26 01:02:56 +0000922static void
923unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000924{
Walter Dörwald16807132007-05-25 13:52:07 +0000925 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000926 case SSTATE_NOT_INTERNED:
927 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000928
Benjamin Peterson29060642009-01-31 22:14:21 +0000929 case SSTATE_INTERNED_MORTAL:
930 /* revive dead object temporarily for DelItem */
931 Py_REFCNT(unicode) = 3;
932 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
933 Py_FatalError(
934 "deletion of interned string failed");
935 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000936
Benjamin Peterson29060642009-01-31 22:14:21 +0000937 case SSTATE_INTERNED_IMMORTAL:
938 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000939
Benjamin Peterson29060642009-01-31 22:14:21 +0000940 default:
941 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000942 }
943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200944 if (_PyUnicode_WSTR(unicode) &&
945 (!PyUnicode_IS_READY(unicode) ||
946 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
947 PyObject_DEL(_PyUnicode_WSTR(unicode));
948 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
949 PyObject_DEL(unicode->_base.utf8);
950
951 if (PyUnicode_IS_COMPACT(unicode)) {
952 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000953 }
954 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 if (unicode->data.any)
956 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000957 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000958 }
959}
960
Alexander Belopolsky40018472011-02-26 01:02:56 +0000961static int
962_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000963{
964 register PyUnicodeObject *v;
965
966 /* Argument checks */
967 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000971 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
973 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000974 PyErr_BadInternalCall();
975 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000976 }
977
978 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 possible since these are being shared.
980 The same goes for new-representation unicode objects or objects which
981 have already been readied.
982 For these, we simply return a fresh copy with the same Unicode content.
983 */
984 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
985 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
986 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000987 PyUnicodeObject *w = _PyUnicode_New(length);
988 if (w == NULL)
989 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
991 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000992 Py_DECREF(*unicode);
993 *unicode = w;
994 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000995 }
996
997 /* Note that we don't have to modify *unicode for unshared Unicode
998 objects, since we can modify them in-place. */
999 return unicode_resize(v, length);
1000}
1001
Alexander Belopolsky40018472011-02-26 01:02:56 +00001002int
1003PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001004{
1005 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1006}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001008static PyObject*
1009get_latin1_char(unsigned char ch)
1010{
1011 PyUnicodeObject *unicode = unicode_latin1[ch];
1012 if (!unicode) {
1013 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1014 if (!unicode)
1015 return NULL;
1016 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1017 unicode_latin1[ch] = unicode;
1018 }
1019 Py_INCREF(unicode);
1020 return (PyObject *)unicode;
1021}
1022
Alexander Belopolsky40018472011-02-26 01:02:56 +00001023PyObject *
1024PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025{
1026 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 Py_UCS4 maxchar = 0;
1028 Py_ssize_t num_surrogates;
1029
1030 if (u == NULL)
1031 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001033 /* If the Unicode data is known at construction time, we can apply
1034 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Optimization for empty strings */
1037 if (size == 0 && unicode_empty != NULL) {
1038 Py_INCREF(unicode_empty);
1039 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001040 }
Tim Petersced69f82003-09-16 20:30:58 +00001041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 /* Single character Unicode objects in the Latin-1 range are
1043 shared when using this constructor */
1044 if (size == 1 && *u < 256)
1045 return get_latin1_char((unsigned char)*u);
1046
1047 /* If not empty and not single character, copy the Unicode data
1048 into the new object */
Victor Stinner17222162011-09-28 22:15:37 +02001049 if (find_maxchar_surrogates(u, u + size, &maxchar,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 &num_surrogates) == -1)
1051 return NULL;
1052
1053 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1054 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (!unicode)
1056 return NULL;
1057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 switch (PyUnicode_KIND(unicode)) {
1059 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001060 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1062 break;
1063 case PyUnicode_2BYTE_KIND:
1064#if Py_UNICODE_SIZE == 2
1065 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1066#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001067 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1069#endif
1070 break;
1071 case PyUnicode_4BYTE_KIND:
1072#if SIZEOF_WCHAR_T == 2
1073 /* This is the only case which has to process surrogates, thus
1074 a simple copy loop is not enough and we need a function. */
1075 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1076 Py_DECREF(unicode);
1077 return NULL;
1078 }
1079#else
1080 assert(num_surrogates == 0);
1081 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1082#endif
1083 break;
1084 default:
1085 assert(0 && "Impossible state");
1086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087
1088 return (PyObject *)unicode;
1089}
1090
Alexander Belopolsky40018472011-02-26 01:02:56 +00001091PyObject *
1092PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001093{
1094 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 if (size < 0) {
1097 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 return NULL;
1100 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001101
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001102 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001103 some optimizations which share commonly used objects.
1104 Also, this means the input must be UTF-8, so fall back to the
1105 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001106 if (u != NULL) {
1107
Benjamin Peterson29060642009-01-31 22:14:21 +00001108 /* Optimization for empty strings */
1109 if (size == 0 && unicode_empty != NULL) {
1110 Py_INCREF(unicode_empty);
1111 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001113
1114 /* Single characters are shared when using this constructor.
1115 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 if (size == 1 && Py_CHARMASK(*u) < 128)
1117 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001118
1119 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 }
1121
Walter Dörwald55507312007-05-18 13:12:10 +00001122 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001123 if (!unicode)
1124 return NULL;
1125
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001126 return (PyObject *)unicode;
1127}
1128
Alexander Belopolsky40018472011-02-26 01:02:56 +00001129PyObject *
1130PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001131{
1132 size_t size = strlen(u);
1133 if (size > PY_SSIZE_T_MAX) {
1134 PyErr_SetString(PyExc_OverflowError, "input too long");
1135 return NULL;
1136 }
1137
1138 return PyUnicode_FromStringAndSize(u, size);
1139}
1140
Victor Stinnere57b1c02011-09-28 22:20:48 +02001141static PyObject*
1142_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 PyObject *res;
1145 unsigned char max = 127;
1146 Py_ssize_t i;
1147 for (i = 0; i < size; i++) {
1148 if (u[i] & 0x80) {
1149 max = 255;
1150 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001151 }
1152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 res = PyUnicode_New(size, max);
1154 if (!res)
1155 return NULL;
1156 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1157 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001158}
1159
Victor Stinnere57b1c02011-09-28 22:20:48 +02001160static PyObject*
1161_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162{
1163 PyObject *res;
1164 Py_UCS2 max = 0;
1165 Py_ssize_t i;
1166 for (i = 0; i < size; i++)
1167 if (u[i] > max)
1168 max = u[i];
1169 res = PyUnicode_New(size, max);
1170 if (!res)
1171 return NULL;
1172 if (max >= 256)
1173 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1174 else
1175 for (i = 0; i < size; i++)
1176 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1177 return res;
1178}
1179
Victor Stinnere57b1c02011-09-28 22:20:48 +02001180static PyObject*
1181_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182{
1183 PyObject *res;
1184 Py_UCS4 max = 0;
1185 Py_ssize_t i;
1186 for (i = 0; i < size; i++)
1187 if (u[i] > max)
1188 max = u[i];
1189 res = PyUnicode_New(size, max);
1190 if (!res)
1191 return NULL;
1192 if (max >= 0x10000)
1193 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1194 else {
1195 int kind = PyUnicode_KIND(res);
1196 void *data = PyUnicode_DATA(res);
1197 for (i = 0; i < size; i++)
1198 PyUnicode_WRITE(kind, data, i, u[i]);
1199 }
1200 return res;
1201}
1202
1203PyObject*
1204PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1205{
1206 switch(kind) {
1207 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001208 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001210 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001212 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 }
1214 assert(0);
1215 return NULL;
1216}
1217
1218
1219/* Widen Unicode objects to larger buffers.
1220 Return NULL if the string is too wide already. */
1221
1222void*
1223_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1224{
1225 Py_ssize_t i;
1226 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1227 void *d = PyUnicode_DATA(s);
1228 unsigned int skind = PyUnicode_KIND(s);
1229 if (PyUnicode_KIND(s) >= kind) {
1230 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1231 return NULL;
1232 }
1233 switch(kind) {
1234 case PyUnicode_2BYTE_KIND: {
1235 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1236 if (!result) {
1237 PyErr_NoMemory();
1238 return 0;
1239 }
1240 for (i = 0; i < len; i++)
1241 result[i] = ((Py_UCS1*)d)[i];
1242 return result;
1243 }
1244 case PyUnicode_4BYTE_KIND: {
1245 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1246 if (!result) {
1247 PyErr_NoMemory();
1248 return 0;
1249 }
1250 for (i = 0; i < len; i++)
1251 result[i] = PyUnicode_READ(skind, d, i);
1252 return result;
1253 }
1254 }
1255 Py_FatalError("invalid kind");
1256 return NULL;
1257}
1258
1259static Py_UCS4*
1260as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1261 int copy_null)
1262{
1263 int kind;
1264 void *data;
1265 Py_ssize_t len, targetlen;
1266 if (PyUnicode_READY(string) == -1)
1267 return NULL;
1268 kind = PyUnicode_KIND(string);
1269 data = PyUnicode_DATA(string);
1270 len = PyUnicode_GET_LENGTH(string);
1271 targetlen = len;
1272 if (copy_null)
1273 targetlen++;
1274 if (!target) {
1275 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1276 PyErr_NoMemory();
1277 return NULL;
1278 }
1279 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1280 if (!target) {
1281 PyErr_NoMemory();
1282 return NULL;
1283 }
1284 }
1285 else {
1286 if (targetsize < targetlen) {
1287 PyErr_Format(PyExc_SystemError,
1288 "string is longer than the buffer");
1289 if (copy_null && 0 < targetsize)
1290 target[0] = 0;
1291 return NULL;
1292 }
1293 }
1294 if (kind != PyUnicode_4BYTE_KIND) {
1295 Py_ssize_t i;
1296 for (i = 0; i < len; i++)
1297 target[i] = PyUnicode_READ(kind, data, i);
1298 }
1299 else
1300 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1301 if (copy_null)
1302 target[len] = 0;
1303 return target;
1304}
1305
1306Py_UCS4*
1307PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1308 int copy_null)
1309{
1310 if (target == NULL || targetsize < 1) {
1311 PyErr_BadInternalCall();
1312 return NULL;
1313 }
1314 return as_ucs4(string, target, targetsize, copy_null);
1315}
1316
1317Py_UCS4*
1318PyUnicode_AsUCS4Copy(PyObject *string)
1319{
1320 return as_ucs4(string, NULL, 0, 1);
1321}
1322
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w` of `size` units.

   A size of -1 means "measure the buffer with wcslen()".  A NULL buffer is
   only accepted together with size 0, which yields PyUnicode_New(0, 0);
   any other size with a NULL buffer raises via PyErr_BadInternalCall().

   Return a new reference, or NULL on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = (size == -1) ? (Py_ssize_t)wcslen(w) : size;

        return PyUnicode_FromUnicode(w, nchars);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001343
Walter Dörwald346737f2007-05-31 10:44:43 +00001344static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001345makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1346 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001347{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001348 *fmt++ = '%';
1349 if (width) {
1350 if (zeropad)
1351 *fmt++ = '0';
1352 fmt += sprintf(fmt, "%d", width);
1353 }
1354 if (precision)
1355 fmt += sprintf(fmt, ".%d", precision);
1356 if (longflag)
1357 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001358 else if (longlongflag) {
1359 /* longlongflag should only ever be nonzero on machines with
1360 HAVE_LONG_LONG defined */
1361#ifdef HAVE_LONG_LONG
1362 char *f = PY_FORMAT_LONG_LONG;
1363 while (*f)
1364 *fmt++ = *f++;
1365#else
1366 /* we shouldn't ever get here */
1367 assert(0);
1368 *fmt++ = 'l';
1369#endif
1370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001371 else if (size_tflag) {
1372 char *f = PY_FORMAT_SIZE_T;
1373 while (*f)
1374 *fmt++ = *f++;
1375 }
1376 *fmt++ = c;
1377 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001378}
1379
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows '%'.

   `f` points at the '%'.  The optional width, optional ".precision" and an
   optional length modifier ('l', 'll' when HAVE_LONG_LONG is defined, or
   'z', each only when followed by 'd', 'u' or 'i') are consumed.  Every
   output pointer may be NULL, in which case that value is not stored.

   Return a pointer to the character the caller should examine next:
   normally the conversion character.  For the bogus forms handled below
   the pointer is moved one character back instead. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0, precision = 0;
    int longflag = 0, longlongflag = 0, size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1444
/* Upper bound on the characters produced by %ld: 21 is enough for a 64-bit
   integer written in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.  At most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (53*SIZEOF_LONG_LONG - 1) / 22)
1452
Walter Dörwaldd2034312007-05-18 16:29:38 +00001453PyObject *
1454PyUnicode_FromFormatV(const char *format, va_list vargs)
1455{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001456 va_list count;
1457 Py_ssize_t callcount = 0;
1458 PyObject **callresults = NULL;
1459 PyObject **callresult = NULL;
1460 Py_ssize_t n = 0;
1461 int width = 0;
1462 int precision = 0;
1463 int zeropad;
1464 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001466 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001467 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1469 Py_UCS4 argmaxchar;
1470 Py_ssize_t numbersize = 0;
1471 char *numberresults = NULL;
1472 char *numberresult = NULL;
1473 Py_ssize_t i;
1474 int kind;
1475 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001476
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001477 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001478 /* step 1: count the number of %S/%R/%A/%s format specifications
1479 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1480 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 * result in an array)
     * also estimate an upper bound for all the number formats in the string;
     * numbers will be formatted in step 3 and be kept in a '\0'-separated
     * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001485 for (f = format; *f; f++) {
1486 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001487 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1489 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1490 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1491 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001494#ifdef HAVE_LONG_LONG
1495 if (longlongflag) {
1496 if (width < MAX_LONG_LONG_CHARS)
1497 width = MAX_LONG_LONG_CHARS;
1498 }
1499 else
1500#endif
1501 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1502 including sign. Decimal takes the most space. This
1503 isn't enough for octal. If a width is specified we
1504 need more (which we allocate later). */
1505 if (width < MAX_LONG_CHARS)
1506 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507
1508 /* account for the size + '\0' to separate numbers
1509 inside of the numberresults buffer */
1510 numbersize += (width + 1);
1511 }
1512 }
1513 else if ((unsigned char)*f > 127) {
1514 PyErr_Format(PyExc_ValueError,
1515 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1516 "string, got a non-ASCII byte: 0x%02x",
1517 (unsigned char)*f);
1518 return NULL;
1519 }
1520 }
1521 /* step 2: allocate memory for the results of
1522 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1523 if (callcount) {
1524 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1525 if (!callresults) {
1526 PyErr_NoMemory();
1527 return NULL;
1528 }
1529 callresult = callresults;
1530 }
1531 /* step 2.5: allocate memory for the results of formating numbers */
1532 if (numbersize) {
1533 numberresults = PyObject_Malloc(numbersize);
1534 if (!numberresults) {
1535 PyErr_NoMemory();
1536 goto fail;
1537 }
1538 numberresult = numberresults;
1539 }
1540
1541 /* step 3: format numbers and figure out how large a buffer we need */
1542 for (f = format; *f; f++) {
1543 if (*f == '%') {
1544 const char* p;
1545 int longflag;
1546 int longlongflag;
1547 int size_tflag;
1548 int numprinted;
1549
1550 p = f;
1551 zeropad = (f[1] == '0');
1552 f = parse_format_flags(f, &width, &precision,
1553 &longflag, &longlongflag, &size_tflag);
1554 switch (*f) {
1555 case 'c':
1556 {
1557 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001558 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 n++;
1560 break;
1561 }
1562 case '%':
1563 n++;
1564 break;
1565 case 'i':
1566 case 'd':
1567 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1568 width, precision, *f);
1569 if (longflag)
1570 numprinted = sprintf(numberresult, fmt,
1571 va_arg(count, long));
1572#ifdef HAVE_LONG_LONG
1573 else if (longlongflag)
1574 numprinted = sprintf(numberresult, fmt,
1575 va_arg(count, PY_LONG_LONG));
1576#endif
1577 else if (size_tflag)
1578 numprinted = sprintf(numberresult, fmt,
1579 va_arg(count, Py_ssize_t));
1580 else
1581 numprinted = sprintf(numberresult, fmt,
1582 va_arg(count, int));
1583 n += numprinted;
1584 /* advance by +1 to skip over the '\0' */
1585 numberresult += (numprinted + 1);
1586 assert(*(numberresult - 1) == '\0');
1587 assert(*(numberresult - 2) != '\0');
1588 assert(numprinted >= 0);
1589 assert(numberresult <= numberresults + numbersize);
1590 break;
1591 case 'u':
1592 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1593 width, precision, 'u');
1594 if (longflag)
1595 numprinted = sprintf(numberresult, fmt,
1596 va_arg(count, unsigned long));
1597#ifdef HAVE_LONG_LONG
1598 else if (longlongflag)
1599 numprinted = sprintf(numberresult, fmt,
1600 va_arg(count, unsigned PY_LONG_LONG));
1601#endif
1602 else if (size_tflag)
1603 numprinted = sprintf(numberresult, fmt,
1604 va_arg(count, size_t));
1605 else
1606 numprinted = sprintf(numberresult, fmt,
1607 va_arg(count, unsigned int));
1608 n += numprinted;
1609 numberresult += (numprinted + 1);
1610 assert(*(numberresult - 1) == '\0');
1611 assert(*(numberresult - 2) != '\0');
1612 assert(numprinted >= 0);
1613 assert(numberresult <= numberresults + numbersize);
1614 break;
1615 case 'x':
1616 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1617 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1618 n += numprinted;
1619 numberresult += (numprinted + 1);
1620 assert(*(numberresult - 1) == '\0');
1621 assert(*(numberresult - 2) != '\0');
1622 assert(numprinted >= 0);
1623 assert(numberresult <= numberresults + numbersize);
1624 break;
1625 case 'p':
1626 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1627 /* %p is ill-defined: ensure leading 0x. */
1628 if (numberresult[1] == 'X')
1629 numberresult[1] = 'x';
1630 else if (numberresult[1] != 'x') {
1631 memmove(numberresult + 2, numberresult,
1632 strlen(numberresult) + 1);
1633 numberresult[0] = '0';
1634 numberresult[1] = 'x';
1635 numprinted += 2;
1636 }
1637 n += numprinted;
1638 numberresult += (numprinted + 1);
1639 assert(*(numberresult - 1) == '\0');
1640 assert(*(numberresult - 2) != '\0');
1641 assert(numprinted >= 0);
1642 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001643 break;
1644 case 's':
1645 {
1646 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001647 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001648 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1649 if (!str)
1650 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 /* since PyUnicode_DecodeUTF8 returns already flexible
1652 unicode objects, there is no need to call ready on them */
1653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001654 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001656 /* Remember the str and switch to the next slot */
1657 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001658 break;
1659 }
1660 case 'U':
1661 {
1662 PyObject *obj = va_arg(count, PyObject *);
1663 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001664 if (PyUnicode_READY(obj) == -1)
1665 goto fail;
1666 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001667 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001669 break;
1670 }
1671 case 'V':
1672 {
1673 PyObject *obj = va_arg(count, PyObject *);
1674 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001675 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001676 assert(obj || str);
1677 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001678 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 if (PyUnicode_READY(obj) == -1)
1680 goto fail;
1681 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001682 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001684 *callresult++ = NULL;
1685 }
1686 else {
1687 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1688 if (!str_obj)
1689 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001691 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001693 *callresult++ = str_obj;
1694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 break;
1696 }
1697 case 'S':
1698 {
1699 PyObject *obj = va_arg(count, PyObject *);
1700 PyObject *str;
1701 assert(obj);
1702 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001703 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001704 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001706 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001708 /* Remember the str and switch to the next slot */
1709 *callresult++ = str;
1710 break;
1711 }
1712 case 'R':
1713 {
1714 PyObject *obj = va_arg(count, PyObject *);
1715 PyObject *repr;
1716 assert(obj);
1717 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001719 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001721 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 /* Remember the repr and switch to the next slot */
1724 *callresult++ = repr;
1725 break;
1726 }
1727 case 'A':
1728 {
1729 PyObject *obj = va_arg(count, PyObject *);
1730 PyObject *ascii;
1731 assert(obj);
1732 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001736 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001738 /* Remember the repr and switch to the next slot */
1739 *callresult++ = ascii;
1740 break;
1741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 default:
1743 /* if we stumble upon an unknown
1744 formatting code, copy the rest of
1745 the format string to the output
1746 string. (we cannot just skip the
1747 code, since there's no way to know
1748 what's in the argument list) */
1749 n += strlen(p);
1750 goto expand;
1751 }
1752 } else
1753 n++;
1754 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001756 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001758 we don't have to resize the string.
1759 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 if (!string)
1762 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 kind = PyUnicode_KIND(string);
1764 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001765 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001770 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001771
1772 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1774 /* checking for == because the last argument could be a empty
1775 string, which causes i to point to end, the assert at the end of
1776 the loop */
1777 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001778
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 switch (*f) {
1780 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001781 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 const int ordinal = va_arg(vargs, int);
1783 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001784 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001785 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001786 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001788 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 case 'p':
1791 /* unused, since we already have the result */
1792 if (*f == 'p')
1793 (void) va_arg(vargs, void *);
1794 else
1795 (void) va_arg(vargs, int);
1796 /* extract the result from numberresults and append. */
1797 for (; *numberresult; ++i, ++numberresult)
1798 PyUnicode_WRITE(kind, data, i, *numberresult);
1799 /* skip over the separating '\0' */
1800 assert(*numberresult == '\0');
1801 numberresult++;
1802 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001803 break;
1804 case 's':
1805 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001806 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001808 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 size = PyUnicode_GET_LENGTH(*callresult);
1810 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001811 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1812 *callresult, 0,
1813 size) < 0)
1814 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001816 /* We're done with the unicode()/repr() => forget it */
1817 Py_DECREF(*callresult);
1818 /* switch to next unicode()/repr() result */
1819 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001820 break;
1821 }
1822 case 'U':
1823 {
1824 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 Py_ssize_t size;
1826 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1827 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001828 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1829 obj, 0,
1830 size) < 0)
1831 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001833 break;
1834 }
1835 case 'V':
1836 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001838 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001839 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001840 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 size = PyUnicode_GET_LENGTH(obj);
1842 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001843 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1844 obj, 0,
1845 size) < 0)
1846 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001848 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 size = PyUnicode_GET_LENGTH(*callresult);
1850 assert(PyUnicode_KIND(*callresult) <=
1851 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001852 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1853 *callresult,
1854 0, size) < 0)
1855 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001857 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001858 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001859 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001860 break;
1861 }
1862 case 'S':
1863 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001864 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 /* unused, since we already have the result */
1867 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001869 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1870 *callresult, 0,
1871 PyUnicode_GET_LENGTH(*callresult)) < 0)
1872 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 /* We're done with the unicode()/repr() => forget it */
1875 Py_DECREF(*callresult);
1876 /* switch to next unicode()/repr() result */
1877 ++callresult;
1878 break;
1879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001880 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001882 break;
1883 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 for (; *p; ++p, ++i)
1885 PyUnicode_WRITE(kind, data, i, *p);
1886 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001887 goto end;
1888 }
Victor Stinner1205f272010-09-11 00:54:47 +00001889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 else {
1891 assert(i < PyUnicode_GET_LENGTH(string));
1892 PyUnicode_WRITE(kind, data, i++, *f);
1893 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001896
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001898 if (callresults)
1899 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 if (numberresults)
1901 PyObject_Free(numberresults);
1902 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001903 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001904 if (callresults) {
1905 PyObject **callresult2 = callresults;
1906 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001907 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001908 ++callresult2;
1909 }
1910 PyObject_Free(callresults);
1911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 if (numberresults)
1913 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001914 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001915}
1916
Walter Dörwaldd2034312007-05-18 16:29:38 +00001917PyObject *
1918PyUnicode_FromFormat(const char *format, ...)
1919{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001920 PyObject* ret;
1921 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001922
1923#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001924 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001926 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001927#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 ret = PyUnicode_FromFormatV(format, vargs);
1929 va_end(vargs);
1930 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001931}
1932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001933#ifdef HAVE_WCHAR_H
1934
Victor Stinner5593d8a2010-10-02 11:11:27 +00001935/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1936 convert a Unicode object to a wide character string.
1937
Victor Stinnerd88d9832011-09-06 02:00:05 +02001938 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001939 character) required to convert the unicode object. Ignore size argument.
1940
Victor Stinnerd88d9832011-09-06 02:00:05 +02001941 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001942 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001943 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001944static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001945unicode_aswidechar(PyUnicodeObject *unicode,
1946 wchar_t *w,
1947 Py_ssize_t size)
1948{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001949 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950 const wchar_t *wstr;
1951
1952 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1953 if (wstr == NULL)
1954 return -1;
1955
Victor Stinner5593d8a2010-10-02 11:11:27 +00001956 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001957 if (size > res)
1958 size = res + 1;
1959 else
1960 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001962 return res;
1963 }
1964 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001966}
1967
1968Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001969PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001970 wchar_t *w,
1971 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972{
1973 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 PyErr_BadInternalCall();
1975 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001977 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978}
1979
Victor Stinner137c34c2010-09-29 10:25:54 +00001980wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001981PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001982 Py_ssize_t *size)
1983{
1984 wchar_t* buffer;
1985 Py_ssize_t buflen;
1986
1987 if (unicode == NULL) {
1988 PyErr_BadInternalCall();
1989 return NULL;
1990 }
1991
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001992 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (buflen == -1)
1994 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00001995 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001996 PyErr_NoMemory();
1997 return NULL;
1998 }
1999
Victor Stinner137c34c2010-09-29 10:25:54 +00002000 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2001 if (buffer == NULL) {
2002 PyErr_NoMemory();
2003 return NULL;
2004 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002005 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 if (buflen == -1)
2007 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002008 if (size != NULL)
2009 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002010 return buffer;
2011}
2012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014
Alexander Belopolsky40018472011-02-26 01:02:56 +00002015PyObject *
2016PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002019 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002020 PyErr_SetString(PyExc_ValueError,
2021 "chr() arg not in range(0x110000)");
2022 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002023 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (ordinal < 256)
2026 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 v = PyUnicode_New(1, ordinal);
2029 if (v == NULL)
2030 return NULL;
2031 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2032 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002033}
2034
Alexander Belopolsky40018472011-02-26 01:02:56 +00002035PyObject *
2036PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002038 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002040 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 Py_INCREF(obj);
2042 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002043 }
2044 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 /* For a Unicode subtype that's not a Unicode object,
2046 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 if (PyUnicode_READY(obj) == -1)
2048 return NULL;
2049 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002050 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002051 PyErr_Format(PyExc_TypeError,
2052 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002053 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002054 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002055}
2056
Alexander Belopolsky40018472011-02-26 01:02:56 +00002057PyObject *
2058PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002059 const char *encoding,
2060 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002061{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002062 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002063 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002064
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002066 PyErr_BadInternalCall();
2067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002070 /* Decoding bytes objects is the most common case and should be fast */
2071 if (PyBytes_Check(obj)) {
2072 if (PyBytes_GET_SIZE(obj) == 0) {
2073 Py_INCREF(unicode_empty);
2074 v = (PyObject *) unicode_empty;
2075 }
2076 else {
2077 v = PyUnicode_Decode(
2078 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2079 encoding, errors);
2080 }
2081 return v;
2082 }
2083
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002084 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 PyErr_SetString(PyExc_TypeError,
2086 "decoding str is not supported");
2087 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002089
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002090 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2091 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2092 PyErr_Format(PyExc_TypeError,
2093 "coercing to str: need bytes, bytearray "
2094 "or buffer-like object, %.80s found",
2095 Py_TYPE(obj)->tp_name);
2096 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002097 }
Tim Petersced69f82003-09-16 20:30:58 +00002098
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002099 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002100 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002101 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 }
Tim Petersced69f82003-09-16 20:30:58 +00002103 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002104 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002105
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002106 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002107 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108}
2109
/* Write a normalized copy of an encoding name into lower: upper case
   letters become lower case and '_' becomes '-', so that spellings such
   as UTF_8 are recognized.  The copy is null-terminated.

   lower_len is the capacity of lower, terminator included.
   Returns 1 on success, 0 when encoding is longer than lower_len-1
   characters (in which case lower holds an unterminated prefix). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    size_t i;

    for (i = 0; encoding[i] != '\0'; i++) {
        char c = encoding[i];

        /* The last slot is reserved for the terminating null. */
        if (i == lower_len - 1)
            return 0;

        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        lower[i] = c;
    }
    lower[i] = '\0';
    return 1;
}
2142
Alexander Belopolsky40018472011-02-26 01:02:56 +00002143PyObject *
2144PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002145 Py_ssize_t size,
2146 const char *encoding,
2147 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002148{
2149 PyObject *buffer = NULL, *unicode;
2150 Py_buffer info;
2151 char lower[11]; /* Enough for any encoding shortcut */
2152
2153 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002154 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002155
2156 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002157 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002158 if ((strcmp(lower, "utf-8") == 0) ||
2159 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002160 return PyUnicode_DecodeUTF8(s, size, errors);
2161 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002162 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002163 (strcmp(lower, "iso-8859-1") == 0))
2164 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002165#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002166 else if (strcmp(lower, "mbcs") == 0)
2167 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002168#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002169 else if (strcmp(lower, "ascii") == 0)
2170 return PyUnicode_DecodeASCII(s, size, errors);
2171 else if (strcmp(lower, "utf-16") == 0)
2172 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2173 else if (strcmp(lower, "utf-32") == 0)
2174 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176
2177 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002178 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002179 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002180 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002181 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 if (buffer == NULL)
2183 goto onError;
2184 unicode = PyCodec_Decode(buffer, encoding, errors);
2185 if (unicode == NULL)
2186 goto onError;
2187 if (!PyUnicode_Check(unicode)) {
2188 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002189 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002190 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 Py_DECREF(unicode);
2192 goto onError;
2193 }
2194 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 if (PyUnicode_READY(unicode)) {
2196 Py_DECREF(unicode);
2197 return NULL;
2198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002200
Benjamin Peterson29060642009-01-31 22:14:21 +00002201 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202 Py_XDECREF(buffer);
2203 return NULL;
2204}
2205
Alexander Belopolsky40018472011-02-26 01:02:56 +00002206PyObject *
2207PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002208 const char *encoding,
2209 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002210{
2211 PyObject *v;
2212
2213 if (!PyUnicode_Check(unicode)) {
2214 PyErr_BadArgument();
2215 goto onError;
2216 }
2217
2218 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002219 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002220
2221 /* Decode via the codec registry */
2222 v = PyCodec_Decode(unicode, encoding, errors);
2223 if (v == NULL)
2224 goto onError;
2225 return v;
2226
Benjamin Peterson29060642009-01-31 22:14:21 +00002227 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002228 return NULL;
2229}
2230
Alexander Belopolsky40018472011-02-26 01:02:56 +00002231PyObject *
2232PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002233 const char *encoding,
2234 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002235{
2236 PyObject *v;
2237
2238 if (!PyUnicode_Check(unicode)) {
2239 PyErr_BadArgument();
2240 goto onError;
2241 }
2242
2243 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002244 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002245
2246 /* Decode via the codec registry */
2247 v = PyCodec_Decode(unicode, encoding, errors);
2248 if (v == NULL)
2249 goto onError;
2250 if (!PyUnicode_Check(v)) {
2251 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002252 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002253 Py_TYPE(v)->tp_name);
2254 Py_DECREF(v);
2255 goto onError;
2256 }
2257 return v;
2258
Benjamin Peterson29060642009-01-31 22:14:21 +00002259 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002260 return NULL;
2261}
2262
Alexander Belopolsky40018472011-02-26 01:02:56 +00002263PyObject *
2264PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002265 Py_ssize_t size,
2266 const char *encoding,
2267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268{
2269 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002270
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 unicode = PyUnicode_FromUnicode(s, size);
2272 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2275 Py_DECREF(unicode);
2276 return v;
2277}
2278
Alexander Belopolsky40018472011-02-26 01:02:56 +00002279PyObject *
2280PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002281 const char *encoding,
2282 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002283{
2284 PyObject *v;
2285
2286 if (!PyUnicode_Check(unicode)) {
2287 PyErr_BadArgument();
2288 goto onError;
2289 }
2290
2291 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002292 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002293
2294 /* Encode via the codec registry */
2295 v = PyCodec_Encode(unicode, encoding, errors);
2296 if (v == NULL)
2297 goto onError;
2298 return v;
2299
Benjamin Peterson29060642009-01-31 22:14:21 +00002300 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002301 return NULL;
2302}
2303
Victor Stinnerad158722010-10-27 00:25:46 +00002304PyObject *
2305PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002306{
Victor Stinner99b95382011-07-04 14:23:54 +02002307#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002308 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2309 PyUnicode_GET_SIZE(unicode),
2310 NULL);
2311#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002313#else
Victor Stinner793b5312011-04-27 00:24:21 +02002314 PyInterpreterState *interp = PyThreadState_GET()->interp;
2315 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2316 cannot use it to encode and decode filenames before it is loaded. Load
2317 the Python codec requires to encode at least its own filename. Use the C
2318 version of the locale codec until the codec registry is initialized and
2319 the Python codec is loaded.
2320
2321 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2322 cannot only rely on it: check also interp->fscodec_initialized for
2323 subinterpreters. */
2324 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002325 return PyUnicode_AsEncodedString(unicode,
2326 Py_FileSystemDefaultEncoding,
2327 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002328 }
2329 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002330 /* locale encoding with surrogateescape */
2331 wchar_t *wchar;
2332 char *bytes;
2333 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002334 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002335
2336 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2337 if (wchar == NULL)
2338 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002339 bytes = _Py_wchar2char(wchar, &error_pos);
2340 if (bytes == NULL) {
2341 if (error_pos != (size_t)-1) {
2342 char *errmsg = strerror(errno);
2343 PyObject *exc = NULL;
2344 if (errmsg == NULL)
2345 errmsg = "Py_wchar2char() failed";
2346 raise_encode_exception(&exc,
2347 "filesystemencoding",
2348 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2349 error_pos, error_pos+1,
2350 errmsg);
2351 Py_XDECREF(exc);
2352 }
2353 else
2354 PyErr_NoMemory();
2355 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002356 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002357 }
2358 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002359
2360 bytes_obj = PyBytes_FromString(bytes);
2361 PyMem_Free(bytes);
2362 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002363 }
Victor Stinnerad158722010-10-27 00:25:46 +00002364#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002365}
2366
Alexander Belopolsky40018472011-02-26 01:02:56 +00002367PyObject *
2368PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002369 const char *encoding,
2370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371{
2372 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002373 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002374
Guido van Rossumd57fd912000-03-10 22:53:23 +00002375 if (!PyUnicode_Check(unicode)) {
2376 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 }
Fred Drakee4315f52000-05-09 19:53:39 +00002379
Victor Stinner2f283c22011-03-02 01:21:46 +00002380 if (encoding == NULL) {
2381 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002383 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002385 }
Fred Drakee4315f52000-05-09 19:53:39 +00002386
2387 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002388 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002389 if ((strcmp(lower, "utf-8") == 0) ||
2390 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002391 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002392 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002394 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002396 }
Victor Stinner37296e82010-06-10 13:36:23 +00002397 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002398 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002399 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002401#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002402 else if (strcmp(lower, "mbcs") == 0)
2403 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2404 PyUnicode_GET_SIZE(unicode),
2405 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002406#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002407 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410
2411 /* Encode via the codec registry */
2412 v = PyCodec_Encode(unicode, encoding, errors);
2413 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002414 return NULL;
2415
2416 /* The normal path */
2417 if (PyBytes_Check(v))
2418 return v;
2419
2420 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002421 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002422 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002423 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002424
2425 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2426 "encoder %s returned bytearray instead of bytes",
2427 encoding);
2428 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002429 Py_DECREF(v);
2430 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002431 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002432
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002433 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2434 Py_DECREF(v);
2435 return b;
2436 }
2437
2438 PyErr_Format(PyExc_TypeError,
2439 "encoder did not return a bytes object (type=%.400s)",
2440 Py_TYPE(v)->tp_name);
2441 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002442 return NULL;
2443}
2444
Alexander Belopolsky40018472011-02-26 01:02:56 +00002445PyObject *
2446PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002447 const char *encoding,
2448 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002449{
2450 PyObject *v;
2451
2452 if (!PyUnicode_Check(unicode)) {
2453 PyErr_BadArgument();
2454 goto onError;
2455 }
2456
2457 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002458 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002459
2460 /* Encode via the codec registry */
2461 v = PyCodec_Encode(unicode, encoding, errors);
2462 if (v == NULL)
2463 goto onError;
2464 if (!PyUnicode_Check(v)) {
2465 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002466 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002467 Py_TYPE(v)->tp_name);
2468 Py_DECREF(v);
2469 goto onError;
2470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002472
Benjamin Peterson29060642009-01-31 22:14:21 +00002473 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 return NULL;
2475}
2476
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002477PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002478PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002479 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002480 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2481}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002482
Christian Heimes5894ba72007-11-04 11:43:14 +00002483PyObject*
2484PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2485{
Victor Stinner99b95382011-07-04 14:23:54 +02002486#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002487 return PyUnicode_DecodeMBCS(s, size, NULL);
2488#elif defined(__APPLE__)
2489 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2490#else
Victor Stinner793b5312011-04-27 00:24:21 +02002491 PyInterpreterState *interp = PyThreadState_GET()->interp;
2492 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2493 cannot use it to encode and decode filenames before it is loaded. Load
2494 the Python codec requires to encode at least its own filename. Use the C
2495 version of the locale codec until the codec registry is initialized and
2496 the Python codec is loaded.
2497
2498 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2499 cannot only rely on it: check also interp->fscodec_initialized for
2500 subinterpreters. */
2501 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002502 return PyUnicode_Decode(s, size,
2503 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002504 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002505 }
2506 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002507 /* locale encoding with surrogateescape */
2508 wchar_t *wchar;
2509 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002510 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002511
2512 if (s[size] != '\0' || size != strlen(s)) {
2513 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2514 return NULL;
2515 }
2516
Victor Stinner168e1172010-10-16 23:16:16 +00002517 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002518 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002519 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002520
Victor Stinner168e1172010-10-16 23:16:16 +00002521 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002522 PyMem_Free(wchar);
2523 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002524 }
Victor Stinnerad158722010-10-27 00:25:46 +00002525#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002526}
2527
Martin v. Löwis011e8422009-05-05 04:43:17 +00002528
2529int
2530PyUnicode_FSConverter(PyObject* arg, void* addr)
2531{
2532 PyObject *output = NULL;
2533 Py_ssize_t size;
2534 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002535 if (arg == NULL) {
2536 Py_DECREF(*(PyObject**)addr);
2537 return 1;
2538 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002539 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002540 output = arg;
2541 Py_INCREF(output);
2542 }
2543 else {
2544 arg = PyUnicode_FromObject(arg);
2545 if (!arg)
2546 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002547 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002548 Py_DECREF(arg);
2549 if (!output)
2550 return 0;
2551 if (!PyBytes_Check(output)) {
2552 Py_DECREF(output);
2553 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2554 return 0;
2555 }
2556 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002557 size = PyBytes_GET_SIZE(output);
2558 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002559 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002560 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002561 Py_DECREF(output);
2562 return 0;
2563 }
2564 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002565 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002566}
2567
2568
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002569int
2570PyUnicode_FSDecoder(PyObject* arg, void* addr)
2571{
2572 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002573 if (arg == NULL) {
2574 Py_DECREF(*(PyObject**)addr);
2575 return 1;
2576 }
2577 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 if (PyUnicode_READY(arg))
2579 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002580 output = arg;
2581 Py_INCREF(output);
2582 }
2583 else {
2584 arg = PyBytes_FromObject(arg);
2585 if (!arg)
2586 return 0;
2587 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2588 PyBytes_GET_SIZE(arg));
2589 Py_DECREF(arg);
2590 if (!output)
2591 return 0;
2592 if (!PyUnicode_Check(output)) {
2593 Py_DECREF(output);
2594 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2595 return 0;
2596 }
2597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2599 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002600 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2601 Py_DECREF(output);
2602 return 0;
2603 }
2604 *(PyObject**)addr = output;
2605 return Py_CLEANUP_SUPPORTED;
2606}
2607
2608
Martin v. Löwis5b222132007-06-10 09:51:05 +00002609char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002611{
Christian Heimesf3863112007-11-22 07:46:41 +00002612 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2614
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002615 if (!PyUnicode_Check(unicode)) {
2616 PyErr_BadArgument();
2617 return NULL;
2618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002620 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621
2622 if (_PyUnicode_UTF8(unicode) == NULL) {
2623 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2624 if (bytes == NULL)
2625 return NULL;
2626 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2627 if (u->_base.utf8 == NULL) {
2628 Py_DECREF(bytes);
2629 return NULL;
2630 }
2631 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2632 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2633 Py_DECREF(bytes);
2634 }
2635
2636 if (psize)
2637 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2638 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002639}
2640
2641char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2645}
2646
2647#ifdef Py_DEBUG
2648int unicode_as_unicode_calls = 0;
2649#endif
2650
2651
2652Py_UNICODE *
2653PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2654{
2655 PyUnicodeObject *u;
2656 const unsigned char *one_byte;
2657#if SIZEOF_WCHAR_T == 4
2658 const Py_UCS2 *two_bytes;
2659#else
2660 const Py_UCS4 *four_bytes;
2661 const Py_UCS4 *ucs4_end;
2662 Py_ssize_t num_surrogates;
2663#endif
2664 wchar_t *w;
2665 wchar_t *wchar_end;
2666
2667 if (!PyUnicode_Check(unicode)) {
2668 PyErr_BadArgument();
2669 return NULL;
2670 }
2671 u = (PyUnicodeObject*)unicode;
2672 if (_PyUnicode_WSTR(u) == NULL) {
2673 /* Non-ASCII compact unicode object */
2674 assert(_PyUnicode_KIND(u) != 0);
2675 assert(PyUnicode_IS_READY(u));
2676
2677#ifdef Py_DEBUG
2678 ++unicode_as_unicode_calls;
2679#endif
2680
2681 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2682#if SIZEOF_WCHAR_T == 2
2683 four_bytes = PyUnicode_4BYTE_DATA(u);
2684 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2685 num_surrogates = 0;
2686
2687 for (; four_bytes < ucs4_end; ++four_bytes) {
2688 if (*four_bytes > 0xFFFF)
2689 ++num_surrogates;
2690 }
2691
2692 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2693 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2694 if (!_PyUnicode_WSTR(u)) {
2695 PyErr_NoMemory();
2696 return NULL;
2697 }
2698 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2699
2700 w = _PyUnicode_WSTR(u);
2701 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2702 four_bytes = PyUnicode_4BYTE_DATA(u);
2703 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2704 if (*four_bytes > 0xFFFF) {
2705 /* encode surrogate pair in this case */
2706 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2707 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2708 }
2709 else
2710 *w = *four_bytes;
2711
2712 if (w > wchar_end) {
2713 assert(0 && "Miscalculated string end");
2714 }
2715 }
2716 *w = 0;
2717#else
2718 /* sizeof(wchar_t) == 4 */
2719 Py_FatalError("Impossible unicode object state, wstr and str "
2720 "should share memory already.");
2721 return NULL;
2722#endif
2723 }
2724 else {
2725 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2726 (_PyUnicode_LENGTH(u) + 1));
2727 if (!_PyUnicode_WSTR(u)) {
2728 PyErr_NoMemory();
2729 return NULL;
2730 }
2731 if (!PyUnicode_IS_COMPACT_ASCII(u))
2732 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2733 w = _PyUnicode_WSTR(u);
2734 wchar_end = w + _PyUnicode_LENGTH(u);
2735
2736 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2737 one_byte = PyUnicode_1BYTE_DATA(u);
2738 for (; w < wchar_end; ++one_byte, ++w)
2739 *w = *one_byte;
2740 /* null-terminate the wstr */
2741 *w = 0;
2742 }
2743 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2744#if SIZEOF_WCHAR_T == 4
2745 two_bytes = PyUnicode_2BYTE_DATA(u);
2746 for (; w < wchar_end; ++two_bytes, ++w)
2747 *w = *two_bytes;
2748 /* null-terminate the wstr */
2749 *w = 0;
2750#else
2751 /* sizeof(wchar_t) == 2 */
2752 PyObject_FREE(_PyUnicode_WSTR(u));
2753 _PyUnicode_WSTR(u) = NULL;
2754 Py_FatalError("Impossible unicode object state, wstr "
2755 "and str should share memory already.");
2756 return NULL;
2757#endif
2758 }
2759 else {
2760 assert(0 && "This should never happen.");
2761 }
2762 }
2763 }
2764 if (size != NULL)
2765 *size = PyUnicode_WSTR_LENGTH(u);
2766 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002767}
2768
Alexander Belopolsky40018472011-02-26 01:02:56 +00002769Py_UNICODE *
2770PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773}
2774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776Py_ssize_t
2777PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778{
2779 if (!PyUnicode_Check(unicode)) {
2780 PyErr_BadArgument();
2781 goto onError;
2782 }
2783 return PyUnicode_GET_SIZE(unicode);
2784
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return -1;
2787}
2788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789Py_ssize_t
2790PyUnicode_GetLength(PyObject *unicode)
2791{
2792 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2793 PyErr_BadArgument();
2794 return -1;
2795 }
2796
2797 return PyUnicode_GET_LENGTH(unicode);
2798}
2799
2800Py_UCS4
2801PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2802{
2803 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2804 return PyErr_BadArgument();
2805 return (Py_UCS4)-1;
2806 }
2807 return PyUnicode_READ_CHAR(unicode, index);
2808}
2809
2810int
2811PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2812{
2813 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2814 return PyErr_BadArgument();
2815 return -1;
2816 }
2817
2818 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2819 index, ch);
2820 return 0;
2821}
2822
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2828
Victor Stinner554f3f02010-06-16 23:33:54 +00002829/* create or adjust a UnicodeDecodeError */
2830static void
2831make_decode_exception(PyObject **exceptionObject,
2832 const char *encoding,
2833 const char *input, Py_ssize_t length,
2834 Py_ssize_t startpos, Py_ssize_t endpos,
2835 const char *reason)
2836{
2837 if (*exceptionObject == NULL) {
2838 *exceptionObject = PyUnicodeDecodeError_Create(
2839 encoding, input, length, startpos, endpos, reason);
2840 }
2841 else {
2842 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2843 goto onError;
2844 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2845 goto onError;
2846 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2847 goto onError;
2848 }
2849 return;
2850
2851onError:
2852 Py_DECREF(*exceptionObject);
2853 *exceptionObject = NULL;
2854}
2855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856/* error handling callback helper:
2857 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002858 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 and adjust various state variables.
2860 return 0 on success, -1 on error
2861*/
2862
Alexander Belopolsky40018472011-02-26 01:02:56 +00002863static int
2864unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002865 const char *encoding, const char *reason,
2866 const char **input, const char **inend, Py_ssize_t *startinpos,
2867 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2868 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002869{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002870 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002871
2872 PyObject *restuple = NULL;
2873 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002874 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002875 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002876 Py_ssize_t requiredsize;
2877 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002878 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002879 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002880 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 int res = -1;
2882
2883 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 *errorHandler = PyCodec_LookupError(errors);
2885 if (*errorHandler == NULL)
2886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 }
2888
Victor Stinner554f3f02010-06-16 23:33:54 +00002889 make_decode_exception(exceptionObject,
2890 encoding,
2891 *input, *inend - *input,
2892 *startinpos, *endinpos,
2893 reason);
2894 if (*exceptionObject == NULL)
2895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002896
2897 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2898 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002900 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002901 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 }
2904 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002906
2907 /* Copy back the bytes variables, which might have been modified by the
2908 callback */
2909 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2910 if (!inputobj)
2911 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002912 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002914 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002915 *input = PyBytes_AS_STRING(inputobj);
2916 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002917 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002918 /* we can DECREF safely, as the exception has another reference,
2919 so the object won't go away. */
2920 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002924 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002925 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2926 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002927 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928
2929 /* need more space? (at least enough for what we
2930 have+the replacement+the rest of the string (starting
2931 at the new input position), so we won't have to check space
2932 when there are no errors in the rest of the string) */
2933 repptr = PyUnicode_AS_UNICODE(repunicode);
2934 repsize = PyUnicode_GET_SIZE(repunicode);
2935 requiredsize = *outpos + repsize + insize-newpos;
2936 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002937 if (requiredsize<2*outsize)
2938 requiredsize = 2*outsize;
2939 if (_PyUnicode_Resize(output, requiredsize) < 0)
2940 goto onError;
2941 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 }
2943 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002944 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002945 Py_UNICODE_COPY(*outptr, repptr, repsize);
2946 *outptr += repsize;
2947 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002949 /* we made it! */
2950 res = 0;
2951
Benjamin Peterson29060642009-01-31 22:14:21 +00002952 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 Py_XDECREF(restuple);
2954 return res;
2955}
2956
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002957/* --- UTF-7 Codec -------------------------------------------------------- */
2958
Antoine Pitrou244651a2009-05-04 18:56:13 +00002959/* See RFC2152 for details. We encode conservatively and decode liberally. */
2960
2961/* Three simple macros defining base-64. */
2962
/* Is c a base-64 character (A-Z, a-z, 0-9, '+' or '/')?
   The argument is evaluated more than once. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))
2970
/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and everything
   else (i.e. '/') to 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : 63)
2978
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the alphabet lookup. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
2983
/* DECODE_DIRECT: should this byte, encountered in a UTF-7 string, be
 * decoded as itself?  Decoding is permissive: every value up to 127
 * decodes to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2991
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only 1..127 can ever be direct
 * directO  -- non-zero to encode Set O characters as themselves
 * directWS -- non-zero to encode whitespace as itself
 *
 * All three arguments are parenthesized in the expansion so that
 * expression arguments (e.g. `a || b`) keep their meaning; c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003037
Alexander Belopolsky40018472011-02-26 01:02:56 +00003038PyObject *
3039PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003040 Py_ssize_t size,
3041 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003042{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003043 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3044}
3045
Antoine Pitrou244651a2009-05-04 18:56:13 +00003046/* The decoder. The only state we preserve is our read position,
3047 * i.e. how many characters we have consumed. So if we end in the
3048 * middle of a shift sequence we have to back off the read position
3049 * and the output to the beginning of the sequence, otherwise we lose
3050 * all the shift state (seen bits, number of bits seen, high
3051 * surrogate). */
3052
Alexander Belopolsky40018472011-02-26 01:02:56 +00003053PyObject *
3054PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003055 Py_ssize_t size,
3056 const char *errors,
3057 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003060 Py_ssize_t startinpos;
3061 Py_ssize_t endinpos;
3062 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003063 const char *e;
3064 PyUnicodeObject *unicode;
3065 Py_UNICODE *p;
3066 const char *errmsg = "";
3067 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003068 Py_UNICODE *shiftOutStart;
3069 unsigned int base64bits = 0;
3070 unsigned long base64buffer = 0;
3071 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003072 PyObject *errorHandler = NULL;
3073 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003074
3075 unicode = _PyUnicode_New(size);
3076 if (!unicode)
3077 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003078 if (size == 0) {
3079 if (consumed)
3080 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003081 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003082 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003084 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003085 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003086 e = s + size;
3087
3088 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003091 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003092
Antoine Pitrou244651a2009-05-04 18:56:13 +00003093 if (inShift) { /* in a base-64 section */
3094 if (IS_BASE64(ch)) { /* consume a base-64 character */
3095 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3096 base64bits += 6;
3097 s++;
3098 if (base64bits >= 16) {
3099 /* we have enough bits for a UTF-16 value */
3100 Py_UNICODE outCh = (Py_UNICODE)
3101 (base64buffer >> (base64bits-16));
3102 base64bits -= 16;
3103 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3104 if (surrogate) {
3105 /* expecting a second surrogate */
3106 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3107#ifdef Py_UNICODE_WIDE
3108 *p++ = (((surrogate & 0x3FF)<<10)
3109 | (outCh & 0x3FF)) + 0x10000;
3110#else
3111 *p++ = surrogate;
3112 *p++ = outCh;
3113#endif
3114 surrogate = 0;
3115 }
3116 else {
3117 surrogate = 0;
3118 errmsg = "second surrogate missing";
3119 goto utf7Error;
3120 }
3121 }
3122 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3123 /* first surrogate */
3124 surrogate = outCh;
3125 }
3126 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3127 errmsg = "unexpected second surrogate";
3128 goto utf7Error;
3129 }
3130 else {
3131 *p++ = outCh;
3132 }
3133 }
3134 }
3135 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003136 inShift = 0;
3137 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003138 if (surrogate) {
3139 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003140 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003141 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003142 if (base64bits > 0) { /* left-over bits */
3143 if (base64bits >= 6) {
3144 /* We've seen at least one base-64 character */
3145 errmsg = "partial character in shift sequence";
3146 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003148 else {
3149 /* Some bits remain; they should be zero */
3150 if (base64buffer != 0) {
3151 errmsg = "non-zero padding bits in shift sequence";
3152 goto utf7Error;
3153 }
3154 }
3155 }
3156 if (ch != '-') {
3157 /* '-' is absorbed; other terminating
3158 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003159 *p++ = ch;
3160 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003161 }
3162 }
3163 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003165 s++; /* consume '+' */
3166 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003167 s++;
3168 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003169 }
3170 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003171 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003172 shiftOutStart = p;
3173 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003174 }
3175 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003176 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003177 *p++ = ch;
3178 s++;
3179 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003180 else {
3181 startinpos = s-starts;
3182 s++;
3183 errmsg = "unexpected special character";
3184 goto utf7Error;
3185 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003186 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003187utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003188 outpos = p-PyUnicode_AS_UNICODE(unicode);
3189 endinpos = s-starts;
3190 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 errors, &errorHandler,
3192 "utf7", errmsg,
3193 &starts, &e, &startinpos, &endinpos, &exc, &s,
3194 &unicode, &outpos, &p))
3195 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003196 }
3197
Antoine Pitrou244651a2009-05-04 18:56:13 +00003198 /* end of string */
3199
3200 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3201 /* if we're in an inconsistent state, that's an error */
3202 if (surrogate ||
3203 (base64bits >= 6) ||
3204 (base64bits > 0 && base64buffer != 0)) {
3205 outpos = p-PyUnicode_AS_UNICODE(unicode);
3206 endinpos = size;
3207 if (unicode_decode_call_errorhandler(
3208 errors, &errorHandler,
3209 "utf7", "unterminated shift sequence",
3210 &starts, &e, &startinpos, &endinpos, &exc, &s,
3211 &unicode, &outpos, &p))
3212 goto onError;
3213 if (s < e)
3214 goto restart;
3215 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003216 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003217
3218 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003219 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003220 if (inShift) {
3221 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003222 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003223 }
3224 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003225 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003226 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003227 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003228
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003229 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003230 goto onError;
3231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 Py_XDECREF(errorHandler);
3233 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003234 if (PyUnicode_READY(unicode) == -1) {
3235 Py_DECREF(unicode);
3236 return NULL;
3237 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238 return (PyObject *)unicode;
3239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241 Py_XDECREF(errorHandler);
3242 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003243 Py_DECREF(unicode);
3244 return NULL;
3245}
3246
3247
Alexander Belopolsky40018472011-02-26 01:02:56 +00003248PyObject *
3249PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003250 Py_ssize_t size,
3251 int base64SetO,
3252 int base64WhiteSpace,
3253 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003254{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003256 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003257 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003258 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003259 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003260 unsigned int base64bits = 0;
3261 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 char * out;
3263 char * start;
3264
3265 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003267
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003268 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003269 return PyErr_NoMemory();
3270
Antoine Pitrou244651a2009-05-04 18:56:13 +00003271 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003272 if (v == NULL)
3273 return NULL;
3274
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003276 for (;i < size; ++i) {
3277 Py_UNICODE ch = s[i];
3278
Antoine Pitrou244651a2009-05-04 18:56:13 +00003279 if (inShift) {
3280 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3281 /* shifting out */
3282 if (base64bits) { /* output remaining bits */
3283 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3284 base64buffer = 0;
3285 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003286 }
3287 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003288 /* Characters not in the BASE64 set implicitly unshift the sequence
3289 so no '-' is required, except if the character is itself a '-' */
3290 if (IS_BASE64(ch) || ch == '-') {
3291 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003293 *out++ = (char) ch;
3294 }
3295 else {
3296 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003297 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 else { /* not in a shift sequence */
3300 if (ch == '+') {
3301 *out++ = '+';
3302 *out++ = '-';
3303 }
3304 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3305 *out++ = (char) ch;
3306 }
3307 else {
3308 *out++ = '+';
3309 inShift = 1;
3310 goto encode_char;
3311 }
3312 }
3313 continue;
3314encode_char:
3315#ifdef Py_UNICODE_WIDE
3316 if (ch >= 0x10000) {
3317 /* code first surrogate */
3318 base64bits += 16;
3319 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3320 while (base64bits >= 6) {
3321 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3322 base64bits -= 6;
3323 }
3324 /* prepare second surrogate */
3325 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3326 }
3327#endif
3328 base64bits += 16;
3329 base64buffer = (base64buffer << 16) | ch;
3330 while (base64bits >= 6) {
3331 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3332 base64bits -= 6;
3333 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003334 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003335 if (base64bits)
3336 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3337 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003339 if (_PyBytes_Resize(&v, out - start) < 0)
3340 return NULL;
3341 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003342}
3343
Antoine Pitrou244651a2009-05-04 18:56:13 +00003344#undef IS_BASE64
3345#undef FROM_BASE64
3346#undef TO_BASE64
3347#undef DECODE_DIRECT
3348#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350/* --- UTF-8 Codec -------------------------------------------------------- */
3351
/* Read-only lookup table, indexed by the value of a byte (0..255). */
static const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003376 Py_ssize_t size,
3377 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378{
Walter Dörwald69652032004-09-07 20:24:22 +00003379 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3380}
3381
Antoine Pitrouab868312009-01-10 15:40:25 +00003382/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3383#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3384
3385/* Mask to quickly check whether a C 'long' contains a
3386 non-ASCII, UTF8-encoded char. */
3387#if (SIZEOF_LONG == 8)
3388# define ASCII_CHAR_MASK 0x8080808080808080L
3389#elif (SIZEOF_LONG == 4)
3390# define ASCII_CHAR_MASK 0x80808080L
3391#else
3392# error C 'long' size should be either 4 or 8!
3393#endif
3394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395/* Scans a UTF-8 string and returns the maximum character to be expected,
3396 the size of the decoded unicode string and if any major errors were
3397 encountered.
3398
3399 This function does check basic UTF-8 sanity, it does however NOT CHECK
3400 if the string contains surrogates, and if all continuation bytes are
3401 within the correct ranges, these checks are performed in
3402 PyUnicode_DecodeUTF8Stateful.
3403
3404 If it sets has_errors to 1, it means the value of unicode_size and max_char
3405 will be bogus and you should not rely on useful information in them.
3406 */
3407static Py_UCS4
3408utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3409 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3410 int *has_errors)
3411{
3412 Py_ssize_t n;
3413 Py_ssize_t char_count = 0;
3414 Py_UCS4 max_char = 127, new_max;
3415 Py_UCS4 upper_bound;
3416 const unsigned char *p = (const unsigned char *)s;
3417 const unsigned char *end = p + string_size;
3418 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3419 int err = 0;
3420
3421 for (; p < end && !err; ++p, ++char_count) {
3422 /* Only check value if it's not a ASCII char... */
3423 if (*p < 0x80) {
3424 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3425 an explanation. */
3426 if (!((size_t) p & LONG_PTR_MASK)) {
3427 /* Help register allocation */
3428 register const unsigned char *_p = p;
3429 while (_p < aligned_end) {
3430 unsigned long value = *(unsigned long *) _p;
3431 if (value & ASCII_CHAR_MASK)
3432 break;
3433 _p += SIZEOF_LONG;
3434 char_count += SIZEOF_LONG;
3435 }
3436 p = _p;
3437 if (p == end)
3438 break;
3439 }
3440 }
3441 if (*p >= 0x80) {
3442 n = utf8_code_length[*p];
3443 new_max = max_char;
3444 switch (n) {
3445 /* invalid start byte */
3446 case 0:
3447 err = 1;
3448 break;
3449 case 2:
3450 /* Code points between 0x00FF and 0x07FF inclusive.
3451 Approximate the upper bound of the code point,
3452 if this flips over 255 we can be sure it will be more
3453 than 255 and the string will need 2 bytes per code coint,
3454 if it stays under or equal to 255, we can be sure 1 byte
3455 is enough.
3456 ((*p & 0b00011111) << 6) | 0b00111111 */
3457 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3458 if (max_char < upper_bound)
3459 new_max = upper_bound;
3460 /* Ensure we track at least that we left ASCII space. */
3461 if (new_max < 128)
3462 new_max = 128;
3463 break;
3464 case 3:
3465 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3466 always > 255 and <= 65535 and will always need 2 bytes. */
3467 if (max_char < 65535)
3468 new_max = 65535;
3469 break;
3470 case 4:
3471 /* Code point will be above 0xFFFF for sure in this case. */
3472 new_max = 65537;
3473 break;
3474 /* Internal error, this should be caught by the first if */
3475 case 1:
3476 default:
3477 assert(0 && "Impossible case in utf8_max_char_and_size");
3478 err = 1;
3479 }
3480 /* Instead of number of overall bytes for this code point,
3481 n containts the number of following bytes: */
3482 --n;
3483 /* Check if the follow up chars are all valid continuation bytes */
3484 if (n >= 1) {
3485 const unsigned char *cont;
3486 if ((p + n) >= end) {
3487 if (consumed == 0)
3488 /* incomplete data, non-incremental decoding */
3489 err = 1;
3490 break;
3491 }
3492 for (cont = p + 1; cont < (p + n); ++cont) {
3493 if ((*cont & 0xc0) != 0x80) {
3494 err = 1;
3495 break;
3496 }
3497 }
3498 p += n;
3499 }
3500 else
3501 err = 1;
3502 max_char = new_max;
3503 }
3504 }
3505
3506 if (unicode_size)
3507 *unicode_size = char_count;
3508 if (has_errors)
3509 *has_errors = err;
3510 return max_char;
3511}
3512
/* Store value at position index of buf, interpreting buf according to
   kind.  Works like PyUnicode_WRITE but additionally accepts
   PyUnicode_WCHAR_KIND, in which case buf is treated as the Py_UNICODE
   (wstr) buffer of the legacy unicode representation.  Any kind other
   than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.  Exactly one branch
   runs, so index and value are each evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((int)(kind)) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3527
Alexander Belopolsky40018472011-02-26 01:02:56 +00003528PyObject *
3529PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530 Py_ssize_t size,
3531 const char *errors,
3532 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003536 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003537 Py_ssize_t startinpos;
3538 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003539 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003541 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 PyObject *errorHandler = NULL;
3543 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003544 Py_UCS4 maxchar = 0;
3545 Py_ssize_t unicode_size;
3546 Py_ssize_t i;
3547 int kind;
3548 void *data;
3549 int has_errors;
3550 Py_UNICODE *error_outptr;
3551#if SIZEOF_WCHAR_T == 2
3552 Py_ssize_t wchar_offset = 0;
3553#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554
Walter Dörwald69652032004-09-07 20:24:22 +00003555 if (size == 0) {
3556 if (consumed)
3557 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003558 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3561 consumed, &has_errors);
3562 if (has_errors) {
3563 unicode = _PyUnicode_New(size);
3564 if (!unicode)
3565 return NULL;
3566 kind = PyUnicode_WCHAR_KIND;
3567 data = PyUnicode_AS_UNICODE(unicode);
3568 assert(data != NULL);
3569 }
3570 else {
3571 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3572 if (!unicode)
3573 return NULL;
3574 /* When the string is ASCII only, just use memcpy and return.
3575 unicode_size may be != size if there is an incomplete UTF-8
3576 sequence at the end of the ASCII block. */
3577 if (maxchar < 128 && size == unicode_size) {
3578 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3579 return (PyObject *)unicode;
3580 }
3581 kind = PyUnicode_KIND(unicode);
3582 data = PyUnicode_DATA(unicode);
3583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003587 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588
3589 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003590 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591
3592 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003593 /* Fast path for runs of ASCII characters. Given that common UTF-8
3594 input will consist of an overwhelming majority of ASCII
3595 characters, we try to optimize for this case by checking
3596 as many characters as a C 'long' can contain.
3597 First, check if we can do an aligned read, as most CPUs have
3598 a penalty for unaligned reads.
3599 */
3600 if (!((size_t) s & LONG_PTR_MASK)) {
3601 /* Help register allocation */
3602 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003604 while (_s < aligned_end) {
3605 /* Read a whole long at a time (either 4 or 8 bytes),
3606 and do a fast unrolled copy if it only contains ASCII
3607 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003608 unsigned long value = *(unsigned long *) _s;
3609 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3612 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3613 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3614 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003615#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3617 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3618 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3619 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003620#endif
3621 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003622 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003623 }
3624 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003625 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003626 if (s == e)
3627 break;
3628 ch = (unsigned char)*s;
3629 }
3630 }
3631
3632 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003633 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 s++;
3635 continue;
3636 }
3637
3638 n = utf8_code_length[ch];
3639
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003640 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 if (consumed)
3642 break;
3643 else {
3644 errmsg = "unexpected end of data";
3645 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003646 endinpos = startinpos+1;
3647 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3648 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 goto utf8Error;
3650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652
3653 switch (n) {
3654
3655 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003656 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 startinpos = s-starts;
3658 endinpos = startinpos+1;
3659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660
3661 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003662 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 startinpos = s-starts;
3664 endinpos = startinpos+1;
3665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666
3667 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003668 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003669 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003671 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 goto utf8Error;
3673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003675 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003676 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 break;
3678
3679 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003680 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3681 will result in surrogates in range d800-dfff. Surrogates are
3682 not valid UTF-8 so they are rejected.
3683 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3684 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003685 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003686 (s[2] & 0xc0) != 0x80 ||
3687 ((unsigned char)s[0] == 0xE0 &&
3688 (unsigned char)s[1] < 0xA0) ||
3689 ((unsigned char)s[0] == 0xED &&
3690 (unsigned char)s[1] > 0x9F)) {
3691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003693 endinpos = startinpos + 1;
3694
3695 /* if s[1] first two bits are 1 and 0, then the invalid
3696 continuation byte is s[2], so increment endinpos by 1,
3697 if not, s[1] is invalid and endinpos doesn't need to
3698 be incremented. */
3699 if ((s[1] & 0xC0) == 0x80)
3700 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 goto utf8Error;
3702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003704 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003706 break;
3707
3708 case 4:
3709 if ((s[1] & 0xc0) != 0x80 ||
3710 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003711 (s[3] & 0xc0) != 0x80 ||
3712 ((unsigned char)s[0] == 0xF0 &&
3713 (unsigned char)s[1] < 0x90) ||
3714 ((unsigned char)s[0] == 0xF4 &&
3715 (unsigned char)s[1] > 0x8F)) {
3716 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003718 endinpos = startinpos + 1;
3719 if ((s[1] & 0xC0) == 0x80) {
3720 endinpos++;
3721 if ((s[2] & 0xC0) == 0x80)
3722 endinpos++;
3723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 goto utf8Error;
3725 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003726 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003727 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3728 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003730 /* If the string is flexible or we have native UCS-4, write
3731 directly.. */
3732 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3733 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 else {
3736 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 /* translate from 10000..10FFFF to 0..FFFF */
3739 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 /* high surrogate = top 10 bits added to D800 */
3742 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3743 (Py_UNICODE)(0xD800 + (ch >> 10)));
3744
3745 /* low surrogate = bottom 10 bits added to DC00 */
3746 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3747 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3748 }
3749#if SIZEOF_WCHAR_T == 2
3750 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003751#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 }
3754 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003756
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 /* If this is not yet a resizable string, make it one.. */
3759 if (kind != PyUnicode_WCHAR_KIND) {
3760 const Py_UNICODE *u;
3761 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3762 if (!new_unicode)
3763 goto onError;
3764 u = PyUnicode_AsUnicode((PyObject *)unicode);
3765 if (!u)
3766 goto onError;
3767#if SIZEOF_WCHAR_T == 2
3768 i += wchar_offset;
3769#endif
3770 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3771 Py_DECREF(unicode);
3772 unicode = new_unicode;
3773 kind = 0;
3774 data = PyUnicode_AS_UNICODE(new_unicode);
3775 assert(data != NULL);
3776 }
3777 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 if (unicode_decode_call_errorhandler(
3779 errors, &errorHandler,
3780 "utf8", errmsg,
3781 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 /* Update data because unicode_decode_call_errorhandler might have
3785 re-created or resized the unicode object. */
3786 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 /* Ensure the unicode_size calculation above was correct: */
3790 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3791
Walter Dörwald69652032004-09-07 20:24:22 +00003792 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 /* Adjust length and ready string when it contained errors and
3796 is of the old resizable kind. */
3797 if (kind == PyUnicode_WCHAR_KIND) {
3798 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3799 PyUnicode_READY(unicode) == -1)
3800 goto onError;
3801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 if (PyUnicode_READY(unicode) == -1) {
3806 Py_DECREF(unicode);
3807 return NULL;
3808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 return (PyObject *)unicode;
3810
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 Py_XDECREF(errorHandler);
3813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 Py_DECREF(unicode);
3815 return NULL;
3816}
3817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003819
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003820#ifdef __APPLE__
3821
3822/* Simplified UTF-8 decoder using surrogateescape error handler,
3823 used to decode the command line arguments on Mac OS X. */
3824
3825wchar_t*
3826_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3827{
3828 int n;
3829 const char *e;
3830 wchar_t *unicode, *p;
3831
3832 /* Note: size will always be longer than the resulting Unicode
3833 character count */
3834 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3835 PyErr_NoMemory();
3836 return NULL;
3837 }
3838 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3839 if (!unicode)
3840 return NULL;
3841
3842 /* Unpack UTF-8 encoded data */
3843 p = unicode;
3844 e = s + size;
3845 while (s < e) {
3846 Py_UCS4 ch = (unsigned char)*s;
3847
3848 if (ch < 0x80) {
3849 *p++ = (wchar_t)ch;
3850 s++;
3851 continue;
3852 }
3853
3854 n = utf8_code_length[ch];
3855 if (s + n > e) {
3856 goto surrogateescape;
3857 }
3858
3859 switch (n) {
3860 case 0:
3861 case 1:
3862 goto surrogateescape;
3863
3864 case 2:
3865 if ((s[1] & 0xc0) != 0x80)
3866 goto surrogateescape;
3867 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3868 assert ((ch > 0x007F) && (ch <= 0x07FF));
3869 *p++ = (wchar_t)ch;
3870 break;
3871
3872 case 3:
3873 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3874 will result in surrogates in range d800-dfff. Surrogates are
3875 not valid UTF-8 so they are rejected.
3876 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3877 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3878 if ((s[1] & 0xc0) != 0x80 ||
3879 (s[2] & 0xc0) != 0x80 ||
3880 ((unsigned char)s[0] == 0xE0 &&
3881 (unsigned char)s[1] < 0xA0) ||
3882 ((unsigned char)s[0] == 0xED &&
3883 (unsigned char)s[1] > 0x9F)) {
3884
3885 goto surrogateescape;
3886 }
3887 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3888 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003890 break;
3891
3892 case 4:
3893 if ((s[1] & 0xc0) != 0x80 ||
3894 (s[2] & 0xc0) != 0x80 ||
3895 (s[3] & 0xc0) != 0x80 ||
3896 ((unsigned char)s[0] == 0xF0 &&
3897 (unsigned char)s[1] < 0x90) ||
3898 ((unsigned char)s[0] == 0xF4 &&
3899 (unsigned char)s[1] > 0x8F)) {
3900 goto surrogateescape;
3901 }
3902 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3903 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3904 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3905
3906#if SIZEOF_WCHAR_T == 4
3907 *p++ = (wchar_t)ch;
3908#else
3909 /* compute and append the two surrogates: */
3910
3911 /* translate from 10000..10FFFF to 0..FFFF */
3912 ch -= 0x10000;
3913
3914 /* high surrogate = top 10 bits added to D800 */
3915 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3916
3917 /* low surrogate = bottom 10 bits added to DC00 */
3918 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3919#endif
3920 break;
3921 }
3922 s += n;
3923 continue;
3924
3925 surrogateescape:
3926 *p++ = 0xDC00 + ch;
3927 s++;
3928 }
3929 *p = L'\0';
3930 return unicode;
3931}
3932
3933#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935/* Primary internal function which creates utf8 encoded bytes objects.
3936
3937 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003938 and allocate exactly as much space needed at the end. Else allocate the
3939 maximum possible needed (4 result bytes per Unicode character), and return
3940 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003941*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003942PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944{
Tim Peters602f7402002-04-27 18:03:26 +00003945#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003946
Guido van Rossum98297ee2007-11-06 21:34:58 +00003947 Py_ssize_t i; /* index into s of next input byte */
3948 PyObject *result; /* result string object */
3949 char *p; /* next free byte in output buffer */
3950 Py_ssize_t nallocated; /* number of result bytes allocated */
3951 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003952 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003953 PyObject *errorHandler = NULL;
3954 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 int kind;
3956 void *data;
3957 Py_ssize_t size;
3958 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3959#if SIZEOF_WCHAR_T == 2
3960 Py_ssize_t wchar_offset = 0;
3961#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 if (!PyUnicode_Check(unicode)) {
3964 PyErr_BadArgument();
3965 return NULL;
3966 }
3967
3968 if (PyUnicode_READY(unicode) == -1)
3969 return NULL;
3970
3971 if (_PyUnicode_UTF8(unicode))
3972 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3973 _PyUnicode_UTF8_LENGTH(unicode));
3974
3975 kind = PyUnicode_KIND(unicode);
3976 data = PyUnicode_DATA(unicode);
3977 size = PyUnicode_GET_LENGTH(unicode);
3978
Tim Peters602f7402002-04-27 18:03:26 +00003979 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980
Tim Peters602f7402002-04-27 18:03:26 +00003981 if (size <= MAX_SHORT_UNICHARS) {
3982 /* Write into the stack buffer; nallocated can't overflow.
3983 * At the end, we'll allocate exactly as much heap space as it
3984 * turns out we need.
3985 */
3986 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003987 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003988 p = stackbuf;
3989 }
3990 else {
3991 /* Overallocate on the heap, and give the excess back at the end. */
3992 nallocated = size * 4;
3993 if (nallocated / 4 != size) /* overflow! */
3994 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00003995 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003996 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00003997 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00003998 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003999 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004000
Tim Peters602f7402002-04-27 18:03:26 +00004001 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004003
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004004 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004005 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004007
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004009 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004010 *p++ = (char)(0xc0 | (ch >> 6));
4011 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004012 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 Py_ssize_t newpos;
4014 PyObject *rep;
4015 Py_ssize_t repsize, k, startpos;
4016 startpos = i-1;
4017#if SIZEOF_WCHAR_T == 2
4018 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004019#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020 rep = unicode_encode_call_errorhandler(
4021 errors, &errorHandler, "utf-8", "surrogates not allowed",
4022 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4023 &exc, startpos, startpos+1, &newpos);
4024 if (!rep)
4025 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 if (PyBytes_Check(rep))
4028 repsize = PyBytes_GET_SIZE(rep);
4029 else
4030 repsize = PyUnicode_GET_SIZE(rep);
4031
4032 if (repsize > 4) {
4033 Py_ssize_t offset;
4034
4035 if (result == NULL)
4036 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004037 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4041 /* integer overflow */
4042 PyErr_NoMemory();
4043 goto error;
4044 }
4045 nallocated += repsize - 4;
4046 if (result != NULL) {
4047 if (_PyBytes_Resize(&result, nallocated) < 0)
4048 goto error;
4049 } else {
4050 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004051 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 goto error;
4053 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4054 }
4055 p = PyBytes_AS_STRING(result) + offset;
4056 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 if (PyBytes_Check(rep)) {
4059 char *prep = PyBytes_AS_STRING(rep);
4060 for(k = repsize; k > 0; k--)
4061 *p++ = *prep++;
4062 } else /* rep is unicode */ {
4063 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4064 Py_UNICODE c;
4065
4066 for(k=0; k<repsize; k++) {
4067 c = prep[k];
4068 if (0x80 <= c) {
4069 raise_encode_exception(&exc, "utf-8",
4070 PyUnicode_AS_UNICODE(unicode),
4071 size, i-1, i,
4072 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004073 goto error;
4074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004076 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004079 } else if (ch < 0x10000) {
4080 *p++ = (char)(0xe0 | (ch >> 12));
4081 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4082 *p++ = (char)(0x80 | (ch & 0x3f));
4083 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004084 /* Encode UCS4 Unicode ordinals */
4085 *p++ = (char)(0xf0 | (ch >> 18));
4086 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4087 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4088 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089#if SIZEOF_WCHAR_T == 2
4090 wchar_offset++;
4091#endif
Tim Peters602f7402002-04-27 18:03:26 +00004092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004094
Guido van Rossum98297ee2007-11-06 21:34:58 +00004095 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004096 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004097 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004098 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004099 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004100 }
4101 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004102 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004103 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004104 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004105 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004110 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004111 error:
4112 Py_XDECREF(errorHandler);
4113 Py_XDECREF(exc);
4114 Py_XDECREF(result);
4115 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004116
Tim Peters602f7402002-04-27 18:03:26 +00004117#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118}
4119
Alexander Belopolsky40018472011-02-26 01:02:56 +00004120PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4122 Py_ssize_t size,
4123 const char *errors)
4124{
4125 PyObject *v, *unicode;
4126
4127 unicode = PyUnicode_FromUnicode(s, size);
4128 if (unicode == NULL)
4129 return NULL;
4130 v = _PyUnicode_AsUTF8String(unicode, errors);
4131 Py_DECREF(unicode);
4132 return v;
4133}
4134
4135PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004136PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139}
4140
Walter Dörwald41980ca2007-08-16 21:55:45 +00004141/* --- UTF-32 Codec ------------------------------------------------------- */
4142
4143PyObject *
4144PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 Py_ssize_t size,
4146 const char *errors,
4147 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004148{
4149 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4150}
4151
4152PyObject *
4153PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 Py_ssize_t size,
4155 const char *errors,
4156 int *byteorder,
4157 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004158{
4159 const char *starts = s;
4160 Py_ssize_t startinpos;
4161 Py_ssize_t endinpos;
4162 Py_ssize_t outpos;
4163 PyUnicodeObject *unicode;
4164 Py_UNICODE *p;
4165#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004166 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004167 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004168#else
4169 const int pairs = 0;
4170#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004171 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004172 int bo = 0; /* assume native ordering by default */
4173 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004174 /* Offsets from q for retrieving bytes in the right order. */
4175#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4176 int iorder[] = {0, 1, 2, 3};
4177#else
4178 int iorder[] = {3, 2, 1, 0};
4179#endif
4180 PyObject *errorHandler = NULL;
4181 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004182
Walter Dörwald41980ca2007-08-16 21:55:45 +00004183 q = (unsigned char *)s;
4184 e = q + size;
4185
4186 if (byteorder)
4187 bo = *byteorder;
4188
4189 /* Check for BOM marks (U+FEFF) in the input and adjust current
4190 byte order setting accordingly. In native mode, the leading BOM
4191 mark is skipped, in all other modes, it is copied to the output
4192 stream as-is (giving a ZWNBSP character). */
4193 if (bo == 0) {
4194 if (size >= 4) {
4195 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004197#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 if (bom == 0x0000FEFF) {
4199 q += 4;
4200 bo = -1;
4201 }
4202 else if (bom == 0xFFFE0000) {
4203 q += 4;
4204 bo = 1;
4205 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004206#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 if (bom == 0x0000FEFF) {
4208 q += 4;
4209 bo = 1;
4210 }
4211 else if (bom == 0xFFFE0000) {
4212 q += 4;
4213 bo = -1;
4214 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004215#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004217 }
4218
4219 if (bo == -1) {
4220 /* force LE */
4221 iorder[0] = 0;
4222 iorder[1] = 1;
4223 iorder[2] = 2;
4224 iorder[3] = 3;
4225 }
4226 else if (bo == 1) {
4227 /* force BE */
4228 iorder[0] = 3;
4229 iorder[1] = 2;
4230 iorder[2] = 1;
4231 iorder[3] = 0;
4232 }
4233
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004234 /* On narrow builds we split characters outside the BMP into two
4235 codepoints => count how much extra space we need. */
4236#ifndef Py_UNICODE_WIDE
4237 for (qq = q; qq < e; qq += 4)
4238 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4239 pairs++;
4240#endif
4241
4242 /* This might be one to much, because of a BOM */
4243 unicode = _PyUnicode_New((size+3)/4+pairs);
4244 if (!unicode)
4245 return NULL;
4246 if (size == 0)
4247 return (PyObject *)unicode;
4248
4249 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004251
Walter Dörwald41980ca2007-08-16 21:55:45 +00004252 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 Py_UCS4 ch;
4254 /* remaining bytes at the end? (size should be divisible by 4) */
4255 if (e-q<4) {
4256 if (consumed)
4257 break;
4258 errmsg = "truncated data";
4259 startinpos = ((const char *)q)-starts;
4260 endinpos = ((const char *)e)-starts;
4261 goto utf32Error;
4262 /* The remaining input chars are ignored if the callback
4263 chooses to skip the input */
4264 }
4265 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4266 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004267
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 if (ch >= 0x110000)
4269 {
4270 errmsg = "codepoint not in range(0x110000)";
4271 startinpos = ((const char *)q)-starts;
4272 endinpos = startinpos+4;
4273 goto utf32Error;
4274 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004275#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 if (ch >= 0x10000)
4277 {
4278 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4279 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4280 }
4281 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004282#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 *p++ = ch;
4284 q += 4;
4285 continue;
4286 utf32Error:
4287 outpos = p-PyUnicode_AS_UNICODE(unicode);
4288 if (unicode_decode_call_errorhandler(
4289 errors, &errorHandler,
4290 "utf32", errmsg,
4291 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4292 &unicode, &outpos, &p))
4293 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004294 }
4295
4296 if (byteorder)
4297 *byteorder = bo;
4298
4299 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004301
4302 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004303 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004304 goto onError;
4305
4306 Py_XDECREF(errorHandler);
4307 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004308 if (PyUnicode_READY(unicode) == -1) {
4309 Py_DECREF(unicode);
4310 return NULL;
4311 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004312 return (PyObject *)unicode;
4313
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004315 Py_DECREF(unicode);
4316 Py_XDECREF(errorHandler);
4317 Py_XDECREF(exc);
4318 return NULL;
4319}
4320
4321PyObject *
4322PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 Py_ssize_t size,
4324 const char *errors,
4325 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004326{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004327 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004328 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004329 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004330#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004331 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004332#else
4333 const int pairs = 0;
4334#endif
4335 /* Offsets from p for storing byte pairs in the right order. */
4336#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4337 int iorder[] = {0, 1, 2, 3};
4338#else
4339 int iorder[] = {3, 2, 1, 0};
4340#endif
4341
Benjamin Peterson29060642009-01-31 22:14:21 +00004342#define STORECHAR(CH) \
4343 do { \
4344 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4345 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4346 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4347 p[iorder[0]] = (CH) & 0xff; \
4348 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004349 } while(0)
4350
4351 /* In narrow builds we can output surrogate pairs as one codepoint,
4352 so we need less space. */
4353#ifndef Py_UNICODE_WIDE
4354 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4356 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4357 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004358#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004359 nsize = (size - pairs + (byteorder == 0));
4360 bytesize = nsize * 4;
4361 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004363 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004364 if (v == NULL)
4365 return NULL;
4366
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004367 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004368 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004371 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004372
4373 if (byteorder == -1) {
4374 /* force LE */
4375 iorder[0] = 0;
4376 iorder[1] = 1;
4377 iorder[2] = 2;
4378 iorder[3] = 3;
4379 }
4380 else if (byteorder == 1) {
4381 /* force BE */
4382 iorder[0] = 3;
4383 iorder[1] = 2;
4384 iorder[2] = 1;
4385 iorder[3] = 0;
4386 }
4387
4388 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004390#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4392 Py_UCS4 ch2 = *s;
4393 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4394 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4395 s++;
4396 size--;
4397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004398 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004399#endif
4400 STORECHAR(ch);
4401 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004402
4403 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004404 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004405#undef STORECHAR
4406}
4407
Alexander Belopolsky40018472011-02-26 01:02:56 +00004408PyObject *
4409PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004410{
4411 if (!PyUnicode_Check(unicode)) {
4412 PyErr_BadArgument();
4413 return NULL;
4414 }
4415 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 PyUnicode_GET_SIZE(unicode),
4417 NULL,
4418 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004419}
4420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421/* --- UTF-16 Codec ------------------------------------------------------- */
4422
Tim Peters772747b2001-08-09 22:21:55 +00004423PyObject *
4424PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 Py_ssize_t size,
4426 const char *errors,
4427 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428{
Walter Dörwald69652032004-09-07 20:24:22 +00004429 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4430}
4431
Antoine Pitrouab868312009-01-10 15:40:25 +00004432/* Two masks for fast checking of whether a C 'long' may contain
4433 UTF16-encoded surrogate characters. This is an efficient heuristic,
4434 assuming that non-surrogate characters with a code point >= 0x8000 are
4435 rare in most input.
4436 FAST_CHAR_MASK is used when the input is in native byte ordering,
4437 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004438*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004439#if (SIZEOF_LONG == 8)
4440# define FAST_CHAR_MASK 0x8000800080008000L
4441# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4442#elif (SIZEOF_LONG == 4)
4443# define FAST_CHAR_MASK 0x80008000L
4444# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4445#else
4446# error C 'long' size should be either 4 or 8!
4447#endif
4448
Walter Dörwald69652032004-09-07 20:24:22 +00004449PyObject *
4450PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 Py_ssize_t size,
4452 const char *errors,
4453 int *byteorder,
4454 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004455{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004457 Py_ssize_t startinpos;
4458 Py_ssize_t endinpos;
4459 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 PyUnicodeObject *unicode;
4461 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004463 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004465 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004466 /* Offsets from q for retrieving byte pairs in the right order. */
4467#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4468 int ihi = 1, ilo = 0;
4469#else
4470 int ihi = 0, ilo = 1;
4471#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 PyObject *errorHandler = NULL;
4473 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474
4475 /* Note: size will always be longer than the resulting Unicode
4476 character count */
4477 unicode = _PyUnicode_New(size);
4478 if (!unicode)
4479 return NULL;
4480 if (size == 0)
4481 return (PyObject *)unicode;
4482
4483 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004484 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004485 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004486 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487
4488 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004489 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004491 /* Check for BOM marks (U+FEFF) in the input and adjust current
4492 byte order setting accordingly. In native mode, the leading BOM
4493 mark is skipped, in all other modes, it is copied to the output
4494 stream as-is (giving a ZWNBSP character). */
4495 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004496 if (size >= 2) {
4497 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004498#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 if (bom == 0xFEFF) {
4500 q += 2;
4501 bo = -1;
4502 }
4503 else if (bom == 0xFFFE) {
4504 q += 2;
4505 bo = 1;
4506 }
Tim Petersced69f82003-09-16 20:30:58 +00004507#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 if (bom == 0xFEFF) {
4509 q += 2;
4510 bo = 1;
4511 }
4512 else if (bom == 0xFFFE) {
4513 q += 2;
4514 bo = -1;
4515 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004516#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Tim Peters772747b2001-08-09 22:21:55 +00004520 if (bo == -1) {
4521 /* force LE */
4522 ihi = 1;
4523 ilo = 0;
4524 }
4525 else if (bo == 1) {
4526 /* force BE */
4527 ihi = 0;
4528 ilo = 1;
4529 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004530#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4531 native_ordering = ilo < ihi;
4532#else
4533 native_ordering = ilo > ihi;
4534#endif
Tim Peters772747b2001-08-09 22:21:55 +00004535
Antoine Pitrouab868312009-01-10 15:40:25 +00004536 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004537 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004539 /* First check for possible aligned read of a C 'long'. Unaligned
4540 reads are more expensive, better to defer to another iteration. */
4541 if (!((size_t) q & LONG_PTR_MASK)) {
4542 /* Fast path for runs of non-surrogate chars. */
4543 register const unsigned char *_q = q;
4544 Py_UNICODE *_p = p;
4545 if (native_ordering) {
4546 /* Native ordering is simple: as long as the input cannot
4547 possibly contain a surrogate char, do an unrolled copy
4548 of several 16-bit code points to the target object.
4549 The non-surrogate check is done on several input bytes
4550 at a time (as many as a C 'long' can contain). */
4551 while (_q < aligned_end) {
4552 unsigned long data = * (unsigned long *) _q;
4553 if (data & FAST_CHAR_MASK)
4554 break;
4555 _p[0] = ((unsigned short *) _q)[0];
4556 _p[1] = ((unsigned short *) _q)[1];
4557#if (SIZEOF_LONG == 8)
4558 _p[2] = ((unsigned short *) _q)[2];
4559 _p[3] = ((unsigned short *) _q)[3];
4560#endif
4561 _q += SIZEOF_LONG;
4562 _p += SIZEOF_LONG / 2;
4563 }
4564 }
4565 else {
4566 /* Byteswapped ordering is similar, but we must decompose
4567 the copy bytewise, and take care of zero'ing out the
4568 upper bytes if the target object is in 32-bit units
4569 (that is, in UCS-4 builds). */
4570 while (_q < aligned_end) {
4571 unsigned long data = * (unsigned long *) _q;
4572 if (data & SWAPPED_FAST_CHAR_MASK)
4573 break;
4574 /* Zero upper bytes in UCS-4 builds */
4575#if (Py_UNICODE_SIZE > 2)
4576 _p[0] = 0;
4577 _p[1] = 0;
4578#if (SIZEOF_LONG == 8)
4579 _p[2] = 0;
4580 _p[3] = 0;
4581#endif
4582#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004583 /* Issue #4916; UCS-4 builds on big endian machines must
4584 fill the two last bytes of each 4-byte unit. */
4585#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4586# define OFF 2
4587#else
4588# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004589#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004590 ((unsigned char *) _p)[OFF + 1] = _q[0];
4591 ((unsigned char *) _p)[OFF + 0] = _q[1];
4592 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4593 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4594#if (SIZEOF_LONG == 8)
4595 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4596 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4597 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4598 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4599#endif
4600#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004601 _q += SIZEOF_LONG;
4602 _p += SIZEOF_LONG / 2;
4603 }
4604 }
4605 p = _p;
4606 q = _q;
4607 if (q >= e)
4608 break;
4609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611
Benjamin Peterson14339b62009-01-31 16:36:08 +00004612 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004613
4614 if (ch < 0xD800 || ch > 0xDFFF) {
4615 *p++ = ch;
4616 continue;
4617 }
4618
4619 /* UTF-16 code pair: */
4620 if (q > e) {
4621 errmsg = "unexpected end of data";
4622 startinpos = (((const char *)q) - 2) - starts;
4623 endinpos = ((const char *)e) + 1 - starts;
4624 goto utf16Error;
4625 }
4626 if (0xD800 <= ch && ch <= 0xDBFF) {
4627 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4628 q += 2;
4629 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004630#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 *p++ = ch;
4632 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004633#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004635#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 continue;
4637 }
4638 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004639 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 startinpos = (((const char *)q)-4)-starts;
4641 endinpos = startinpos+2;
4642 goto utf16Error;
4643 }
4644
Benjamin Peterson14339b62009-01-31 16:36:08 +00004645 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 errmsg = "illegal encoding";
4647 startinpos = (((const char *)q)-2)-starts;
4648 endinpos = startinpos+2;
4649 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004650
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 utf16Error:
4652 outpos = p - PyUnicode_AS_UNICODE(unicode);
4653 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004654 errors,
4655 &errorHandler,
4656 "utf16", errmsg,
4657 &starts,
4658 (const char **)&e,
4659 &startinpos,
4660 &endinpos,
4661 &exc,
4662 (const char **)&q,
4663 &unicode,
4664 &outpos,
4665 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004668 /* remaining byte at the end? (size should be even) */
4669 if (e == q) {
4670 if (!consumed) {
4671 errmsg = "truncated data";
4672 startinpos = ((const char *)q) - starts;
4673 endinpos = ((const char *)e) + 1 - starts;
4674 outpos = p - PyUnicode_AS_UNICODE(unicode);
4675 if (unicode_decode_call_errorhandler(
4676 errors,
4677 &errorHandler,
4678 "utf16", errmsg,
4679 &starts,
4680 (const char **)&e,
4681 &startinpos,
4682 &endinpos,
4683 &exc,
4684 (const char **)&q,
4685 &unicode,
4686 &outpos,
4687 &p))
4688 goto onError;
4689 /* The remaining input chars are ignored if the callback
4690 chooses to skip the input */
4691 }
4692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693
4694 if (byteorder)
4695 *byteorder = bo;
4696
Walter Dörwald69652032004-09-07 20:24:22 +00004697 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004699
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 goto onError;
4703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004706 if (PyUnicode_READY(unicode) == -1) {
4707 Py_DECREF(unicode);
4708 return NULL;
4709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 return (PyObject *)unicode;
4711
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 Py_XDECREF(errorHandler);
4715 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 return NULL;
4717}
4718
Antoine Pitrouab868312009-01-10 15:40:25 +00004719#undef FAST_CHAR_MASK
4720#undef SWAPPED_FAST_CHAR_MASK
4721
Tim Peters772747b2001-08-09 22:21:55 +00004722PyObject *
4723PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 Py_ssize_t size,
4725 const char *errors,
4726 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004728 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004729 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004730 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004731#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004732 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004733#else
4734 const int pairs = 0;
4735#endif
Tim Peters772747b2001-08-09 22:21:55 +00004736 /* Offsets from p for storing byte pairs in the right order. */
4737#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4738 int ihi = 1, ilo = 0;
4739#else
4740 int ihi = 0, ilo = 1;
4741#endif
4742
Benjamin Peterson29060642009-01-31 22:14:21 +00004743#define STORECHAR(CH) \
4744 do { \
4745 p[ihi] = ((CH) >> 8) & 0xff; \
4746 p[ilo] = (CH) & 0xff; \
4747 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004748 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004750#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004751 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 if (s[i] >= 0x10000)
4753 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004754#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004755 /* 2 * (size + pairs + (byteorder == 0)) */
4756 if (size > PY_SSIZE_T_MAX ||
4757 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004759 nsize = size + pairs + (byteorder == 0);
4760 bytesize = nsize * 2;
4761 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004763 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 if (v == NULL)
4765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004767 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004769 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004770 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004771 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004772
4773 if (byteorder == -1) {
4774 /* force LE */
4775 ihi = 1;
4776 ilo = 0;
4777 }
4778 else if (byteorder == 1) {
4779 /* force BE */
4780 ihi = 0;
4781 ilo = 1;
4782 }
4783
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004784 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 Py_UNICODE ch = *s++;
4786 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004787#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 if (ch >= 0x10000) {
4789 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4790 ch = 0xD800 | ((ch-0x10000) >> 10);
4791 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004792#endif
Tim Peters772747b2001-08-09 22:21:55 +00004793 STORECHAR(ch);
4794 if (ch2)
4795 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004796 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797
4798 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004799 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004800#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801}
4802
Alexander Belopolsky40018472011-02-26 01:02:56 +00004803PyObject *
4804PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805{
4806 if (!PyUnicode_Check(unicode)) {
4807 PyErr_BadArgument();
4808 return NULL;
4809 }
4810 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 PyUnicode_GET_SIZE(unicode),
4812 NULL,
4813 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814}
4815
4816/* --- Unicode Escape Codec ----------------------------------------------- */
4817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
4819 if all the escapes in the string make it still a valid ASCII string.
4820 Returns -1 if any escapes were found which cause the string to
4821 pop out of ASCII range. Otherwise returns the length of the
4822 required buffer to hold the string.
4823 */
4824Py_ssize_t
4825length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
4826{
4827 const unsigned char *p = (const unsigned char *)s;
4828 const unsigned char *end = p + size;
4829 Py_ssize_t length = 0;
4830
4831 if (size < 0)
4832 return -1;
4833
4834 for (; p < end; ++p) {
4835 if (*p > 127) {
4836 /* Non-ASCII */
4837 return -1;
4838 }
4839 else if (*p != '\\') {
4840 /* Normal character */
4841 ++length;
4842 }
4843 else {
4844 /* Backslash-escape, check next char */
4845 ++p;
4846 /* Escape sequence reaches till end of string or
4847 non-ASCII follow-up. */
4848 if (p >= end || *p > 127)
4849 return -1;
4850 switch (*p) {
4851 case '\n':
4852 /* backslash + \n result in zero characters */
4853 break;
4854 case '\\': case '\'': case '\"':
4855 case 'b': case 'f': case 't':
4856 case 'n': case 'r': case 'v': case 'a':
4857 ++length;
4858 break;
4859 case '0': case '1': case '2': case '3':
4860 case '4': case '5': case '6': case '7':
4861 case 'x': case 'u': case 'U': case 'N':
4862 /* these do not guarantee ASCII characters */
4863 return -1;
4864 default:
4865 /* count the backslash + the other character */
4866 length += 2;
4867 }
4868 }
4869 }
4870 return length;
4871}
4872
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  -- PyUnicode_WCHAR_KIND selects the Py_UNICODE buffer layout,
            any other value selects a buffer of unsigned char
   buf   -- start of the output buffer
   index -- element index to write (evaluated exactly once)
   value -- value to store, cast to the element type (evaluated once) */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value into a Py_UNICODE buffer.  The macro reads a variable
   named `kind` from the expansion site and asserts that it is
   PyUnicode_WCHAR_KIND.  Wrapped in do/while so that, like
   WRITE_ASCII_OR_WSTR, it always behaves as a single statement. */
#define WRITE_WSTR(buf, index, value) \
    do { \
        assert(kind == PyUnicode_WCHAR_KIND); \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)
4886
4887
Fredrik Lundh06d12682001-01-24 07:59:11 +00004888static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004889
Alexander Belopolsky40018472011-02-26 01:02:56 +00004890PyObject *
4891PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004892 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004893 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t startinpos;
4897 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004902 char* message;
4903 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004904 PyObject *errorHandler = NULL;
4905 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906 Py_ssize_t ascii_length;
4907 Py_ssize_t i;
4908 int kind;
4909 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004911 ascii_length = length_of_escaped_ascii_string(s, size);
4912
4913 /* After length_of_escaped_ascii_string() there are two alternatives,
4914 either the string is pure ASCII with named escapes like \n, etc.
4915 and we determined it's exact size (common case)
4916 or it contains \x, \u, ... escape sequences. then we create a
4917 legacy wchar string and resize it at the end of this function. */
4918 if (ascii_length >= 0) {
4919 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4920 if (!v)
4921 goto onError;
4922 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4923 kind = PyUnicode_1BYTE_KIND;
4924 data = PyUnicode_DATA(v);
4925 }
4926 else {
4927 /* Escaped strings will always be longer than the resulting
4928 Unicode string, so we start with size here and then reduce the
4929 length after conversion to the true value.
4930 (but if the error callback returns a long replacement string
4931 we'll have to allocate more space) */
4932 v = _PyUnicode_New(size);
4933 if (!v)
4934 goto onError;
4935 kind = PyUnicode_WCHAR_KIND;
4936 data = PyUnicode_AS_UNICODE(v);
4937 }
4938
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 if (size == 0)
4940 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004941 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004943
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 while (s < end) {
4945 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004946 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949 if (kind == PyUnicode_WCHAR_KIND) {
4950 assert(i < _PyUnicode_WSTR_LENGTH(v));
4951 }
4952 else {
4953 /* The only case in which i == ascii_length is a backslash
4954 followed by a newline. */
4955 assert(i <= ascii_length);
4956 }
4957
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 /* Non-escape characters are interpreted as Unicode ordinals */
4959 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004960 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 continue;
4962 }
4963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 /* \ - Escapes */
4966 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004967 c = *s++;
4968 if (s > end)
4969 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004970
4971 if (kind == PyUnicode_WCHAR_KIND) {
4972 assert(i < _PyUnicode_WSTR_LENGTH(v));
4973 }
4974 else {
4975 /* The only case in which i == ascii_length is a backslash
4976 followed by a newline. */
4977 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4978 }
4979
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004980 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004984 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4985 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4986 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4987 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4988 /* FF */
4989 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4990 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4991 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4992 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4993 /* VT */
4994 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
4995 /* BEL, not classic C */
4996 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 case '0': case '1': case '2': case '3':
5000 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005001 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005002 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005003 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005004 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005005 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005007 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 break;
5009
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 /* hex escapes */
5011 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005013 digits = 2;
5014 message = "truncated \\xXX escape";
5015 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005019 digits = 4;
5020 message = "truncated \\uXXXX escape";
5021 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005024 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005025 digits = 8;
5026 message = "truncated \\UXXXXXXXX escape";
5027 hexescape:
5028 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005029 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 if (s+digits>end) {
5031 endinpos = size;
5032 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 errors, &errorHandler,
5034 "unicodeescape", "end of string in escape sequence",
5035 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005038 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039 goto nextByte;
5040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005041 for (j = 0; j < digits; ++j) {
5042 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005043 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044 endinpos = (s+j+1)-starts;
5045 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 errors, &errorHandler,
5048 "unicodeescape", message,
5049 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005051 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005053 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005054 }
5055 chr = (chr<<4) & ~0xF;
5056 if (c >= '0' && c <= '9')
5057 chr += c - '0';
5058 else if (c >= 'a' && c <= 'f')
5059 chr += 10 + c - 'a';
5060 else
5061 chr += 10 + c - 'A';
5062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005063 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005064 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 /* _decoding_error will have already written into the
5066 target buffer. */
5067 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005068 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005069 /* when we get here, chr is a 32-bit unicode character */
5070 if (chr <= 0xffff)
5071 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005072 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005073 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005074 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005075 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005076#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005077 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005078#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005079 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5081 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005082#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005083 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005085 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 errors, &errorHandler,
5088 "unicodeescape", "illegal Unicode character",
5089 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005090 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005091 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005092 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005093 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005094 break;
5095
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005097 case 'N':
5098 message = "malformed \\N character escape";
5099 if (ucnhash_CAPI == NULL) {
5100 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005101 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5102 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005103 if (ucnhash_CAPI == NULL)
5104 goto ucnhashError;
5105 }
5106 if (*s == '{') {
5107 const char *start = s+1;
5108 /* look for the closing brace */
5109 while (*s != '}' && s < end)
5110 s++;
5111 if (s > start && s < end && *s == '}') {
5112 /* found a name. look it up in the unicode database */
5113 message = "unknown Unicode character name";
5114 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5116 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005117 goto store;
5118 }
5119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005120 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 errors, &errorHandler,
5124 "unicodeescape", message,
5125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005126 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005127 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005128 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005129 break;
5130
5131 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005132 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005133 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 message = "\\ at end of string";
5135 s--;
5136 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005137 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 errors, &errorHandler,
5140 "unicodeescape", message,
5141 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005143 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005144 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005145 }
5146 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005147 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5148 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005149 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005150 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155 /* Ensure the length prediction worked in case of ASCII strings */
5156 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5157
5158 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5159 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005161 Py_XDECREF(errorHandler);
5162 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005164
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005166 PyErr_SetString(
5167 PyExc_UnicodeError,
5168 "\\N escapes not supported (can't load unicodedata module)"
5169 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005170 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 Py_XDECREF(errorHandler);
5172 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005173 return NULL;
5174
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 Py_XDECREF(errorHandler);
5178 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 return NULL;
5180}
5181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005182#undef WRITE_ASCII_OR_WSTR
5183#undef WRITE_WSTR
5184
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   XXX: PyUnicode_EncodeUnicodeEscape() below takes no "quotes" argument,
   so the paragraph above appears to be stale.

*/
5191
Walter Dörwald79e913e2007-05-12 11:08:06 +00005192static const char *hexdigits = "0123456789abcdef";
5193
Alexander Belopolsky40018472011-02-26 01:02:56 +00005194PyObject *
5195PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005196 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005198 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005201#ifdef Py_UNICODE_WIDE
5202 const Py_ssize_t expandsize = 10;
5203#else
5204 const Py_ssize_t expandsize = 6;
5205#endif
5206
Thomas Wouters89f507f2006-12-13 04:49:30 +00005207 /* XXX(nnorwitz): rather than over-allocating, it would be
5208 better to choose a different scheme. Perhaps scan the
5209 first N-chars of the string and allocate based on that size.
5210 */
5211 /* Initial allocation is based on the longest-possible unichr
5212 escape.
5213
5214 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5215 unichr, so in this case it's the longest unichr escape. In
5216 narrow (UTF-16) builds this is five chars per source unichr
5217 since there are two unichrs in the surrogate pair, so in narrow
5218 (UTF-16) builds it's not the longest unichr escape.
5219
5220 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5221 so in the narrow (UTF-16) build case it's the longest unichr
5222 escape.
5223 */
5224
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005225 if (size == 0)
5226 return PyBytes_FromStringAndSize(NULL, 0);
5227
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005228 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005230
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 2
5233 + expandsize*size
5234 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 if (repr == NULL)
5236 return NULL;
5237
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005238 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 while (size-- > 0) {
5241 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005242
Walter Dörwald79e913e2007-05-12 11:08:06 +00005243 /* Escape backslashes */
5244 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 *p++ = '\\';
5246 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005247 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005248 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005249
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005250#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005251 /* Map 21-bit characters to '\U00xxxxxx' */
5252 else if (ch >= 0x10000) {
5253 *p++ = '\\';
5254 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005255 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5256 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5257 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5258 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5259 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5260 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5261 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5262 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005264 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005265#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5267 else if (ch >= 0xD800 && ch < 0xDC00) {
5268 Py_UNICODE ch2;
5269 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005270
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 ch2 = *s++;
5272 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005273 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5275 *p++ = '\\';
5276 *p++ = 'U';
5277 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5278 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5279 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5280 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5281 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5282 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5283 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5284 *p++ = hexdigits[ucs & 0x0000000F];
5285 continue;
5286 }
5287 /* Fall through: isolated surrogates are copied as-is */
5288 s--;
5289 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005290 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005291#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005292
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005294 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 *p++ = '\\';
5296 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005297 *p++ = hexdigits[(ch >> 12) & 0x000F];
5298 *p++ = hexdigits[(ch >> 8) & 0x000F];
5299 *p++ = hexdigits[(ch >> 4) & 0x000F];
5300 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005302
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005303 /* Map special whitespace to '\t', \n', '\r' */
5304 else if (ch == '\t') {
5305 *p++ = '\\';
5306 *p++ = 't';
5307 }
5308 else if (ch == '\n') {
5309 *p++ = '\\';
5310 *p++ = 'n';
5311 }
5312 else if (ch == '\r') {
5313 *p++ = '\\';
5314 *p++ = 'r';
5315 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005316
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005317 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005318 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005320 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005321 *p++ = hexdigits[(ch >> 4) & 0x000F];
5322 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005323 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005324
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 /* Copy everything else as-is */
5326 else
5327 *p++ = (char) ch;
5328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005330 assert(p - PyBytes_AS_STRING(repr) > 0);
5331 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5332 return NULL;
5333 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334}
5335
Alexander Belopolsky40018472011-02-26 01:02:56 +00005336PyObject *
5337PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005339 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 if (!PyUnicode_Check(unicode)) {
5341 PyErr_BadArgument();
5342 return NULL;
5343 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005344 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5345 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005346 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347}
5348
5349/* --- Raw Unicode Escape Codec ------------------------------------------- */
5350
Alexander Belopolsky40018472011-02-26 01:02:56 +00005351PyObject *
5352PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005353 Py_ssize_t size,
5354 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005357 Py_ssize_t startinpos;
5358 Py_ssize_t endinpos;
5359 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 const char *end;
5363 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 PyObject *errorHandler = NULL;
5365 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 /* Escaped strings will always be longer than the resulting
5368 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 length after conversion to the true value. (But decoding error
5370 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 v = _PyUnicode_New(size);
5372 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 end = s + size;
5378 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 unsigned char c;
5380 Py_UCS4 x;
5381 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005382 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 /* Non-escape characters are interpreted as Unicode ordinals */
5385 if (*s != '\\') {
5386 *p++ = (unsigned char)*s++;
5387 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 startinpos = s-starts;
5390
5391 /* \u-escapes are only interpreted iff the number of leading
5392 backslashes if odd */
5393 bs = s;
5394 for (;s < end;) {
5395 if (*s != '\\')
5396 break;
5397 *p++ = (unsigned char)*s++;
5398 }
5399 if (((s - bs) & 1) == 0 ||
5400 s >= end ||
5401 (*s != 'u' && *s != 'U')) {
5402 continue;
5403 }
5404 p--;
5405 count = *s=='u' ? 4 : 8;
5406 s++;
5407
5408 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5409 outpos = p-PyUnicode_AS_UNICODE(v);
5410 for (x = 0, i = 0; i < count; ++i, ++s) {
5411 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005412 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 endinpos = s-starts;
5414 if (unicode_decode_call_errorhandler(
5415 errors, &errorHandler,
5416 "rawunicodeescape", "truncated \\uXXXX",
5417 &starts, &end, &startinpos, &endinpos, &exc, &s,
5418 &v, &outpos, &p))
5419 goto onError;
5420 goto nextByte;
5421 }
5422 x = (x<<4) & ~0xF;
5423 if (c >= '0' && c <= '9')
5424 x += c - '0';
5425 else if (c >= 'a' && c <= 'f')
5426 x += 10 + c - 'a';
5427 else
5428 x += 10 + c - 'A';
5429 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005430 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 /* UCS-2 character */
5432 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005433 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 /* UCS-4 character. Either store directly, or as
5435 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005436#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005438#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 x -= 0x10000L;
5440 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5441 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005442#endif
5443 } else {
5444 endinpos = s-starts;
5445 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005446 if (unicode_decode_call_errorhandler(
5447 errors, &errorHandler,
5448 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 &starts, &end, &startinpos, &endinpos, &exc, &s,
5450 &v, &outpos, &p))
5451 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005452 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 nextByte:
5454 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005456 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 Py_XDECREF(errorHandler);
5459 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 if (PyUnicode_READY(v) == -1) {
5461 Py_DECREF(v);
5462 return NULL;
5463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005465
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 Py_XDECREF(errorHandler);
5469 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 return NULL;
5471}
5472
Alexander Belopolsky40018472011-02-26 01:02:56 +00005473PyObject *
5474PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005475 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005477 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 char *p;
5479 char *q;
5480
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005481#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005482 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005483#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005485#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005486
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005487 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005489
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005490 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 if (repr == NULL)
5492 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005493 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005494 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005496 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 while (size-- > 0) {
5498 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005499#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 /* Map 32-bit characters to '\Uxxxxxxxx' */
5501 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005502 *p++ = '\\';
5503 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005504 *p++ = hexdigits[(ch >> 28) & 0xf];
5505 *p++ = hexdigits[(ch >> 24) & 0xf];
5506 *p++ = hexdigits[(ch >> 20) & 0xf];
5507 *p++ = hexdigits[(ch >> 16) & 0xf];
5508 *p++ = hexdigits[(ch >> 12) & 0xf];
5509 *p++ = hexdigits[(ch >> 8) & 0xf];
5510 *p++ = hexdigits[(ch >> 4) & 0xf];
5511 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005512 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005513 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005514#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5516 if (ch >= 0xD800 && ch < 0xDC00) {
5517 Py_UNICODE ch2;
5518 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005519
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 ch2 = *s++;
5521 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005522 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5524 *p++ = '\\';
5525 *p++ = 'U';
5526 *p++ = hexdigits[(ucs >> 28) & 0xf];
5527 *p++ = hexdigits[(ucs >> 24) & 0xf];
5528 *p++ = hexdigits[(ucs >> 20) & 0xf];
5529 *p++ = hexdigits[(ucs >> 16) & 0xf];
5530 *p++ = hexdigits[(ucs >> 12) & 0xf];
5531 *p++ = hexdigits[(ucs >> 8) & 0xf];
5532 *p++ = hexdigits[(ucs >> 4) & 0xf];
5533 *p++ = hexdigits[ucs & 0xf];
5534 continue;
5535 }
5536 /* Fall through: isolated surrogates are copied as-is */
5537 s--;
5538 size++;
5539 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005540#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 /* Map 16-bit characters to '\uxxxx' */
5542 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 *p++ = '\\';
5544 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005545 *p++ = hexdigits[(ch >> 12) & 0xf];
5546 *p++ = hexdigits[(ch >> 8) & 0xf];
5547 *p++ = hexdigits[(ch >> 4) & 0xf];
5548 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 /* Copy everything else as-is */
5551 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 *p++ = (char) ch;
5553 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005554 size = p - q;
5555
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005556 assert(size > 0);
5557 if (_PyBytes_Resize(&repr, size) < 0)
5558 return NULL;
5559 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560}
5561
Alexander Belopolsky40018472011-02-26 01:02:56 +00005562PyObject *
5563PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005565 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005567 PyErr_BadArgument();
5568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005570 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5571 PyUnicode_GET_SIZE(unicode));
5572
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005573 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574}
5575
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005576/* --- Unicode Internal Codec ------------------------------------------- */
5577
Alexander Belopolsky40018472011-02-26 01:02:56 +00005578PyObject *
5579_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005580 Py_ssize_t size,
5581 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005582{
5583 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005584 Py_ssize_t startinpos;
5585 Py_ssize_t endinpos;
5586 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005587 PyUnicodeObject *v;
5588 Py_UNICODE *p;
5589 const char *end;
5590 const char *reason;
5591 PyObject *errorHandler = NULL;
5592 PyObject *exc = NULL;
5593
Neal Norwitzd43069c2006-01-08 01:12:10 +00005594#ifdef Py_UNICODE_WIDE
5595 Py_UNICODE unimax = PyUnicode_GetMax();
5596#endif
5597
Thomas Wouters89f507f2006-12-13 04:49:30 +00005598 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005599 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5600 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5603 as string was created with the old API. */
5604 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005606 p = PyUnicode_AS_UNICODE(v);
5607 end = s + size;
5608
5609 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005610 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005611 /* We have to sanity check the raw data, otherwise doom looms for
5612 some malformed UCS-4 data. */
5613 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005614#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005615 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005616#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005617 end-s < Py_UNICODE_SIZE
5618 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005620 startinpos = s - starts;
5621 if (end-s < Py_UNICODE_SIZE) {
5622 endinpos = end-starts;
5623 reason = "truncated input";
5624 }
5625 else {
5626 endinpos = s - starts + Py_UNICODE_SIZE;
5627 reason = "illegal code point (> 0x10FFFF)";
5628 }
5629 outpos = p - PyUnicode_AS_UNICODE(v);
5630 if (unicode_decode_call_errorhandler(
5631 errors, &errorHandler,
5632 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005633 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005634 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005635 goto onError;
5636 }
5637 }
5638 else {
5639 p++;
5640 s += Py_UNICODE_SIZE;
5641 }
5642 }
5643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005644 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005645 goto onError;
5646 Py_XDECREF(errorHandler);
5647 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 if (PyUnicode_READY(v) == -1) {
5649 Py_DECREF(v);
5650 return NULL;
5651 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005652 return (PyObject *)v;
5653
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005655 Py_XDECREF(v);
5656 Py_XDECREF(errorHandler);
5657 Py_XDECREF(exc);
5658 return NULL;
5659}
5660
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661/* --- Latin-1 Codec ------------------------------------------------------ */
5662
Alexander Belopolsky40018472011-02-26 01:02:56 +00005663PyObject *
5664PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005665 Py_ssize_t size,
5666 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005669 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670}
5671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673static void
5674make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005675 const char *encoding,
5676 const Py_UNICODE *unicode, Py_ssize_t size,
5677 Py_ssize_t startpos, Py_ssize_t endpos,
5678 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 *exceptionObject = PyUnicodeEncodeError_Create(
5682 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 }
5684 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5686 goto onError;
5687 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5688 goto onError;
5689 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5690 goto onError;
5691 return;
5692 onError:
5693 Py_DECREF(*exceptionObject);
5694 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 }
5696}
5697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005699static void
5700raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005701 const char *encoding,
5702 const Py_UNICODE *unicode, Py_ssize_t size,
5703 Py_ssize_t startpos, Py_ssize_t endpos,
5704 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705{
5706 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710}
5711
5712/* error handling callback helper:
5713 build arguments, call the callback and check the arguments,
5714 put the result into newpos and return the replacement string, which
5715 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005716static PyObject *
5717unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005718 PyObject **errorHandler,
5719 const char *encoding, const char *reason,
5720 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5721 Py_ssize_t startpos, Py_ssize_t endpos,
5722 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005724 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725
5726 PyObject *restuple;
5727 PyObject *resunicode;
5728
5729 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733 }
5734
5735 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739
5740 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005745 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 Py_DECREF(restuple);
5747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005749 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 &resunicode, newpos)) {
5751 Py_DECREF(restuple);
5752 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005754 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5755 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5756 Py_DECREF(restuple);
5757 return NULL;
5758 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005761 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5763 Py_DECREF(restuple);
5764 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005765 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 Py_INCREF(resunicode);
5767 Py_DECREF(restuple);
5768 return resunicode;
5769}
5770
Alexander Belopolsky40018472011-02-26 01:02:56 +00005771static PyObject *
5772unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005773 Py_ssize_t size,
5774 const char *errors,
5775 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776{
5777 /* output object */
5778 PyObject *res;
5779 /* pointers to the beginning and end+1 of input */
5780 const Py_UNICODE *startp = p;
5781 const Py_UNICODE *endp = p + size;
5782 /* pointer to the beginning of the unencodable characters */
5783 /* const Py_UNICODE *badp = NULL; */
5784 /* pointer into the output */
5785 char *str;
5786 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005788 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5789 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 PyObject *errorHandler = NULL;
5791 PyObject *exc = NULL;
5792 /* the following variable is used for caching string comparisons
5793 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5794 int known_errorHandler = -1;
5795
5796 /* allocate enough for a simple encoding without
5797 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005798 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005799 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005800 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005802 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005803 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 ressize = size;
5805
5806 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 /* can we encode this? */
5810 if (c<limit) {
5811 /* no overflow check, because we know that the space is enough */
5812 *str++ = (char)c;
5813 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005814 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 else {
5816 Py_ssize_t unicodepos = p-startp;
5817 Py_ssize_t requiredsize;
5818 PyObject *repunicode;
5819 Py_ssize_t repsize;
5820 Py_ssize_t newpos;
5821 Py_ssize_t respos;
5822 Py_UNICODE *uni2;
5823 /* startpos for collecting unencodable chars */
5824 const Py_UNICODE *collstart = p;
5825 const Py_UNICODE *collend = p;
5826 /* find all unecodable characters */
5827 while ((collend < endp) && ((*collend)>=limit))
5828 ++collend;
5829 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5830 if (known_errorHandler==-1) {
5831 if ((errors==NULL) || (!strcmp(errors, "strict")))
5832 known_errorHandler = 1;
5833 else if (!strcmp(errors, "replace"))
5834 known_errorHandler = 2;
5835 else if (!strcmp(errors, "ignore"))
5836 known_errorHandler = 3;
5837 else if (!strcmp(errors, "xmlcharrefreplace"))
5838 known_errorHandler = 4;
5839 else
5840 known_errorHandler = 0;
5841 }
5842 switch (known_errorHandler) {
5843 case 1: /* strict */
5844 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5845 goto onError;
5846 case 2: /* replace */
5847 while (collstart++<collend)
5848 *str++ = '?'; /* fall through */
5849 case 3: /* ignore */
5850 p = collend;
5851 break;
5852 case 4: /* xmlcharrefreplace */
5853 respos = str - PyBytes_AS_STRING(res);
5854 /* determine replacement size (temporarily (mis)uses p) */
5855 for (p = collstart, repsize = 0; p < collend; ++p) {
5856 if (*p<10)
5857 repsize += 2+1+1;
5858 else if (*p<100)
5859 repsize += 2+2+1;
5860 else if (*p<1000)
5861 repsize += 2+3+1;
5862 else if (*p<10000)
5863 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005864#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 else
5866 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005867#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 else if (*p<100000)
5869 repsize += 2+5+1;
5870 else if (*p<1000000)
5871 repsize += 2+6+1;
5872 else
5873 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005874#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 }
5876 requiredsize = respos+repsize+(endp-collend);
5877 if (requiredsize > ressize) {
5878 if (requiredsize<2*ressize)
5879 requiredsize = 2*ressize;
5880 if (_PyBytes_Resize(&res, requiredsize))
5881 goto onError;
5882 str = PyBytes_AS_STRING(res) + respos;
5883 ressize = requiredsize;
5884 }
5885 /* generate replacement (temporarily (mis)uses p) */
5886 for (p = collstart; p < collend; ++p) {
5887 str += sprintf(str, "&#%d;", (int)*p);
5888 }
5889 p = collend;
5890 break;
5891 default:
5892 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5893 encoding, reason, startp, size, &exc,
5894 collstart-startp, collend-startp, &newpos);
5895 if (repunicode == NULL)
5896 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005897 if (PyBytes_Check(repunicode)) {
5898 /* Directly copy bytes result to output. */
5899 repsize = PyBytes_Size(repunicode);
5900 if (repsize > 1) {
5901 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005902 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005903 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5904 Py_DECREF(repunicode);
5905 goto onError;
5906 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005907 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005908 ressize += repsize-1;
5909 }
5910 memcpy(str, PyBytes_AsString(repunicode), repsize);
5911 str += repsize;
5912 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005913 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005914 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005915 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 /* need more space? (at least enough for what we
5917 have+the replacement+the rest of the string, so
5918 we won't have to check space for encodable characters) */
5919 respos = str - PyBytes_AS_STRING(res);
5920 repsize = PyUnicode_GET_SIZE(repunicode);
5921 requiredsize = respos+repsize+(endp-collend);
5922 if (requiredsize > ressize) {
5923 if (requiredsize<2*ressize)
5924 requiredsize = 2*ressize;
5925 if (_PyBytes_Resize(&res, requiredsize)) {
5926 Py_DECREF(repunicode);
5927 goto onError;
5928 }
5929 str = PyBytes_AS_STRING(res) + respos;
5930 ressize = requiredsize;
5931 }
5932 /* check if there is anything unencodable in the replacement
5933 and copy it to the output */
5934 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5935 c = *uni2;
5936 if (c >= limit) {
5937 raise_encode_exception(&exc, encoding, startp, size,
5938 unicodepos, unicodepos+1, reason);
5939 Py_DECREF(repunicode);
5940 goto onError;
5941 }
5942 *str = (char)c;
5943 }
5944 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005945 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005946 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005947 }
5948 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 /* Resize if we allocated to much */
5950 size = str - PyBytes_AS_STRING(res);
5951 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005952 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 if (_PyBytes_Resize(&res, size) < 0)
5954 goto onError;
5955 }
5956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 Py_XDECREF(errorHandler);
5958 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 return res;
5960
5961 onError:
5962 Py_XDECREF(res);
5963 Py_XDECREF(errorHandler);
5964 Py_XDECREF(exc);
5965 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966}
5967
Alexander Belopolsky40018472011-02-26 01:02:56 +00005968PyObject *
5969PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005970 Py_ssize_t size,
5971 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974}
5975
Alexander Belopolsky40018472011-02-26 01:02:56 +00005976PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005977_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
5979 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 PyErr_BadArgument();
5981 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 if (PyUnicode_READY(unicode) == -1)
5984 return NULL;
5985 /* Fast path: if it is a one-byte string, construct
5986 bytes object directly. */
5987 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5988 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5989 PyUnicode_GET_LENGTH(unicode));
5990 /* Non-Latin-1 characters present. Defer to above function to
5991 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005994 errors);
5995}
5996
5997PyObject*
5998PyUnicode_AsLatin1String(PyObject *unicode)
5999{
6000 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001}
6002
6003/* --- 7-bit ASCII Codec -------------------------------------------------- */
6004
Alexander Belopolsky40018472011-02-26 01:02:56 +00006005PyObject *
6006PyUnicode_DecodeASCII(const char *s,
6007 Py_ssize_t size,
6008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 PyUnicodeObject *v;
6012 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 Py_ssize_t startinpos;
6014 Py_ssize_t endinpos;
6015 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006017 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 PyObject *errorHandler = NULL;
6019 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006020 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023 if (size == 1 && *(unsigned char*)s < 128)
6024 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6025
6026 /* Fast path. Assume the input actually *is* ASCII, and allocate
6027 a single-block Unicode object with that assumption. If there is
6028 an error, drop the object and start over. */
6029 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6030 if (v == NULL)
6031 goto onError;
6032 d = PyUnicode_1BYTE_DATA(v);
6033 for (i = 0; i < size; i++) {
6034 unsigned char ch = ((unsigned char*)s)[i];
6035 if (ch < 128)
6036 d[i] = ch;
6037 else
6038 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006040 if (i == size)
6041 return (PyObject*)v;
6042 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006043
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 v = _PyUnicode_New(size);
6045 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 e = s + size;
6051 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 register unsigned char c = (unsigned char)*s;
6053 if (c < 128) {
6054 *p++ = c;
6055 ++s;
6056 }
6057 else {
6058 startinpos = s-starts;
6059 endinpos = startinpos + 1;
6060 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6061 if (unicode_decode_call_errorhandler(
6062 errors, &errorHandler,
6063 "ascii", "ordinal not in range(128)",
6064 &starts, &e, &startinpos, &endinpos, &exc, &s,
6065 &v, &outpos, &p))
6066 goto onError;
6067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006069 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 Py_XDECREF(errorHandler);
6073 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006074 if (PyUnicode_READY(v) == -1) {
6075 Py_DECREF(v);
6076 return NULL;
6077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 Py_XDECREF(errorHandler);
6083 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 return NULL;
6085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
6088PyUnicode_EncodeASCII(const Py_UNICODE *p,
6089 Py_ssize_t size,
6090 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093}
6094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006096_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
6098 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 PyErr_BadArgument();
6100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102 if (PyUnicode_READY(unicode) == -1)
6103 return NULL;
6104 /* Fast path: if it is an ASCII-only string, construct bytes object
6105 directly. Else defer to above function to raise the exception. */
6106 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6107 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6108 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006111 errors);
6112}
6113
6114PyObject *
6115PyUnicode_AsASCIIString(PyObject *unicode)
6116{
6117 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118}
6119
Victor Stinner99b95382011-07-04 14:23:54 +02006120#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006121
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006122/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006123
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006124#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006125#define NEED_RETRY
6126#endif
6127
6128/* XXX This code is limited to "true" double-byte encodings, as
6129 a) it assumes an incomplete character consists of a single byte, and
6130 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006132
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); in addition, the character
   located by CharPrev() must either be the byte itself, not start with
   a lead byte, or be exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6144
6145/*
6146 * Decode MBCS string into unicode object. If 'final' is set, converts
6147 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6148 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149static int
6150decode_mbcs(PyUnicodeObject **v,
6151 const char *s, /* MBCS string */
6152 int size, /* sizeof MBCS string */
6153 int final,
6154 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006155{
6156 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006157 Py_ssize_t n;
6158 DWORD usize;
6159 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006160
6161 assert(size >= 0);
6162
Victor Stinner554f3f02010-06-16 23:33:54 +00006163 /* check and handle 'errors' arg */
6164 if (errors==NULL || strcmp(errors, "strict")==0)
6165 flags = MB_ERR_INVALID_CHARS;
6166 else if (strcmp(errors, "ignore")==0)
6167 flags = 0;
6168 else {
6169 PyErr_Format(PyExc_ValueError,
6170 "mbcs encoding does not support errors='%s'",
6171 errors);
6172 return -1;
6173 }
6174
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006175 /* Skip trailing lead-byte unless 'final' is set */
6176 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006178
6179 /* First get the size of the result */
6180 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006181 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6182 if (usize==0)
6183 goto mbcs_decode_error;
6184 } else
6185 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186
6187 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 /* Create unicode object */
6189 *v = _PyUnicode_New(usize);
6190 if (*v == NULL)
6191 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006192 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006193 }
6194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 /* Extend unicode object */
6196 n = PyUnicode_GET_SIZE(*v);
6197 if (_PyUnicode_Resize(v, n + usize) < 0)
6198 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006199 }
6200
6201 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006202 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006204 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6205 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006207 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006208 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006209
6210mbcs_decode_error:
6211 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6212 we raise a UnicodeDecodeError - else it is a 'generic'
6213 windows error
6214 */
6215 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6216 /* Ideally, we should get reason from FormatMessage - this
6217 is the Windows 2000 English version of the message
6218 */
6219 PyObject *exc = NULL;
6220 const char *reason = "No mapping for the Unicode character exists "
6221 "in the target multi-byte code page.";
6222 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6223 if (exc != NULL) {
6224 PyCodec_StrictErrors(exc);
6225 Py_DECREF(exc);
6226 }
6227 } else {
6228 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6229 }
6230 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006231}
6232
Alexander Belopolsky40018472011-02-26 01:02:56 +00006233PyObject *
6234PyUnicode_DecodeMBCSStateful(const char *s,
6235 Py_ssize_t size,
6236 const char *errors,
6237 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006238{
6239 PyUnicodeObject *v = NULL;
6240 int done;
6241
6242 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244
6245#ifdef NEED_RETRY
6246 retry:
6247 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006248 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006249 else
6250#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006251 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006252
6253 if (done < 0) {
6254 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006256 }
6257
6258 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006260
6261#ifdef NEED_RETRY
6262 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 s += done;
6264 size -= done;
6265 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006266 }
6267#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006268 if (PyUnicode_READY(v) == -1) {
6269 Py_DECREF(v);
6270 return NULL;
6271 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006272 return (PyObject *)v;
6273}
6274
Alexander Belopolsky40018472011-02-26 01:02:56 +00006275PyObject *
6276PyUnicode_DecodeMBCS(const char *s,
6277 Py_ssize_t size,
6278 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006279{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006280 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6281}
6282
6283/*
6284 * Convert unicode into string object (MBCS).
6285 * Returns 0 if succeed, -1 otherwise.
6286 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006287static int
6288encode_mbcs(PyObject **repr,
6289 const Py_UNICODE *p, /* unicode */
6290 int size, /* size of unicode */
6291 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006292{
Victor Stinner554f3f02010-06-16 23:33:54 +00006293 BOOL usedDefaultChar = FALSE;
6294 BOOL *pusedDefaultChar;
6295 int mbcssize;
6296 Py_ssize_t n;
6297 PyObject *exc = NULL;
6298 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006299
6300 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006301
Victor Stinner554f3f02010-06-16 23:33:54 +00006302 /* check and handle 'errors' arg */
6303 if (errors==NULL || strcmp(errors, "strict")==0) {
6304 flags = WC_NO_BEST_FIT_CHARS;
6305 pusedDefaultChar = &usedDefaultChar;
6306 } else if (strcmp(errors, "replace")==0) {
6307 flags = 0;
6308 pusedDefaultChar = NULL;
6309 } else {
6310 PyErr_Format(PyExc_ValueError,
6311 "mbcs encoding does not support errors='%s'",
6312 errors);
6313 return -1;
6314 }
6315
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006316 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006317 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006318 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6319 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 if (mbcssize == 0) {
6321 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6322 return -1;
6323 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 /* If we used a default char, then we failed! */
6325 if (pusedDefaultChar && *pusedDefaultChar)
6326 goto mbcs_encode_error;
6327 } else {
6328 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006329 }
6330
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006331 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 /* Create string object */
6333 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6334 if (*repr == NULL)
6335 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006336 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006337 }
6338 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 /* Extend string object */
6340 n = PyBytes_Size(*repr);
6341 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6342 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006343 }
6344
6345 /* Do the conversion */
6346 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006348 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6349 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6351 return -1;
6352 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006353 if (pusedDefaultChar && *pusedDefaultChar)
6354 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006355 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006356 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006357
6358mbcs_encode_error:
6359 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6360 Py_XDECREF(exc);
6361 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006362}
6363
Alexander Belopolsky40018472011-02-26 01:02:56 +00006364PyObject *
6365PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6366 Py_ssize_t size,
6367 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006368{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006369 PyObject *repr = NULL;
6370 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006371
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006372#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006374 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006375 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006376 else
6377#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006378 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006379
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006380 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 Py_XDECREF(repr);
6382 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384
6385#ifdef NEED_RETRY
6386 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 p += INT_MAX;
6388 size -= INT_MAX;
6389 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006390 }
6391#endif
6392
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006393 return repr;
6394}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396PyObject *
6397PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006398{
6399 if (!PyUnicode_Check(unicode)) {
6400 PyErr_BadArgument();
6401 return NULL;
6402 }
6403 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 PyUnicode_GET_SIZE(unicode),
6405 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006406}
6407
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006408#undef NEED_RETRY
6409
Victor Stinner99b95382011-07-04 14:23:54 +02006410#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412/* --- Character Mapping Codec -------------------------------------------- */
6413
Alexander Belopolsky40018472011-02-26 01:02:56 +00006414PyObject *
6415PyUnicode_DecodeCharmap(const char *s,
6416 Py_ssize_t size,
6417 PyObject *mapping,
6418 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006421 Py_ssize_t startinpos;
6422 Py_ssize_t endinpos;
6423 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 PyUnicodeObject *v;
6426 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 PyObject *errorHandler = NULL;
6429 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006430 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006431 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006432
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 /* Default to Latin-1 */
6434 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436
6437 v = _PyUnicode_New(size);
6438 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006444 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 mapstring = PyUnicode_AS_UNICODE(mapping);
6446 maplen = PyUnicode_GET_SIZE(mapping);
6447 while (s < e) {
6448 unsigned char ch = *s;
6449 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 if (ch < maplen)
6452 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 if (x == 0xfffe) {
6455 /* undefined mapping */
6456 outpos = p-PyUnicode_AS_UNICODE(v);
6457 startinpos = s-starts;
6458 endinpos = startinpos+1;
6459 if (unicode_decode_call_errorhandler(
6460 errors, &errorHandler,
6461 "charmap", "character maps to <undefined>",
6462 &starts, &e, &startinpos, &endinpos, &exc, &s,
6463 &v, &outpos, &p)) {
6464 goto onError;
6465 }
6466 continue;
6467 }
6468 *p++ = x;
6469 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006470 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006471 }
6472 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 while (s < e) {
6474 unsigned char ch = *s;
6475 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006476
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6478 w = PyLong_FromLong((long)ch);
6479 if (w == NULL)
6480 goto onError;
6481 x = PyObject_GetItem(mapping, w);
6482 Py_DECREF(w);
6483 if (x == NULL) {
6484 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6485 /* No mapping found means: mapping is undefined. */
6486 PyErr_Clear();
6487 x = Py_None;
6488 Py_INCREF(x);
6489 } else
6490 goto onError;
6491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006492
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 /* Apply mapping */
6494 if (PyLong_Check(x)) {
6495 long value = PyLong_AS_LONG(x);
6496 if (value < 0 || value > 65535) {
6497 PyErr_SetString(PyExc_TypeError,
6498 "character mapping must be in range(65536)");
6499 Py_DECREF(x);
6500 goto onError;
6501 }
6502 *p++ = (Py_UNICODE)value;
6503 }
6504 else if (x == Py_None) {
6505 /* undefined mapping */
6506 outpos = p-PyUnicode_AS_UNICODE(v);
6507 startinpos = s-starts;
6508 endinpos = startinpos+1;
6509 if (unicode_decode_call_errorhandler(
6510 errors, &errorHandler,
6511 "charmap", "character maps to <undefined>",
6512 &starts, &e, &startinpos, &endinpos, &exc, &s,
6513 &v, &outpos, &p)) {
6514 Py_DECREF(x);
6515 goto onError;
6516 }
6517 Py_DECREF(x);
6518 continue;
6519 }
6520 else if (PyUnicode_Check(x)) {
6521 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006522
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 if (targetsize == 1)
6524 /* 1-1 mapping */
6525 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006526
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 else if (targetsize > 1) {
6528 /* 1-n mapping */
6529 if (targetsize > extrachars) {
6530 /* resize first */
6531 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6532 Py_ssize_t needed = (targetsize - extrachars) + \
6533 (targetsize << 2);
6534 extrachars += needed;
6535 /* XXX overflow detection missing */
6536 if (_PyUnicode_Resize(&v,
6537 PyUnicode_GET_SIZE(v) + needed) < 0) {
6538 Py_DECREF(x);
6539 goto onError;
6540 }
6541 p = PyUnicode_AS_UNICODE(v) + oldpos;
6542 }
6543 Py_UNICODE_COPY(p,
6544 PyUnicode_AS_UNICODE(x),
6545 targetsize);
6546 p += targetsize;
6547 extrachars -= targetsize;
6548 }
6549 /* 1-0 mapping: skip the character */
6550 }
6551 else {
6552 /* wrong return value */
6553 PyErr_SetString(PyExc_TypeError,
6554 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006555 Py_DECREF(x);
6556 goto onError;
6557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 Py_DECREF(x);
6559 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 }
6562 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 Py_XDECREF(errorHandler);
6566 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006567 if (PyUnicode_READY(v) == -1) {
6568 Py_DECREF(v);
6569 return NULL;
6570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006572
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 Py_XDECREF(errorHandler);
6575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 Py_XDECREF(v);
6577 return NULL;
6578}
6579
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006580/* Charmap encoding: the lookup table */
6581
Alexander Belopolsky40018472011-02-26 01:02:56 +00006582struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 PyObject_HEAD
6584 unsigned char level1[32];
6585 int count2, count3;
6586 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006587};
6588
6589static PyObject*
6590encoding_map_size(PyObject *obj, PyObject* args)
6591{
6592 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006593 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006595}
6596
6597static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 PyDoc_STR("Return the size (in bytes) of this object") },
6600 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006601};
6602
6603static void
6604encoding_map_dealloc(PyObject* o)
6605{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006607}
6608
6609static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006610 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 "EncodingMap", /*tp_name*/
6612 sizeof(struct encoding_map), /*tp_basicsize*/
6613 0, /*tp_itemsize*/
6614 /* methods */
6615 encoding_map_dealloc, /*tp_dealloc*/
6616 0, /*tp_print*/
6617 0, /*tp_getattr*/
6618 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006619 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 0, /*tp_repr*/
6621 0, /*tp_as_number*/
6622 0, /*tp_as_sequence*/
6623 0, /*tp_as_mapping*/
6624 0, /*tp_hash*/
6625 0, /*tp_call*/
6626 0, /*tp_str*/
6627 0, /*tp_getattro*/
6628 0, /*tp_setattro*/
6629 0, /*tp_as_buffer*/
6630 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6631 0, /*tp_doc*/
6632 0, /*tp_traverse*/
6633 0, /*tp_clear*/
6634 0, /*tp_richcompare*/
6635 0, /*tp_weaklistoffset*/
6636 0, /*tp_iter*/
6637 0, /*tp_iternext*/
6638 encoding_map_methods, /*tp_methods*/
6639 0, /*tp_members*/
6640 0, /*tp_getset*/
6641 0, /*tp_base*/
6642 0, /*tp_dict*/
6643 0, /*tp_descr_get*/
6644 0, /*tp_descr_set*/
6645 0, /*tp_dictoffset*/
6646 0, /*tp_init*/
6647 0, /*tp_alloc*/
6648 0, /*tp_new*/
6649 0, /*tp_free*/
6650 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006651};
6652
6653PyObject*
6654PyUnicode_BuildEncodingMap(PyObject* string)
6655{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006656 PyObject *result;
6657 struct encoding_map *mresult;
6658 int i;
6659 int need_dict = 0;
6660 unsigned char level1[32];
6661 unsigned char level2[512];
6662 unsigned char *mlevel1, *mlevel2, *mlevel3;
6663 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 int kind;
6665 void *data;
6666 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006669 PyErr_BadArgument();
6670 return NULL;
6671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672 kind = PyUnicode_KIND(string);
6673 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006674 memset(level1, 0xFF, sizeof level1);
6675 memset(level2, 0xFF, sizeof level2);
6676
6677 /* If there isn't a one-to-one mapping of NULL to \0,
6678 or if there are non-BMP characters, we need to use
6679 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006680 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006681 need_dict = 1;
6682 for (i = 1; i < 256; i++) {
6683 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006684 ch = PyUnicode_READ(kind, data, i);
6685 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006686 need_dict = 1;
6687 break;
6688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006689 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006690 /* unmapped character */
6691 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692 l1 = ch >> 11;
6693 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006694 if (level1[l1] == 0xFF)
6695 level1[l1] = count2++;
6696 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006697 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006698 }
6699
6700 if (count2 >= 0xFF || count3 >= 0xFF)
6701 need_dict = 1;
6702
6703 if (need_dict) {
6704 PyObject *result = PyDict_New();
6705 PyObject *key, *value;
6706 if (!result)
6707 return NULL;
6708 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006709 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006710 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006711 if (!key || !value)
6712 goto failed1;
6713 if (PyDict_SetItem(result, key, value) == -1)
6714 goto failed1;
6715 Py_DECREF(key);
6716 Py_DECREF(value);
6717 }
6718 return result;
6719 failed1:
6720 Py_XDECREF(key);
6721 Py_XDECREF(value);
6722 Py_DECREF(result);
6723 return NULL;
6724 }
6725
6726 /* Create a three-level trie */
6727 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6728 16*count2 + 128*count3 - 1);
6729 if (!result)
6730 return PyErr_NoMemory();
6731 PyObject_Init(result, &EncodingMapType);
6732 mresult = (struct encoding_map*)result;
6733 mresult->count2 = count2;
6734 mresult->count3 = count3;
6735 mlevel1 = mresult->level1;
6736 mlevel2 = mresult->level23;
6737 mlevel3 = mresult->level23 + 16*count2;
6738 memcpy(mlevel1, level1, 32);
6739 memset(mlevel2, 0xFF, 16*count2);
6740 memset(mlevel3, 0, 128*count3);
6741 count3 = 0;
6742 for (i = 1; i < 256; i++) {
6743 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006745 /* unmapped character */
6746 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 o1 = PyUnicode_READ(kind, data, i)>>11;
6748 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006749 i2 = 16*mlevel1[o1] + o2;
6750 if (mlevel2[i2] == 0xFF)
6751 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006752 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006753 i3 = 128*mlevel2[i2] + o3;
6754 mlevel3[i3] = i;
6755 }
6756 return result;
6757}
6758
6759static int
6760encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6761{
6762 struct encoding_map *map = (struct encoding_map*)mapping;
6763 int l1 = c>>11;
6764 int l2 = (c>>7) & 0xF;
6765 int l3 = c & 0x7F;
6766 int i;
6767
6768#ifdef Py_UNICODE_WIDE
6769 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006771 }
6772#endif
6773 if (c == 0)
6774 return 0;
6775 /* level 1*/
6776 i = map->level1[l1];
6777 if (i == 0xFF) {
6778 return -1;
6779 }
6780 /* level 2*/
6781 i = map->level23[16*i+l2];
6782 if (i == 0xFF) {
6783 return -1;
6784 }
6785 /* level 3 */
6786 i = map->level23[16*map->count2 + 128*i + l3];
6787 if (i == 0) {
6788 return -1;
6789 }
6790 return i;
6791}
6792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793/* Lookup the character ch in the mapping. If the character
6794 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006795 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006796static PyObject *
6797charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798{
Christian Heimes217cfd12007-12-02 14:31:20 +00006799 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006800 PyObject *x;
6801
6802 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 x = PyObject_GetItem(mapping, w);
6805 Py_DECREF(w);
6806 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6808 /* No mapping found means: mapping is undefined. */
6809 PyErr_Clear();
6810 x = Py_None;
6811 Py_INCREF(x);
6812 return x;
6813 } else
6814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006816 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006818 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 long value = PyLong_AS_LONG(x);
6820 if (value < 0 || value > 255) {
6821 PyErr_SetString(PyExc_TypeError,
6822 "character mapping must be in range(256)");
6823 Py_DECREF(x);
6824 return NULL;
6825 }
6826 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006828 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 /* wrong return value */
6832 PyErr_Format(PyExc_TypeError,
6833 "character mapping must return integer, bytes or None, not %.400s",
6834 x->ob_type->tp_name);
6835 Py_DECREF(x);
6836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 }
6838}
6839
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006840static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006841charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006842{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006843 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6844 /* exponentially overallocate to minimize reallocations */
6845 if (requiredsize < 2*outsize)
6846 requiredsize = 2*outsize;
6847 if (_PyBytes_Resize(outobj, requiredsize))
6848 return -1;
6849 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006850}
6851
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006856 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 space is available. Return a new reference to the object that
6858 was put in the output buffer, or Py_None, if the mapping was undefined
6859 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006860 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006861static charmapencode_result
6862charmapencode_output(Py_UNICODE c, PyObject *mapping,
6863 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006865 PyObject *rep;
6866 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006867 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868
Christian Heimes90aa7642007-12-19 02:45:37 +00006869 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006870 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006872 if (res == -1)
6873 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 if (outsize<requiredsize)
6875 if (charmapencode_resize(outobj, outpos, requiredsize))
6876 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006877 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 outstart[(*outpos)++] = (char)res;
6879 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006880 }
6881
6882 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006885 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 Py_DECREF(rep);
6887 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006888 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 if (PyLong_Check(rep)) {
6890 Py_ssize_t requiredsize = *outpos+1;
6891 if (outsize<requiredsize)
6892 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6893 Py_DECREF(rep);
6894 return enc_EXCEPTION;
6895 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006896 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006898 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 else {
6900 const char *repchars = PyBytes_AS_STRING(rep);
6901 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6902 Py_ssize_t requiredsize = *outpos+repsize;
6903 if (outsize<requiredsize)
6904 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6905 Py_DECREF(rep);
6906 return enc_EXCEPTION;
6907 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006908 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 memcpy(outstart + *outpos, repchars, repsize);
6910 *outpos += repsize;
6911 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006913 Py_DECREF(rep);
6914 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915}
6916
6917/* handle an error in PyUnicode_EncodeCharmap
6918 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006919static int
6920charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006923 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006924 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925{
6926 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927 Py_ssize_t repsize;
6928 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 Py_UNICODE *uni2;
6930 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006931 Py_ssize_t collstartpos = *inpos;
6932 Py_ssize_t collendpos = *inpos+1;
6933 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006934 char *encoding = "charmap";
6935 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006936 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938 /* find all unencodable characters */
6939 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006940 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006941 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 int res = encoding_map_lookup(p[collendpos], mapping);
6943 if (res != -1)
6944 break;
6945 ++collendpos;
6946 continue;
6947 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006948
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 rep = charmapencode_lookup(p[collendpos], mapping);
6950 if (rep==NULL)
6951 return -1;
6952 else if (rep!=Py_None) {
6953 Py_DECREF(rep);
6954 break;
6955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006956 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958 }
6959 /* cache callback name lookup
6960 * (if not done yet, i.e. it's the first error) */
6961 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 if ((errors==NULL) || (!strcmp(errors, "strict")))
6963 *known_errorHandler = 1;
6964 else if (!strcmp(errors, "replace"))
6965 *known_errorHandler = 2;
6966 else if (!strcmp(errors, "ignore"))
6967 *known_errorHandler = 3;
6968 else if (!strcmp(errors, "xmlcharrefreplace"))
6969 *known_errorHandler = 4;
6970 else
6971 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006972 }
6973 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006974 case 1: /* strict */
6975 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6976 return -1;
6977 case 2: /* replace */
6978 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 x = charmapencode_output('?', mapping, res, respos);
6980 if (x==enc_EXCEPTION) {
6981 return -1;
6982 }
6983 else if (x==enc_FAILED) {
6984 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6985 return -1;
6986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006987 }
6988 /* fall through */
6989 case 3: /* ignore */
6990 *inpos = collendpos;
6991 break;
6992 case 4: /* xmlcharrefreplace */
6993 /* generate replacement (temporarily (mis)uses p) */
6994 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 char buffer[2+29+1+1];
6996 char *cp;
6997 sprintf(buffer, "&#%d;", (int)p[collpos]);
6998 for (cp = buffer; *cp; ++cp) {
6999 x = charmapencode_output(*cp, mapping, res, respos);
7000 if (x==enc_EXCEPTION)
7001 return -1;
7002 else if (x==enc_FAILED) {
7003 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7004 return -1;
7005 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007006 }
7007 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007008 *inpos = collendpos;
7009 break;
7010 default:
7011 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 encoding, reason, p, size, exceptionObject,
7013 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007016 if (PyBytes_Check(repunicode)) {
7017 /* Directly copy bytes result to output. */
7018 Py_ssize_t outsize = PyBytes_Size(*res);
7019 Py_ssize_t requiredsize;
7020 repsize = PyBytes_Size(repunicode);
7021 requiredsize = *respos + repsize;
7022 if (requiredsize > outsize)
7023 /* Make room for all additional bytes. */
7024 if (charmapencode_resize(res, respos, requiredsize)) {
7025 Py_DECREF(repunicode);
7026 return -1;
7027 }
7028 memcpy(PyBytes_AsString(*res) + *respos,
7029 PyBytes_AsString(repunicode), repsize);
7030 *respos += repsize;
7031 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007032 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007033 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035 /* generate replacement */
7036 repsize = PyUnicode_GET_SIZE(repunicode);
7037 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 x = charmapencode_output(*uni2, mapping, res, respos);
7039 if (x==enc_EXCEPTION) {
7040 return -1;
7041 }
7042 else if (x==enc_FAILED) {
7043 Py_DECREF(repunicode);
7044 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7045 return -1;
7046 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007047 }
7048 *inpos = newpos;
7049 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007050 }
7051 return 0;
7052}
7053
Alexander Belopolsky40018472011-02-26 01:02:56 +00007054PyObject *
7055PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7056 Py_ssize_t size,
7057 PyObject *mapping,
7058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 /* output object */
7061 PyObject *res = NULL;
7062 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007063 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007065 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066 PyObject *errorHandler = NULL;
7067 PyObject *exc = NULL;
7068 /* the following variable is used for caching string comparisons
7069 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7070 * 3=ignore, 4=xmlcharrefreplace */
7071 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
7073 /* Default to Latin-1 */
7074 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077 /* allocate enough for a simple encoding without
7078 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007079 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007080 if (res == NULL)
7081 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007082 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007085 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 /* try to encode it */
7087 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7088 if (x==enc_EXCEPTION) /* error */
7089 goto onError;
7090 if (x==enc_FAILED) { /* unencodable character */
7091 if (charmap_encoding_error(p, size, &inpos, mapping,
7092 &exc,
7093 &known_errorHandler, &errorHandler, errors,
7094 &res, &respos)) {
7095 goto onError;
7096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 else
7099 /* done with this character => adjust input position */
7100 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007103 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007104 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007105 if (_PyBytes_Resize(&res, respos) < 0)
7106 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 Py_XDECREF(exc);
7109 Py_XDECREF(errorHandler);
7110 return res;
7111
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007113 Py_XDECREF(res);
7114 Py_XDECREF(exc);
7115 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 return NULL;
7117}
7118
Alexander Belopolsky40018472011-02-26 01:02:56 +00007119PyObject *
7120PyUnicode_AsCharmapString(PyObject *unicode,
7121 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122{
7123 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 PyErr_BadArgument();
7125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 }
7127 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 PyUnicode_GET_SIZE(unicode),
7129 mapping,
7130 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131}
7132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134static void
7135make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007136 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007137 Py_ssize_t startpos, Py_ssize_t endpos,
7138 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007141 *exceptionObject = _PyUnicodeTranslateError_Create(
7142 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 }
7144 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7146 goto onError;
7147 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7148 goto onError;
7149 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7150 goto onError;
7151 return;
7152 onError:
7153 Py_DECREF(*exceptionObject);
7154 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 }
7156}
7157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007158/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159static void
7160raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007161 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007162 Py_ssize_t startpos, Py_ssize_t endpos,
7163 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164{
7165 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007167 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169}
7170
7171/* error handling callback helper:
7172 build arguments, call the callback and check the arguments,
7173 put the result into newpos and return the replacement string, which
7174 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007175static PyObject *
7176unicode_translate_call_errorhandler(const char *errors,
7177 PyObject **errorHandler,
7178 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007179 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007180 Py_ssize_t startpos, Py_ssize_t endpos,
7181 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007183 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007185 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186 PyObject *restuple;
7187 PyObject *resunicode;
7188
7189 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 }
7194
7195 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007196 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007197 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199
7200 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007204 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007205 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 Py_DECREF(restuple);
7207 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208 }
7209 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 &resunicode, &i_newpos)) {
7211 Py_DECREF(restuple);
7212 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007215 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007216 else
7217 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007218 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7220 Py_DECREF(restuple);
7221 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007223 Py_INCREF(resunicode);
7224 Py_DECREF(restuple);
7225 return resunicode;
7226}
7227
7228/* Lookup the character ch in the mapping and put the result in result,
7229 which must be decrefed by the caller.
7230 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007231static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007232charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007233{
Christian Heimes217cfd12007-12-02 14:31:20 +00007234 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007235 PyObject *x;
7236
7237 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239 x = PyObject_GetItem(mapping, w);
7240 Py_DECREF(w);
7241 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7243 /* No mapping found means: use 1:1 mapping. */
7244 PyErr_Clear();
7245 *result = NULL;
7246 return 0;
7247 } else
7248 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 }
7250 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 *result = x;
7252 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007254 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 long value = PyLong_AS_LONG(x);
7256 long max = PyUnicode_GetMax();
7257 if (value < 0 || value > max) {
7258 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007259 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 Py_DECREF(x);
7261 return -1;
7262 }
7263 *result = x;
7264 return 0;
7265 }
7266 else if (PyUnicode_Check(x)) {
7267 *result = x;
7268 return 0;
7269 }
7270 else {
7271 /* wrong return value */
7272 PyErr_SetString(PyExc_TypeError,
7273 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007274 Py_DECREF(x);
7275 return -1;
7276 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277}
7278/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 if not reallocate and adjust various state variables.
7280 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007285 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007286 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 /* exponentially overallocate to minimize reallocations */
7288 if (requiredsize < 2 * oldsize)
7289 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7291 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007293 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294 }
7295 return 0;
7296}
7297/* lookup the character, put the result in the output string and adjust
7298 various state variables. Return a new reference to the object that
7299 was put in the output buffer in *result, or Py_None, if the mapping was
7300 undefined (in which case no character was written).
7301 The called must decref result.
7302 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007303static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007304charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7305 PyObject *mapping, Py_UCS4 **output,
7306 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007307 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007309 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7310 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007312 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007314 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007315 }
7316 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007318 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 }
7322 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007323 Py_ssize_t repsize;
7324 if (PyUnicode_READY(*res) == -1)
7325 return -1;
7326 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 if (repsize==1) {
7328 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 }
7331 else if (repsize!=0) {
7332 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 Py_ssize_t requiredsize = *opos +
7334 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007336 Py_ssize_t i;
7337 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007339 for(i = 0; i < repsize; i++)
7340 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 }
7343 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007345 return 0;
7346}
7347
Alexander Belopolsky40018472011-02-26 01:02:56 +00007348PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007349_PyUnicode_TranslateCharmap(PyObject *input,
7350 PyObject *mapping,
7351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007353 /* input object */
7354 char *idata;
7355 Py_ssize_t size, i;
7356 int kind;
7357 /* output buffer */
7358 Py_UCS4 *output = NULL;
7359 Py_ssize_t osize;
7360 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007361 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007362 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007363 char *reason = "character maps to <undefined>";
7364 PyObject *errorHandler = NULL;
7365 PyObject *exc = NULL;
7366 /* the following variable is used for caching string comparisons
7367 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7368 * 3=ignore, 4=xmlcharrefreplace */
7369 int known_errorHandler = -1;
7370
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 PyErr_BadArgument();
7373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007376 if (PyUnicode_READY(input) == -1)
7377 return NULL;
7378 idata = (char*)PyUnicode_DATA(input);
7379 kind = PyUnicode_KIND(input);
7380 size = PyUnicode_GET_LENGTH(input);
7381 i = 0;
7382
7383 if (size == 0) {
7384 Py_INCREF(input);
7385 return input;
7386 }
7387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 /* allocate enough for a simple 1:1 translation without
7389 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 osize = size;
7391 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7392 opos = 0;
7393 if (output == NULL) {
7394 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007398 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 /* try to encode it */
7400 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007401 if (charmaptranslate_output(input, i, mapping,
7402 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 Py_XDECREF(x);
7404 goto onError;
7405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007406 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007408 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 else { /* untranslatable character */
7410 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7411 Py_ssize_t repsize;
7412 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007413 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007415 Py_ssize_t collstart = i;
7416 Py_ssize_t collend = i+1;
7417 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 while (collend < size) {
7421 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 goto onError;
7423 Py_XDECREF(x);
7424 if (x!=Py_None)
7425 break;
7426 ++collend;
7427 }
7428 /* cache callback name lookup
7429 * (if not done yet, i.e. it's the first error) */
7430 if (known_errorHandler==-1) {
7431 if ((errors==NULL) || (!strcmp(errors, "strict")))
7432 known_errorHandler = 1;
7433 else if (!strcmp(errors, "replace"))
7434 known_errorHandler = 2;
7435 else if (!strcmp(errors, "ignore"))
7436 known_errorHandler = 3;
7437 else if (!strcmp(errors, "xmlcharrefreplace"))
7438 known_errorHandler = 4;
7439 else
7440 known_errorHandler = 0;
7441 }
7442 switch (known_errorHandler) {
7443 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 raise_translate_exception(&exc, input, collstart,
7445 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007446 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 case 2: /* replace */
7448 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007449 for (coll = collstart; coll<collend; coll++)
7450 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 /* fall through */
7452 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007453 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 break;
7455 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 /* generate replacement (temporarily (mis)uses i) */
7457 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 char buffer[2+29+1+1];
7459 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7461 if (charmaptranslate_makespace(&output, &osize,
7462 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 goto onError;
7464 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007467 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 break;
7469 default:
7470 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 reason, input, &exc,
7472 collstart, collend, &newpos);
7473 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 goto onError;
7475 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 repsize = PyUnicode_GET_LENGTH(repunicode);
7477 if (charmaptranslate_makespace(&output, &osize,
7478 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 Py_DECREF(repunicode);
7480 goto onError;
7481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482 for (uni2 = 0; repsize-->0; ++uni2)
7483 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7484 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007487 }
7488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7490 if (!res)
7491 goto onError;
7492 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493 Py_XDECREF(exc);
7494 Py_XDECREF(errorHandler);
7495 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 Py_XDECREF(exc);
7500 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 return NULL;
7502}
7503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504/* Deprecated. Use PyUnicode_Translate instead. */
7505PyObject *
7506PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7507 Py_ssize_t size,
7508 PyObject *mapping,
7509 const char *errors)
7510{
7511 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7512 if (!unicode)
7513 return NULL;
7514 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7515}
7516
Alexander Belopolsky40018472011-02-26 01:02:56 +00007517PyObject *
7518PyUnicode_Translate(PyObject *str,
7519 PyObject *mapping,
7520 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521{
7522 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007523
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 str = PyUnicode_FromObject(str);
7525 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007527 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 Py_DECREF(str);
7529 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007530
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 Py_XDECREF(str);
7533 return NULL;
7534}
Tim Petersced69f82003-09-16 20:30:58 +00007535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536static Py_UCS4
7537fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7538{
7539 /* No need to call PyUnicode_READY(self) because this function is only
7540 called as a callback from fixup() which does it already. */
7541 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7542 const int kind = PyUnicode_KIND(self);
7543 void *data = PyUnicode_DATA(self);
7544 Py_UCS4 maxchar = 0, ch, fixed;
7545 Py_ssize_t i;
7546
7547 for (i = 0; i < len; ++i) {
7548 ch = PyUnicode_READ(kind, data, i);
7549 fixed = 0;
7550 if (ch > 127) {
7551 if (Py_UNICODE_ISSPACE(ch))
7552 fixed = ' ';
7553 else {
7554 const int decimal = Py_UNICODE_TODECIMAL(ch);
7555 if (decimal >= 0)
7556 fixed = '0' + decimal;
7557 }
7558 if (fixed != 0) {
7559 if (fixed > maxchar)
7560 maxchar = fixed;
7561 PyUnicode_WRITE(kind, data, i, fixed);
7562 }
7563 else if (ch > maxchar)
7564 maxchar = ch;
7565 }
7566 else if (ch > maxchar)
7567 maxchar = ch;
7568 }
7569
7570 return maxchar;
7571}
7572
7573PyObject *
7574_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7575{
7576 if (!PyUnicode_Check(unicode)) {
7577 PyErr_BadInternalCall();
7578 return NULL;
7579 }
7580 if (PyUnicode_READY(unicode) == -1)
7581 return NULL;
7582 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7583 /* If the string is already ASCII, just return the same string */
7584 Py_INCREF(unicode);
7585 return unicode;
7586 }
7587 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7588}
7589
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007590PyObject *
7591PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7592 Py_ssize_t length)
7593{
7594 PyObject *result;
7595 Py_UNICODE *p; /* write pointer into result */
7596 Py_ssize_t i;
7597 /* Copy to a new string */
7598 result = (PyObject *)_PyUnicode_New(length);
7599 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7600 if (result == NULL)
7601 return result;
7602 p = PyUnicode_AS_UNICODE(result);
7603 /* Iterate over code points */
7604 for (i = 0; i < length; i++) {
7605 Py_UNICODE ch =s[i];
7606 if (ch > 127) {
7607 int decimal = Py_UNICODE_TODECIMAL(ch);
7608 if (decimal >= 0)
7609 p[i] = '0' + decimal;
7610 }
7611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7613 Py_DECREF(result);
7614 return NULL;
7615 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007616 return result;
7617}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007618/* --- Decimal Encoder ---------------------------------------------------- */
7619
Alexander Belopolsky40018472011-02-26 01:02:56 +00007620int
7621PyUnicode_EncodeDecimal(Py_UNICODE *s,
7622 Py_ssize_t length,
7623 char *output,
7624 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007625{
7626 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007627 PyObject *errorHandler = NULL;
7628 PyObject *exc = NULL;
7629 const char *encoding = "decimal";
7630 const char *reason = "invalid decimal Unicode string";
7631 /* the following variable is used for caching string comparisons
7632 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7633 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007634
7635 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 PyErr_BadArgument();
7637 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007638 }
7639
7640 p = s;
7641 end = s + length;
7642 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 register Py_UNICODE ch = *p;
7644 int decimal;
7645 PyObject *repunicode;
7646 Py_ssize_t repsize;
7647 Py_ssize_t newpos;
7648 Py_UNICODE *uni2;
7649 Py_UNICODE *collstart;
7650 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007651
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007653 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 ++p;
7655 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 decimal = Py_UNICODE_TODECIMAL(ch);
7658 if (decimal >= 0) {
7659 *output++ = '0' + decimal;
7660 ++p;
7661 continue;
7662 }
7663 if (0 < ch && ch < 256) {
7664 *output++ = (char)ch;
7665 ++p;
7666 continue;
7667 }
7668 /* All other characters are considered unencodable */
7669 collstart = p;
7670 collend = p+1;
7671 while (collend < end) {
7672 if ((0 < *collend && *collend < 256) ||
7673 !Py_UNICODE_ISSPACE(*collend) ||
7674 Py_UNICODE_TODECIMAL(*collend))
7675 break;
7676 }
7677 /* cache callback name lookup
7678 * (if not done yet, i.e. it's the first error) */
7679 if (known_errorHandler==-1) {
7680 if ((errors==NULL) || (!strcmp(errors, "strict")))
7681 known_errorHandler = 1;
7682 else if (!strcmp(errors, "replace"))
7683 known_errorHandler = 2;
7684 else if (!strcmp(errors, "ignore"))
7685 known_errorHandler = 3;
7686 else if (!strcmp(errors, "xmlcharrefreplace"))
7687 known_errorHandler = 4;
7688 else
7689 known_errorHandler = 0;
7690 }
7691 switch (known_errorHandler) {
7692 case 1: /* strict */
7693 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7694 goto onError;
7695 case 2: /* replace */
7696 for (p = collstart; p < collend; ++p)
7697 *output++ = '?';
7698 /* fall through */
7699 case 3: /* ignore */
7700 p = collend;
7701 break;
7702 case 4: /* xmlcharrefreplace */
7703 /* generate replacement (temporarily (mis)uses p) */
7704 for (p = collstart; p < collend; ++p)
7705 output += sprintf(output, "&#%d;", (int)*p);
7706 p = collend;
7707 break;
7708 default:
7709 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7710 encoding, reason, s, length, &exc,
7711 collstart-s, collend-s, &newpos);
7712 if (repunicode == NULL)
7713 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007714 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007715 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007716 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7717 Py_DECREF(repunicode);
7718 goto onError;
7719 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 /* generate replacement */
7721 repsize = PyUnicode_GET_SIZE(repunicode);
7722 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7723 Py_UNICODE ch = *uni2;
7724 if (Py_UNICODE_ISSPACE(ch))
7725 *output++ = ' ';
7726 else {
7727 decimal = Py_UNICODE_TODECIMAL(ch);
7728 if (decimal >= 0)
7729 *output++ = '0' + decimal;
7730 else if (0 < ch && ch < 256)
7731 *output++ = (char)ch;
7732 else {
7733 Py_DECREF(repunicode);
7734 raise_encode_exception(&exc, encoding,
7735 s, length, collstart-s, collend-s, reason);
7736 goto onError;
7737 }
7738 }
7739 }
7740 p = s + newpos;
7741 Py_DECREF(repunicode);
7742 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007743 }
7744 /* 0-terminate the output string */
7745 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 Py_XDECREF(exc);
7747 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007748 return 0;
7749
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007751 Py_XDECREF(exc);
7752 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007753 return -1;
7754}
7755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756/* --- Helpers ------------------------------------------------------------ */
7757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007758#include "stringlib/ucs1lib.h"
7759#include "stringlib/fastsearch.h"
7760#include "stringlib/partition.h"
7761#include "stringlib/split.h"
7762#include "stringlib/count.h"
7763#include "stringlib/find.h"
7764#include "stringlib/localeutil.h"
7765#include "stringlib/undef.h"
7766
7767#include "stringlib/ucs2lib.h"
7768#include "stringlib/fastsearch.h"
7769#include "stringlib/partition.h"
7770#include "stringlib/split.h"
7771#include "stringlib/count.h"
7772#include "stringlib/find.h"
7773#include "stringlib/localeutil.h"
7774#include "stringlib/undef.h"
7775
7776#include "stringlib/ucs4lib.h"
7777#include "stringlib/fastsearch.h"
7778#include "stringlib/partition.h"
7779#include "stringlib/split.h"
7780#include "stringlib/count.h"
7781#include "stringlib/find.h"
7782#include "stringlib/localeutil.h"
7783#include "stringlib/undef.h"
7784
7785static Py_ssize_t
7786any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7787 const Py_UCS1*, Py_ssize_t,
7788 Py_ssize_t, Py_ssize_t),
7789 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7790 const Py_UCS2*, Py_ssize_t,
7791 Py_ssize_t, Py_ssize_t),
7792 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7793 const Py_UCS4*, Py_ssize_t,
7794 Py_ssize_t, Py_ssize_t),
7795 PyObject* s1, PyObject* s2,
7796 Py_ssize_t start,
7797 Py_ssize_t end)
7798{
7799 int kind1, kind2, kind;
7800 void *buf1, *buf2;
7801 Py_ssize_t len1, len2, result;
7802
7803 kind1 = PyUnicode_KIND(s1);
7804 kind2 = PyUnicode_KIND(s2);
7805 kind = kind1 > kind2 ? kind1 : kind2;
7806 buf1 = PyUnicode_DATA(s1);
7807 buf2 = PyUnicode_DATA(s2);
7808 if (kind1 != kind)
7809 buf1 = _PyUnicode_AsKind(s1, kind);
7810 if (!buf1)
7811 return -2;
7812 if (kind2 != kind)
7813 buf2 = _PyUnicode_AsKind(s2, kind);
7814 if (!buf2) {
7815 if (kind1 != kind) PyMem_Free(buf1);
7816 return -2;
7817 }
7818 len1 = PyUnicode_GET_LENGTH(s1);
7819 len2 = PyUnicode_GET_LENGTH(s2);
7820
7821 switch(kind) {
7822 case PyUnicode_1BYTE_KIND:
7823 result = ucs1(buf1, len1, buf2, len2, start, end);
7824 break;
7825 case PyUnicode_2BYTE_KIND:
7826 result = ucs2(buf1, len1, buf2, len2, start, end);
7827 break;
7828 case PyUnicode_4BYTE_KIND:
7829 result = ucs4(buf1, len1, buf2, len2, start, end);
7830 break;
7831 default:
7832 assert(0); result = -2;
7833 }
7834
7835 if (kind1 != kind)
7836 PyMem_Free(buf1);
7837 if (kind2 != kind)
7838 PyMem_Free(buf2);
7839
7840 return result;
7841}
7842
7843Py_ssize_t
7844_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7845 Py_ssize_t n_buffer,
7846 void *digits, Py_ssize_t n_digits,
7847 Py_ssize_t min_width,
7848 const char *grouping,
7849 const char *thousands_sep)
7850{
7851 switch(kind) {
7852 case PyUnicode_1BYTE_KIND:
7853 return _PyUnicode_ucs1_InsertThousandsGrouping(
7854 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7855 min_width, grouping, thousands_sep);
7856 case PyUnicode_2BYTE_KIND:
7857 return _PyUnicode_ucs2_InsertThousandsGrouping(
7858 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7859 min_width, grouping, thousands_sep);
7860 case PyUnicode_4BYTE_KIND:
7861 return _PyUnicode_ucs4_InsertThousandsGrouping(
7862 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7863 min_width, grouping, thousands_sep);
7864 }
7865 assert(0);
7866 return -1;
7867}
7868
7869
Eric Smith8c663262007-08-25 02:26:07 +00007870#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007871#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007872
Thomas Wouters477c8d52006-05-27 19:21:47 +00007873#include "stringlib/count.h"
7874#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007875
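/* Helper macro to fix up start/end slice values: clamps end to len, and
   resolves a negative start or end relative to len (flooring at 0).
   Note that it expands to bare statements and mentions its arguments
   several times, so use it only as a complete statement and pass
   side-effect-free arguments. */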
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007877#define ADJUST_INDICES(start, end, len) \
7878 if (end > len) \
7879 end = len; \
7880 else if (end < 0) { \
7881 end += len; \
7882 if (end < 0) \
7883 end = 0; \
7884 } \
7885 if (start < 0) { \
7886 start += len; \
7887 if (start < 0) \
7888 start = 0; \
7889 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007890
Alexander Belopolsky40018472011-02-26 01:02:56 +00007891Py_ssize_t
7892PyUnicode_Count(PyObject *str,
7893 PyObject *substr,
7894 Py_ssize_t start,
7895 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007897 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007898 PyUnicodeObject* str_obj;
7899 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 int kind1, kind2, kind;
7901 void *buf1 = NULL, *buf2 = NULL;
7902 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007903
Thomas Wouters477c8d52006-05-27 19:21:47 +00007904 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007907 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 Py_DECREF(str_obj);
7910 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 }
Tim Petersced69f82003-09-16 20:30:58 +00007912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 kind1 = PyUnicode_KIND(str_obj);
7914 kind2 = PyUnicode_KIND(sub_obj);
7915 kind = kind1 > kind2 ? kind1 : kind2;
7916 buf1 = PyUnicode_DATA(str_obj);
7917 if (kind1 != kind)
7918 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7919 if (!buf1)
7920 goto onError;
7921 buf2 = PyUnicode_DATA(sub_obj);
7922 if (kind2 != kind)
7923 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7924 if (!buf2)
7925 goto onError;
7926 len1 = PyUnicode_GET_LENGTH(str_obj);
7927 len2 = PyUnicode_GET_LENGTH(sub_obj);
7928
7929 ADJUST_INDICES(start, end, len1);
7930 switch(kind) {
7931 case PyUnicode_1BYTE_KIND:
7932 result = ucs1lib_count(
7933 ((Py_UCS1*)buf1) + start, end - start,
7934 buf2, len2, PY_SSIZE_T_MAX
7935 );
7936 break;
7937 case PyUnicode_2BYTE_KIND:
7938 result = ucs2lib_count(
7939 ((Py_UCS2*)buf1) + start, end - start,
7940 buf2, len2, PY_SSIZE_T_MAX
7941 );
7942 break;
7943 case PyUnicode_4BYTE_KIND:
7944 result = ucs4lib_count(
7945 ((Py_UCS4*)buf1) + start, end - start,
7946 buf2, len2, PY_SSIZE_T_MAX
7947 );
7948 break;
7949 default:
7950 assert(0); result = 0;
7951 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952
7953 Py_DECREF(sub_obj);
7954 Py_DECREF(str_obj);
7955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 if (kind1 != kind)
7957 PyMem_Free(buf1);
7958 if (kind2 != kind)
7959 PyMem_Free(buf2);
7960
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 onError:
7963 Py_DECREF(sub_obj);
7964 Py_DECREF(str_obj);
7965 if (kind1 != kind && buf1)
7966 PyMem_Free(buf1);
7967 if (kind2 != kind && buf2)
7968 PyMem_Free(buf2);
7969 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970}
7971
Alexander Belopolsky40018472011-02-26 01:02:56 +00007972Py_ssize_t
7973PyUnicode_Find(PyObject *str,
7974 PyObject *sub,
7975 Py_ssize_t start,
7976 Py_ssize_t end,
7977 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007979 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007984 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007985 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 Py_DECREF(str);
7987 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 }
Tim Petersced69f82003-09-16 20:30:58 +00007989
Thomas Wouters477c8d52006-05-27 19:21:47 +00007990 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007991 result = any_find_slice(
7992 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7993 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007994 );
7995 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007996 result = any_find_slice(
7997 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
7998 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00007999 );
8000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008002 Py_DECREF(sub);
8003
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 return result;
8005}
8006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008007Py_ssize_t
8008PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8009 Py_ssize_t start, Py_ssize_t end,
8010 int direction)
8011{
8012 char *result;
8013 int kind;
8014 if (PyUnicode_READY(str) == -1)
8015 return -2;
8016 if (end > PyUnicode_GET_LENGTH(str))
8017 end = PyUnicode_GET_LENGTH(str);
8018 kind = PyUnicode_KIND(str);
8019 result = findchar(PyUnicode_1BYTE_DATA(str)
8020 + PyUnicode_KIND_SIZE(kind, start),
8021 kind,
8022 end-start, ch, direction);
8023 if (!result)
8024 return -1;
8025 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8026}
8027
Alexander Belopolsky40018472011-02-26 01:02:56 +00008028static int
8029tailmatch(PyUnicodeObject *self,
8030 PyUnicodeObject *substring,
8031 Py_ssize_t start,
8032 Py_ssize_t end,
8033 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 int kind_self;
8036 int kind_sub;
8037 void *data_self;
8038 void *data_sub;
8039 Py_ssize_t offset;
8040 Py_ssize_t i;
8041 Py_ssize_t end_sub;
8042
8043 if (PyUnicode_READY(self) == -1 ||
8044 PyUnicode_READY(substring) == -1)
8045 return 0;
8046
8047 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 return 1;
8049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008050 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8051 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 kind_self = PyUnicode_KIND(self);
8056 data_self = PyUnicode_DATA(self);
8057 kind_sub = PyUnicode_KIND(substring);
8058 data_sub = PyUnicode_DATA(substring);
8059 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8060
8061 if (direction > 0)
8062 offset = end;
8063 else
8064 offset = start;
8065
8066 if (PyUnicode_READ(kind_self, data_self, offset) ==
8067 PyUnicode_READ(kind_sub, data_sub, 0) &&
8068 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8069 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8070 /* If both are of the same kind, memcmp is sufficient */
8071 if (kind_self == kind_sub) {
8072 return ! memcmp((char *)data_self +
8073 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8074 data_sub,
8075 PyUnicode_GET_LENGTH(substring) *
8076 PyUnicode_CHARACTER_SIZE(substring));
8077 }
8078 /* otherwise we have to compare each character by first accesing it */
8079 else {
8080 /* We do not need to compare 0 and len(substring)-1 because
8081 the if statement above ensured already that they are equal
8082 when we end up here. */
8083 // TODO: honor direction and do a forward or backwards search
8084 for (i = 1; i < end_sub; ++i) {
8085 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8086 PyUnicode_READ(kind_sub, data_sub, i))
8087 return 0;
8088 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 }
8092
8093 return 0;
8094}
8095
Alexander Belopolsky40018472011-02-26 01:02:56 +00008096Py_ssize_t
8097PyUnicode_Tailmatch(PyObject *str,
8098 PyObject *substr,
8099 Py_ssize_t start,
8100 Py_ssize_t end,
8101 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 str = PyUnicode_FromObject(str);
8106 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 substr = PyUnicode_FromObject(substr);
8109 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 Py_DECREF(str);
8111 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 }
Tim Petersced69f82003-09-16 20:30:58 +00008113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 (PyUnicodeObject *)substr,
8116 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 Py_DECREF(str);
8118 Py_DECREF(substr);
8119 return result;
8120}
8121
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122/* Apply fixfct filter to the Unicode object self and return a
8123 reference to the modified object */
8124
Alexander Belopolsky40018472011-02-26 01:02:56 +00008125static PyObject *
8126fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 PyObject *u;
8130 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 if (PyUnicode_READY(self) == -1)
8133 return NULL;
8134 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8135 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8136 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8141 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 /* fix functions return the new maximum character in a string,
8144 if the kind of the resulting unicode object does not change,
8145 everything is fine. Otherwise we need to change the string kind
8146 and re-run the fix function. */
8147 maxchar_new = fixfct((PyUnicodeObject*)u);
8148 if (maxchar_new == 0)
8149 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8150 else if (maxchar_new <= 127)
8151 maxchar_new = 127;
8152 else if (maxchar_new <= 255)
8153 maxchar_new = 255;
8154 else if (maxchar_new <= 65535)
8155 maxchar_new = 65535;
8156 else
8157 maxchar_new = 1114111; /* 0x10ffff */
8158
8159 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 /* fixfct should return TRUE if it modified the buffer. If
8161 FALSE, return a reference to the original buffer instead
8162 (to save space, not time) */
8163 Py_INCREF(self);
8164 Py_DECREF(u);
8165 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 else if (maxchar_new == maxchar_old) {
8168 return u;
8169 }
8170 else {
8171 /* In case the maximum character changed, we need to
8172 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008173 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 if (v == NULL) {
8175 Py_DECREF(u);
8176 return NULL;
8177 }
8178 if (maxchar_new > maxchar_old) {
8179 /* If the maxchar increased so that the kind changed, not all
8180 characters are representable anymore and we need to fix the
8181 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008182 if (PyUnicode_CopyCharacters(v, 0,
8183 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008184 PyUnicode_GET_LENGTH(self)) < 0)
8185 {
8186 Py_DECREF(u);
8187 return NULL;
8188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 maxchar_old = fixfct((PyUnicodeObject*)v);
8190 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8191 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008192 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008193 if (PyUnicode_CopyCharacters(v, 0,
8194 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008195 PyUnicode_GET_LENGTH(self)) < 0)
8196 {
8197 Py_DECREF(u);
8198 return NULL;
8199 }
8200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201
8202 Py_DECREF(u);
8203 return v;
8204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205}
8206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008210 /* No need to call PyUnicode_READY(self) because this function is only
8211 called as a callback from fixup() which does it already. */
8212 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8213 const int kind = PyUnicode_KIND(self);
8214 void *data = PyUnicode_DATA(self);
8215 int touched = 0;
8216 Py_UCS4 maxchar = 0;
8217 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 for (i = 0; i < len; ++i) {
8220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8221 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8222 if (up != ch) {
8223 if (up > maxchar)
8224 maxchar = up;
8225 PyUnicode_WRITE(kind, data, i, up);
8226 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 else if (ch > maxchar)
8229 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230 }
8231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 if (touched)
8233 return maxchar;
8234 else
8235 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236}
8237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008239fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8242 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8243 const int kind = PyUnicode_KIND(self);
8244 void *data = PyUnicode_DATA(self);
8245 int touched = 0;
8246 Py_UCS4 maxchar = 0;
8247 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 for(i = 0; i < len; ++i) {
8250 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8251 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8252 if (lo != ch) {
8253 if (lo > maxchar)
8254 maxchar = lo;
8255 PyUnicode_WRITE(kind, data, i, lo);
8256 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 else if (ch > maxchar)
8259 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
8261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 if (touched)
8263 return maxchar;
8264 else
8265 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266}
8267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008269fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8272 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8273 const int kind = PyUnicode_KIND(self);
8274 void *data = PyUnicode_DATA(self);
8275 int touched = 0;
8276 Py_UCS4 maxchar = 0;
8277 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008279 for(i = 0; i < len; ++i) {
8280 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8281 Py_UCS4 nu = 0;
8282
8283 if (Py_UNICODE_ISUPPER(ch))
8284 nu = Py_UNICODE_TOLOWER(ch);
8285 else if (Py_UNICODE_ISLOWER(ch))
8286 nu = Py_UNICODE_TOUPPER(ch);
8287
8288 if (nu != 0) {
8289 if (nu > maxchar)
8290 maxchar = nu;
8291 PyUnicode_WRITE(kind, data, i, nu);
8292 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 else if (ch > maxchar)
8295 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 }
8297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 if (touched)
8299 return maxchar;
8300 else
8301 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302}
8303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008305fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8308 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8309 const int kind = PyUnicode_KIND(self);
8310 void *data = PyUnicode_DATA(self);
8311 int touched = 0;
8312 Py_UCS4 maxchar = 0;
8313 Py_ssize_t i = 0;
8314 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008315
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008316 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318
8319 ch = PyUnicode_READ(kind, data, i);
8320 if (!Py_UNICODE_ISUPPER(ch)) {
8321 maxchar = Py_UNICODE_TOUPPER(ch);
8322 PyUnicode_WRITE(kind, data, i, maxchar);
8323 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 ++i;
8326 for(; i < len; ++i) {
8327 ch = PyUnicode_READ(kind, data, i);
8328 if (!Py_UNICODE_ISLOWER(ch)) {
8329 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8330 if (lo > maxchar)
8331 maxchar = lo;
8332 PyUnicode_WRITE(kind, data, i, lo);
8333 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 else if (ch > maxchar)
8336 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338
8339 if (touched)
8340 return maxchar;
8341 else
8342 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343}
8344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8349 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8350 const int kind = PyUnicode_KIND(self);
8351 void *data = PyUnicode_DATA(self);
8352 Py_UCS4 maxchar = 0;
8353 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 int previous_is_cased;
8355
8356 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 if (len == 1) {
8358 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8359 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8360 if (ti != ch) {
8361 PyUnicode_WRITE(kind, data, i, ti);
8362 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 }
8364 else
8365 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 for(; i < len; ++i) {
8369 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8370 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008371
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 nu = Py_UNICODE_TOTITLE(ch);
8376
8377 if (nu > maxchar)
8378 maxchar = nu;
8379 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008380
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (Py_UNICODE_ISLOWER(ch) ||
8382 Py_UNICODE_ISUPPER(ch) ||
8383 Py_UNICODE_ISTITLE(ch))
8384 previous_is_cased = 1;
8385 else
8386 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389}
8390
Tim Peters8ce9f162004-08-27 01:49:32 +00008391PyObject *
8392PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008395 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008397 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008398 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8399 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008400 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 Py_ssize_t sz, i, res_offset;
8402 Py_UCS4 maxchar = 0;
8403 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404
Tim Peters05eba1f2004-08-27 21:32:02 +00008405 fseq = PySequence_Fast(seq, "");
8406 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008408 }
8409
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008410 /* NOTE: the following code can't call back into Python code,
8411 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008412 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008413
Tim Peters05eba1f2004-08-27 21:32:02 +00008414 seqlen = PySequence_Fast_GET_SIZE(fseq);
8415 /* If empty sequence, return u"". */
8416 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008419 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008420 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008421 /* If singleton sequence with an exact Unicode, return that. */
8422 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 item = items[0];
8424 if (PyUnicode_CheckExact(item)) {
8425 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 goto Done;
8428 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008429 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008430 else {
8431 /* Set up sep and seplen */
8432 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 /* fall back to a blank space separator */
8434 sep = PyUnicode_FromOrdinal(' ');
8435 if (!sep || PyUnicode_READY(sep) == -1)
8436 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008437 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008438 else {
8439 if (!PyUnicode_Check(separator)) {
8440 PyErr_Format(PyExc_TypeError,
8441 "separator: expected str instance,"
8442 " %.80s found",
8443 Py_TYPE(separator)->tp_name);
8444 goto onError;
8445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 if (PyUnicode_READY(separator) == -1)
8447 goto onError;
8448 sep = separator;
8449 seplen = PyUnicode_GET_LENGTH(separator);
8450 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8451 /* inc refcount to keep this code path symetric with the
8452 above case of a blank separator */
8453 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008454 }
8455 }
8456
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008457 /* There are at least two things to join, or else we have a subclass
8458 * of str in the sequence.
8459 * Do a pre-pass to figure out the total amount of space we'll
8460 * need (sz), and see whether all argument are strings.
8461 */
8462 sz = 0;
8463 for (i = 0; i < seqlen; i++) {
8464 const Py_ssize_t old_sz = sz;
8465 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 if (!PyUnicode_Check(item)) {
8467 PyErr_Format(PyExc_TypeError,
8468 "sequence item %zd: expected str instance,"
8469 " %.80s found",
8470 i, Py_TYPE(item)->tp_name);
8471 goto onError;
8472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 if (PyUnicode_READY(item) == -1)
8474 goto onError;
8475 sz += PyUnicode_GET_LENGTH(item);
8476 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8477 if (item_maxchar > maxchar)
8478 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008479 if (i != 0)
8480 sz += seplen;
8481 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8482 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008484 goto onError;
8485 }
8486 }
Tim Petersced69f82003-09-16 20:30:58 +00008487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008489 if (res == NULL)
8490 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008491
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008492 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008494 Py_ssize_t itemlen;
8495 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 /* Copy item, and maybe the separator. */
8498 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008499 if (PyUnicode_CopyCharacters(res, res_offset,
8500 sep, 0, seplen) < 0)
8501 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008504 if (PyUnicode_CopyCharacters(res, res_offset,
8505 item, 0, itemlen) < 0)
8506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008510
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008512 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 Py_XDECREF(sep);
8514 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008517 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008519 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 return NULL;
8521}
8522
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive code units of buffer `data`, beginning at index `start`.
   `kind` selects the code unit width: 1-byte buffers use memset(), 2-byte
   buffers are written as Py_UCS2, and every other kind is written as
   Py_UCS4.  `kind` must not be PyUnicode_WCHAR_KIND.
   Every argument is parenthesized at each use so that expression arguments
   expand correctly; note that arguments may still be evaluated more than
   once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8545
Alexander Belopolsky40018472011-02-26 01:02:56 +00008546static PyUnicodeObject *
8547pad(PyUnicodeObject *self,
8548 Py_ssize_t left,
8549 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 PyObject *u;
8553 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008554 int kind;
8555 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
8557 if (left < 0)
8558 left = 0;
8559 if (right < 0)
8560 right = 0;
8561
Tim Peters7a29bd52001-09-12 03:03:31 +00008562 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 Py_INCREF(self);
8564 return self;
8565 }
8566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8568 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008569 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8570 return NULL;
8571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8573 if (fill > maxchar)
8574 maxchar = fill;
8575 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008576 if (!u)
8577 return NULL;
8578
8579 kind = PyUnicode_KIND(u);
8580 data = PyUnicode_DATA(u);
8581 if (left)
8582 FILL(kind, data, fill, 0, left);
8583 if (right)
8584 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008585 if (PyUnicode_CopyCharacters(u, left,
8586 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008587 _PyUnicode_LENGTH(self)) < 0)
8588 {
8589 Py_DECREF(u);
8590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 }
8592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597PyObject *
8598PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
8602 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 switch(PyUnicode_KIND(string)) {
8607 case PyUnicode_1BYTE_KIND:
8608 list = ucs1lib_splitlines(
8609 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8610 PyUnicode_GET_LENGTH(string), keepends);
8611 break;
8612 case PyUnicode_2BYTE_KIND:
8613 list = ucs2lib_splitlines(
8614 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8615 PyUnicode_GET_LENGTH(string), keepends);
8616 break;
8617 case PyUnicode_4BYTE_KIND:
8618 list = ucs4lib_splitlines(
8619 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8620 PyUnicode_GET_LENGTH(string), keepends);
8621 break;
8622 default:
8623 assert(0);
8624 list = 0;
8625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 Py_DECREF(string);
8627 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630static PyObject *
8631split(PyUnicodeObject *self,
8632 PyUnicodeObject *substring,
8633 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 int kind1, kind2, kind;
8636 void *buf1, *buf2;
8637 Py_ssize_t len1, len2;
8638 PyObject* out;
8639
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008641 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 if (PyUnicode_READY(self) == -1)
8644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 if (substring == NULL)
8647 switch(PyUnicode_KIND(self)) {
8648 case PyUnicode_1BYTE_KIND:
8649 return ucs1lib_split_whitespace(
8650 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8651 PyUnicode_GET_LENGTH(self), maxcount
8652 );
8653 case PyUnicode_2BYTE_KIND:
8654 return ucs2lib_split_whitespace(
8655 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8656 PyUnicode_GET_LENGTH(self), maxcount
8657 );
8658 case PyUnicode_4BYTE_KIND:
8659 return ucs4lib_split_whitespace(
8660 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8661 PyUnicode_GET_LENGTH(self), maxcount
8662 );
8663 default:
8664 assert(0);
8665 return NULL;
8666 }
8667
8668 if (PyUnicode_READY(substring) == -1)
8669 return NULL;
8670
8671 kind1 = PyUnicode_KIND(self);
8672 kind2 = PyUnicode_KIND(substring);
8673 kind = kind1 > kind2 ? kind1 : kind2;
8674 buf1 = PyUnicode_DATA(self);
8675 buf2 = PyUnicode_DATA(substring);
8676 if (kind1 != kind)
8677 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8678 if (!buf1)
8679 return NULL;
8680 if (kind2 != kind)
8681 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8682 if (!buf2) {
8683 if (kind1 != kind) PyMem_Free(buf1);
8684 return NULL;
8685 }
8686 len1 = PyUnicode_GET_LENGTH(self);
8687 len2 = PyUnicode_GET_LENGTH(substring);
8688
8689 switch(kind) {
8690 case PyUnicode_1BYTE_KIND:
8691 out = ucs1lib_split(
8692 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8693 break;
8694 case PyUnicode_2BYTE_KIND:
8695 out = ucs2lib_split(
8696 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8697 break;
8698 case PyUnicode_4BYTE_KIND:
8699 out = ucs4lib_split(
8700 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8701 break;
8702 default:
8703 out = NULL;
8704 }
8705 if (kind1 != kind)
8706 PyMem_Free(buf1);
8707 if (kind2 != kind)
8708 PyMem_Free(buf2);
8709 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710}
8711
Alexander Belopolsky40018472011-02-26 01:02:56 +00008712static PyObject *
8713rsplit(PyUnicodeObject *self,
8714 PyUnicodeObject *substring,
8715 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 int kind1, kind2, kind;
8718 void *buf1, *buf2;
8719 Py_ssize_t len1, len2;
8720 PyObject* out;
8721
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008722 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008723 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 if (PyUnicode_READY(self) == -1)
8726 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 if (substring == NULL)
8729 switch(PyUnicode_KIND(self)) {
8730 case PyUnicode_1BYTE_KIND:
8731 return ucs1lib_rsplit_whitespace(
8732 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8733 PyUnicode_GET_LENGTH(self), maxcount
8734 );
8735 case PyUnicode_2BYTE_KIND:
8736 return ucs2lib_rsplit_whitespace(
8737 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8738 PyUnicode_GET_LENGTH(self), maxcount
8739 );
8740 case PyUnicode_4BYTE_KIND:
8741 return ucs4lib_rsplit_whitespace(
8742 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8743 PyUnicode_GET_LENGTH(self), maxcount
8744 );
8745 default:
8746 assert(0);
8747 return NULL;
8748 }
8749
8750 if (PyUnicode_READY(substring) == -1)
8751 return NULL;
8752
8753 kind1 = PyUnicode_KIND(self);
8754 kind2 = PyUnicode_KIND(substring);
8755 kind = kind1 > kind2 ? kind1 : kind2;
8756 buf1 = PyUnicode_DATA(self);
8757 buf2 = PyUnicode_DATA(substring);
8758 if (kind1 != kind)
8759 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8760 if (!buf1)
8761 return NULL;
8762 if (kind2 != kind)
8763 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8764 if (!buf2) {
8765 if (kind1 != kind) PyMem_Free(buf1);
8766 return NULL;
8767 }
8768 len1 = PyUnicode_GET_LENGTH(self);
8769 len2 = PyUnicode_GET_LENGTH(substring);
8770
8771 switch(kind) {
8772 case PyUnicode_1BYTE_KIND:
8773 out = ucs1lib_rsplit(
8774 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8775 break;
8776 case PyUnicode_2BYTE_KIND:
8777 out = ucs2lib_rsplit(
8778 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8779 break;
8780 case PyUnicode_4BYTE_KIND:
8781 out = ucs4lib_rsplit(
8782 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8783 break;
8784 default:
8785 out = NULL;
8786 }
8787 if (kind1 != kind)
8788 PyMem_Free(buf1);
8789 if (kind2 != kind)
8790 PyMem_Free(buf2);
8791 return out;
8792}
8793
8794static Py_ssize_t
8795anylib_find(int kind, void *buf1, Py_ssize_t len1,
8796 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8797{
8798 switch(kind) {
8799 case PyUnicode_1BYTE_KIND:
8800 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8801 case PyUnicode_2BYTE_KIND:
8802 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8803 case PyUnicode_4BYTE_KIND:
8804 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8805 }
8806 assert(0);
8807 return -1;
8808}
8809
8810static Py_ssize_t
8811anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8812 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8813{
8814 switch(kind) {
8815 case PyUnicode_1BYTE_KIND:
8816 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8817 case PyUnicode_2BYTE_KIND:
8818 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8819 case PyUnicode_4BYTE_KIND:
8820 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8821 }
8822 assert(0);
8823 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008824}
8825
Alexander Belopolsky40018472011-02-26 01:02:56 +00008826static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827replace(PyObject *self, PyObject *str1,
8828 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 PyObject *u;
8831 char *sbuf = PyUnicode_DATA(self);
8832 char *buf1 = PyUnicode_DATA(str1);
8833 char *buf2 = PyUnicode_DATA(str2);
8834 int srelease = 0, release1 = 0, release2 = 0;
8835 int skind = PyUnicode_KIND(self);
8836 int kind1 = PyUnicode_KIND(str1);
8837 int kind2 = PyUnicode_KIND(str2);
8838 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8839 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8840 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841
8842 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008845 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 if (skind < kind1)
8848 /* substring too wide to be present */
8849 goto nothing;
8850
8851 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008852 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008853 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008855 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008857 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 Py_UCS4 u1, u2, maxchar;
8859 int mayshrink, rkind;
8860 u1 = PyUnicode_READ_CHAR(str1, 0);
8861 if (!findchar(sbuf, PyUnicode_KIND(self),
8862 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008863 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 u2 = PyUnicode_READ_CHAR(str2, 0);
8865 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8866 /* Replacing u1 with u2 may cause a maxchar reduction in the
8867 result string. */
8868 mayshrink = maxchar > 127;
8869 if (u2 > maxchar) {
8870 maxchar = u2;
8871 mayshrink = 0;
8872 }
8873 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008874 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008876 if (PyUnicode_CopyCharacters(u, 0,
8877 (PyObject*)self, 0, slen) < 0)
8878 {
8879 Py_DECREF(u);
8880 return NULL;
8881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 rkind = PyUnicode_KIND(u);
8883 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8884 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008885 if (--maxcount < 0)
8886 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 if (mayshrink) {
8890 PyObject *tmp = u;
8891 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8892 PyUnicode_GET_LENGTH(tmp));
8893 Py_DECREF(tmp);
8894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 int rkind = skind;
8897 char *res;
8898 if (kind1 < rkind) {
8899 /* widen substring */
8900 buf1 = _PyUnicode_AsKind(str1, rkind);
8901 if (!buf1) goto error;
8902 release1 = 1;
8903 }
8904 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008905 if (i < 0)
8906 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 if (rkind > kind2) {
8908 /* widen replacement */
8909 buf2 = _PyUnicode_AsKind(str2, rkind);
8910 if (!buf2) goto error;
8911 release2 = 1;
8912 }
8913 else if (rkind < kind2) {
8914 /* widen self and buf1 */
8915 rkind = kind2;
8916 if (release1) PyMem_Free(buf1);
8917 sbuf = _PyUnicode_AsKind(self, rkind);
8918 if (!sbuf) goto error;
8919 srelease = 1;
8920 buf1 = _PyUnicode_AsKind(str1, rkind);
8921 if (!buf1) goto error;
8922 release1 = 1;
8923 }
8924 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8925 if (!res) {
8926 PyErr_NoMemory();
8927 goto error;
8928 }
8929 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008930 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8932 buf2,
8933 PyUnicode_KIND_SIZE(rkind, len2));
8934 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008935
8936 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8938 slen-i,
8939 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008940 if (i == -1)
8941 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8943 buf2,
8944 PyUnicode_KIND_SIZE(rkind, len2));
8945 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
8948 u = PyUnicode_FromKindAndData(rkind, res, slen);
8949 PyMem_Free(res);
8950 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 Py_ssize_t n, i, j, ires;
8955 Py_ssize_t product, new_size;
8956 int rkind = skind;
8957 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 if (kind1 < rkind) {
8960 buf1 = _PyUnicode_AsKind(str1, rkind);
8961 if (!buf1) goto error;
8962 release1 = 1;
8963 }
8964 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008965 if (n == 0)
8966 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 if (kind2 < rkind) {
8968 buf2 = _PyUnicode_AsKind(str2, rkind);
8969 if (!buf2) goto error;
8970 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 else if (kind2 > rkind) {
8973 rkind = kind2;
8974 sbuf = _PyUnicode_AsKind(self, rkind);
8975 if (!sbuf) goto error;
8976 srelease = 1;
8977 if (release1) PyMem_Free(buf1);
8978 buf1 = _PyUnicode_AsKind(str1, rkind);
8979 if (!buf1) goto error;
8980 release1 = 1;
8981 }
8982 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8983 PyUnicode_GET_LENGTH(str1))); */
8984 product = n * (len2-len1);
8985 if ((product / (len2-len1)) != n) {
8986 PyErr_SetString(PyExc_OverflowError,
8987 "replace string is too long");
8988 goto error;
8989 }
8990 new_size = slen + product;
8991 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8992 PyErr_SetString(PyExc_OverflowError,
8993 "replace string is too long");
8994 goto error;
8995 }
8996 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
8997 if (!res)
8998 goto error;
8999 ires = i = 0;
9000 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001 while (n-- > 0) {
9002 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 j = anylib_find(rkind,
9004 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9005 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009006 if (j == -1)
9007 break;
9008 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9011 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9012 PyUnicode_KIND_SIZE(rkind, j-i));
9013 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009014 }
9015 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 if (len2 > 0) {
9017 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9018 buf2,
9019 PyUnicode_KIND_SIZE(rkind, len2));
9020 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009025 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9027 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9028 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029 } else {
9030 /* interleave */
9031 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9033 buf2,
9034 PyUnicode_KIND_SIZE(rkind, len2));
9035 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009036 if (--n <= 0)
9037 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9039 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9040 PyUnicode_KIND_SIZE(rkind, 1));
9041 ires++;
9042 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9045 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9046 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (srelease)
9051 PyMem_FREE(sbuf);
9052 if (release1)
9053 PyMem_FREE(buf1);
9054 if (release2)
9055 PyMem_FREE(buf2);
9056 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009057
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009059 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (srelease)
9061 PyMem_FREE(sbuf);
9062 if (release1)
9063 PyMem_FREE(buf1);
9064 if (release2)
9065 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009066 if (PyUnicode_CheckExact(self)) {
9067 Py_INCREF(self);
9068 return (PyObject *) self;
9069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 return PyUnicode_FromKindAndData(PyUnicode_KIND(self),
9071 PyUnicode_DATA(self),
9072 PyUnicode_GET_LENGTH(self));
9073 error:
9074 if (srelease && sbuf)
9075 PyMem_FREE(sbuf);
9076 if (release1 && buf1)
9077 PyMem_FREE(buf1);
9078 if (release2 && buf2)
9079 PyMem_FREE(buf2);
9080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081}
9082
9083/* --- Unicode Object Methods --------------------------------------------- */
9084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009085PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087\n\
9088Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009089characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090
9091static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009092unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 return fixup(self, fixtitle);
9095}
9096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009097PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099\n\
9100Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009101have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102
9103static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009104unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106 return fixup(self, fixcapitalize);
9107}
9108
9109#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009110PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112\n\
9113Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009114normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115
9116static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009117unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118{
9119 PyObject *list;
9120 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009121 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 /* Split into words */
9124 list = split(self, NULL, -1);
9125 if (!list)
9126 return NULL;
9127
9128 /* Capitalize each word */
9129 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9130 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 if (item == NULL)
9133 goto onError;
9134 Py_DECREF(PyList_GET_ITEM(list, i));
9135 PyList_SET_ITEM(list, i, item);
9136 }
9137
9138 /* Join the words to form a new string */
9139 item = PyUnicode_Join(NULL, list);
9140
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 Py_DECREF(list);
9143 return (PyObject *)item;
9144}
9145#endif
9146
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009147/* Argument converter. Coerces to a single unicode character */
9148
9149static int
9150convert_uc(PyObject *obj, void *addr)
9151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009154
Benjamin Peterson14339b62009-01-31 16:36:08 +00009155 uniobj = PyUnicode_FromObject(obj);
9156 if (uniobj == NULL) {
9157 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009159 return 0;
9160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009162 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 Py_DECREF(uniobj);
9165 return 0;
9166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 if (PyUnicode_READY(uniobj)) {
9168 Py_DECREF(uniobj);
9169 return 0;
9170 }
9171 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009172 Py_DECREF(uniobj);
9173 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009174}
9175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009176PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009179Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009180done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
9182static PyObject *
9183unicode_center(PyUnicodeObject *self, PyObject *args)
9184{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009185 Py_ssize_t marg, left;
9186 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 Py_UCS4 fillchar = ' ';
9188
9189 if (PyUnicode_READY(self) == -1)
9190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191
Thomas Woutersde017742006-02-16 19:34:37 +00009192 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 return NULL;
9194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 Py_INCREF(self);
9197 return (PyObject*) self;
9198 }
9199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 left = marg / 2 + (marg & width & 1);
9202
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009203 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204}
9205
Marc-André Lemburge5034372000-08-08 08:04:29 +00009206#if 0
9207
9208/* This code should go into some future Unicode collation support
9209 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009210 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009211
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009212/* speedy UTF-16 code point order comparison */
9213/* gleaned from: */
9214/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9215
/* Per-block adjustment table, indexed by (code unit >> 11).
   Entries 0..26 are zero, entry 27 adds 0x2000 and entries 28..31
   subtract 0x800. */
static short utf16Fixup[32] =
{
    /*  0.. 3 */ 0, 0, 0, 0,
    /*  4.. 7 */ 0, 0, 0, 0,
    /*  8..11 */ 0, 0, 0, 0,
    /* 12..15 */ 0, 0, 0, 0,
    /* 16..19 */ 0, 0, 0, 0,
    /* 20..23 */ 0, 0, 0, 0,
    /* 24..27 */ 0, 0, 0, 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
9223
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224static int
9225unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009227 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 Py_UNICODE *s1 = str1->str;
9230 Py_UNICODE *s2 = str2->str;
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 len1 = str1->_base._base.length;
9233 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009236 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009237
9238 c1 = *s1++;
9239 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009240
Benjamin Peterson29060642009-01-31 22:14:21 +00009241 if (c1 > (1<<11) * 26)
9242 c1 += utf16Fixup[c1>>11];
9243 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009244 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009245 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009246
9247 if (c1 != c2)
9248 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009249
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009250 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 }
9252
9253 return (len1 < len2) ? -1 : (len1 != len2);
9254}
9255
Marc-André Lemburge5034372000-08-08 08:04:29 +00009256#else
9257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258/* This function assumes that str1 and str2 are readied by the caller. */
9259
Marc-André Lemburge5034372000-08-08 08:04:29 +00009260static int
9261unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind1, kind2;
9264 void *data1, *data2;
9265 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 kind1 = PyUnicode_KIND(str1);
9268 kind2 = PyUnicode_KIND(str2);
9269 data1 = PyUnicode_DATA(str1);
9270 data2 = PyUnicode_DATA(str2);
9271 len1 = PyUnicode_GET_LENGTH(str1);
9272 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 for (i = 0; i < len1 && i < len2; ++i) {
9275 Py_UCS4 c1, c2;
9276 c1 = PyUnicode_READ(kind1, data1, i);
9277 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009278
9279 if (c1 != c2)
9280 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009281 }
9282
9283 return (len1 < len2) ? -1 : (len1 != len2);
9284}
9285
9286#endif
9287
Alexander Belopolsky40018472011-02-26 01:02:56 +00009288int
9289PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9292 if (PyUnicode_READY(left) == -1 ||
9293 PyUnicode_READY(right) == -1)
9294 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009295 return unicode_compare((PyUnicodeObject *)left,
9296 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009298 PyErr_Format(PyExc_TypeError,
9299 "Can't compare %.100s and %.100s",
9300 left->ob_type->tp_name,
9301 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 return -1;
9303}
9304
Martin v. Löwis5b222132007-06-10 09:51:05 +00009305int
9306PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 Py_ssize_t i;
9309 int kind;
9310 void *data;
9311 Py_UCS4 chr;
9312
Martin v. Löwis5b222132007-06-10 09:51:05 +00009313 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 if (PyUnicode_READY(uni) == -1)
9315 return -1;
9316 kind = PyUnicode_KIND(uni);
9317 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009318 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9320 if (chr != str[i])
9321 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009322 /* This check keeps Python strings that end in '\0' from comparing equal
9323 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009326 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009327 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009328 return 0;
9329}
9330
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009331
Benjamin Peterson29060642009-01-31 22:14:21 +00009332#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009334
Alexander Belopolsky40018472011-02-26 01:02:56 +00009335PyObject *
9336PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009337{
9338 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009339
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009340 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9341 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (PyUnicode_READY(left) == -1 ||
9343 PyUnicode_READY(right) == -1)
9344 return NULL;
9345 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9346 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009347 if (op == Py_EQ) {
9348 Py_INCREF(Py_False);
9349 return Py_False;
9350 }
9351 if (op == Py_NE) {
9352 Py_INCREF(Py_True);
9353 return Py_True;
9354 }
9355 }
9356 if (left == right)
9357 result = 0;
9358 else
9359 result = unicode_compare((PyUnicodeObject *)left,
9360 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009361
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009362 /* Convert the return value to a Boolean */
9363 switch (op) {
9364 case Py_EQ:
9365 v = TEST_COND(result == 0);
9366 break;
9367 case Py_NE:
9368 v = TEST_COND(result != 0);
9369 break;
9370 case Py_LE:
9371 v = TEST_COND(result <= 0);
9372 break;
9373 case Py_GE:
9374 v = TEST_COND(result >= 0);
9375 break;
9376 case Py_LT:
9377 v = TEST_COND(result == -1);
9378 break;
9379 case Py_GT:
9380 v = TEST_COND(result == 1);
9381 break;
9382 default:
9383 PyErr_BadArgument();
9384 return NULL;
9385 }
9386 Py_INCREF(v);
9387 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009389
Brian Curtindfc80e32011-08-10 20:28:54 -05009390 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009391}
9392
Alexander Belopolsky40018472011-02-26 01:02:56 +00009393int
9394PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009395{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009396 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 int kind1, kind2, kind;
9398 void *buf1, *buf2;
9399 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009400 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009401
9402 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009403 sub = PyUnicode_FromObject(element);
9404 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 PyErr_Format(PyExc_TypeError,
9406 "'in <string>' requires string as left operand, not %s",
9407 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009408 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 if (PyUnicode_READY(sub) == -1)
9411 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009412
Thomas Wouters477c8d52006-05-27 19:21:47 +00009413 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009415 Py_DECREF(sub);
9416 return -1;
9417 }
9418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 kind1 = PyUnicode_KIND(str);
9420 kind2 = PyUnicode_KIND(sub);
9421 kind = kind1 > kind2 ? kind1 : kind2;
9422 buf1 = PyUnicode_DATA(str);
9423 buf2 = PyUnicode_DATA(sub);
9424 if (kind1 != kind)
9425 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9426 if (!buf1) {
9427 Py_DECREF(sub);
9428 return -1;
9429 }
9430 if (kind2 != kind)
9431 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9432 if (!buf2) {
9433 Py_DECREF(sub);
9434 if (kind1 != kind) PyMem_Free(buf1);
9435 return -1;
9436 }
9437 len1 = PyUnicode_GET_LENGTH(str);
9438 len2 = PyUnicode_GET_LENGTH(sub);
9439
9440 switch(kind) {
9441 case PyUnicode_1BYTE_KIND:
9442 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9443 break;
9444 case PyUnicode_2BYTE_KIND:
9445 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9446 break;
9447 case PyUnicode_4BYTE_KIND:
9448 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9449 break;
9450 default:
9451 result = -1;
9452 assert(0);
9453 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009454
9455 Py_DECREF(str);
9456 Py_DECREF(sub);
9457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (kind1 != kind)
9459 PyMem_Free(buf1);
9460 if (kind2 != kind)
9461 PyMem_Free(buf2);
9462
Guido van Rossum403d68b2000-03-13 15:55:09 +00009463 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009464}
9465
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466/* Concat to string or Unicode object giving a new Unicode object. */
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468PyObject *
9469PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 PyObject *u = NULL, *v = NULL, *w;
9472 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473
9474 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009477 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481
9482 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 }
9491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9493 goto onError;
9494
9495 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009496 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 w = PyUnicode_New(
9500 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9501 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009504 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9505 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009506 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009507 v, 0,
9508 PyUnicode_GET_LENGTH(v)) < 0)
9509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 Py_DECREF(u);
9511 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 Py_XDECREF(u);
9516 Py_XDECREF(v);
9517 return NULL;
9518}
9519
Walter Dörwald1ab83302007-05-18 17:15:44 +00009520void
9521PyUnicode_Append(PyObject **pleft, PyObject *right)
9522{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009523 PyObject *new;
9524 if (*pleft == NULL)
9525 return;
9526 if (right == NULL || !PyUnicode_Check(*pleft)) {
9527 Py_DECREF(*pleft);
9528 *pleft = NULL;
9529 return;
9530 }
9531 new = PyUnicode_Concat(*pleft, right);
9532 Py_DECREF(*pleft);
9533 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009534}
9535
9536void
9537PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9538{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009539 PyUnicode_Append(pleft, right);
9540 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009541}
9542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009543PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009546Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009547string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009548interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549
9550static PyObject *
9551unicode_count(PyUnicodeObject *self, PyObject *args)
9552{
9553 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009554 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009555 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 int kind1, kind2, kind;
9558 void *buf1, *buf2;
9559 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560
Jesus Ceaac451502011-04-20 17:09:23 +02009561 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9562 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 kind1 = PyUnicode_KIND(self);
9566 kind2 = PyUnicode_KIND(substring);
9567 kind = kind1 > kind2 ? kind1 : kind2;
9568 buf1 = PyUnicode_DATA(self);
9569 buf2 = PyUnicode_DATA(substring);
9570 if (kind1 != kind)
9571 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9572 if (!buf1) {
9573 Py_DECREF(substring);
9574 return NULL;
9575 }
9576 if (kind2 != kind)
9577 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9578 if (!buf2) {
9579 Py_DECREF(substring);
9580 if (kind1 != kind) PyMem_Free(buf1);
9581 return NULL;
9582 }
9583 len1 = PyUnicode_GET_LENGTH(self);
9584 len2 = PyUnicode_GET_LENGTH(substring);
9585
9586 ADJUST_INDICES(start, end, len1);
9587 switch(kind) {
9588 case PyUnicode_1BYTE_KIND:
9589 iresult = ucs1lib_count(
9590 ((Py_UCS1*)buf1) + start, end - start,
9591 buf2, len2, PY_SSIZE_T_MAX
9592 );
9593 break;
9594 case PyUnicode_2BYTE_KIND:
9595 iresult = ucs2lib_count(
9596 ((Py_UCS2*)buf1) + start, end - start,
9597 buf2, len2, PY_SSIZE_T_MAX
9598 );
9599 break;
9600 case PyUnicode_4BYTE_KIND:
9601 iresult = ucs4lib_count(
9602 ((Py_UCS4*)buf1) + start, end - start,
9603 buf2, len2, PY_SSIZE_T_MAX
9604 );
9605 break;
9606 default:
9607 assert(0); iresult = 0;
9608 }
9609
9610 result = PyLong_FromSsize_t(iresult);
9611
9612 if (kind1 != kind)
9613 PyMem_Free(buf1);
9614 if (kind2 != kind)
9615 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616
9617 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009618
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619 return result;
9620}
9621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009622PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009623 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009625Encode S using the codec registered for encoding. Default encoding\n\
9626is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009627handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009628a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9629'xmlcharrefreplace' as well as any other name registered with\n\
9630codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631
9632static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009633unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009635 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 char *encoding = NULL;
9637 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009638
Benjamin Peterson308d6372009-09-18 21:42:35 +00009639 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9640 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009642 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009643}
9644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009645PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647\n\
9648Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009649If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
9651static PyObject*
9652unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9653{
9654 Py_UNICODE *e;
9655 Py_UNICODE *p;
9656 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009657 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659 PyUnicodeObject *u;
9660 int tabsize = 8;
9661
9662 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9666 return NULL;
9667
Thomas Wouters7e474022000-07-16 12:04:32 +00009668 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009669 i = 0; /* chars up to and including most recent \n or \r */
9670 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9672 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 if (tabsize > 0) {
9675 incr = tabsize - (j % tabsize); /* cannot overflow */
9676 if (j > PY_SSIZE_T_MAX - incr)
9677 goto overflow1;
9678 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009679 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 if (j > PY_SSIZE_T_MAX - 1)
9683 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 j++;
9685 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 if (i > PY_SSIZE_T_MAX - j)
9687 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009689 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690 }
9691 }
9692
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009693 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009695
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696 /* Second pass: create output string and fill it */
9697 u = _PyUnicode_New(i + j);
9698 if (!u)
9699 return NULL;
9700
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009701 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 q = _PyUnicode_WSTR(u); /* next output char */
9703 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 if (tabsize > 0) {
9708 i = tabsize - (j % tabsize);
9709 j += i;
9710 while (i--) {
9711 if (q >= qe)
9712 goto overflow2;
9713 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009714 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009717 else {
9718 if (q >= qe)
9719 goto overflow2;
9720 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009721 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 if (*p == '\n' || *p == '\r')
9723 j = 0;
9724 }
9725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 if (PyUnicode_READY(u) == -1) {
9727 Py_DECREF(u);
9728 return NULL;
9729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009731
9732 overflow2:
9733 Py_DECREF(u);
9734 overflow1:
9735 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737}
9738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009739PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009740 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741\n\
9742Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009743such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744arguments start and end are interpreted as in slice notation.\n\
9745\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009746Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747
9748static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750{
Jesus Ceaac451502011-04-20 17:09:23 +02009751 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009752 Py_ssize_t start;
9753 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009754 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755
Jesus Ceaac451502011-04-20 17:09:23 +02009756 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9757 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 if (PyUnicode_READY(self) == -1)
9761 return NULL;
9762 if (PyUnicode_READY(substring) == -1)
9763 return NULL;
9764
9765 result = any_find_slice(
9766 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9767 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009768 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
9770 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 if (result == -2)
9773 return NULL;
9774
Christian Heimes217cfd12007-12-02 14:31:20 +00009775 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776}
9777
9778static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009779unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 Py_UCS4 ch;
9782
9783 if (PyUnicode_READY(self) == -1)
9784 return NULL;
9785 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 PyErr_SetString(PyExc_IndexError, "string index out of range");
9787 return NULL;
9788 }
9789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9791 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792}
9793
Guido van Rossumc2504932007-09-18 19:42:40 +00009794/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009795 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009796static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009797unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798{
Guido van Rossumc2504932007-09-18 19:42:40 +00009799 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009800 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 if (_PyUnicode_HASH(self) != -1)
9803 return _PyUnicode_HASH(self);
9804 if (PyUnicode_READY(self) == -1)
9805 return -1;
9806 len = PyUnicode_GET_LENGTH(self);
9807
9808 /* The hash function as a macro, gets expanded three times below. */
9809#define HASH(P) \
9810 x = (Py_uhash_t)*P << 7; \
9811 while (--len >= 0) \
9812 x = (1000003*x) ^ (Py_uhash_t)*P++;
9813
9814 switch (PyUnicode_KIND(self)) {
9815 case PyUnicode_1BYTE_KIND: {
9816 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9817 HASH(c);
9818 break;
9819 }
9820 case PyUnicode_2BYTE_KIND: {
9821 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9822 HASH(s);
9823 break;
9824 }
9825 default: {
9826 Py_UCS4 *l;
9827 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9828 "Impossible switch case in unicode_hash");
9829 l = PyUnicode_4BYTE_DATA(self);
9830 HASH(l);
9831 break;
9832 }
9833 }
9834 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9835
Guido van Rossumc2504932007-09-18 19:42:40 +00009836 if (x == -1)
9837 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009839 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009843PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009844 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009846Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847
9848static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009851 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009852 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009853 Py_ssize_t start;
9854 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
Jesus Ceaac451502011-04-20 17:09:23 +02009856 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9857 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 if (PyUnicode_READY(self) == -1)
9861 return NULL;
9862 if (PyUnicode_READY(substring) == -1)
9863 return NULL;
9864
9865 result = any_find_slice(
9866 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9867 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009868 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869
9870 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 if (result == -2)
9873 return NULL;
9874
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 if (result < 0) {
9876 PyErr_SetString(PyExc_ValueError, "substring not found");
9877 return NULL;
9878 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009879
Christian Heimes217cfd12007-12-02 14:31:20 +00009880 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881}
9882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009883PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009884 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009886Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009887at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888
9889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009890unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 Py_ssize_t i, length;
9893 int kind;
9894 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 int cased;
9896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (PyUnicode_READY(self) == -1)
9898 return NULL;
9899 length = PyUnicode_GET_LENGTH(self);
9900 kind = PyUnicode_KIND(self);
9901 data = PyUnicode_DATA(self);
9902
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (length == 1)
9905 return PyBool_FromLong(
9906 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009908 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009911
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 for (i = 0; i < length; i++) {
9914 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009915
Benjamin Peterson29060642009-01-31 22:14:21 +00009916 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9917 return PyBool_FromLong(0);
9918 else if (!cased && Py_UNICODE_ISLOWER(ch))
9919 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009921 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922}
9923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009924PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009927Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009928at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929
9930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009931unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 Py_ssize_t i, length;
9934 int kind;
9935 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936 int cased;
9937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (PyUnicode_READY(self) == -1)
9939 return NULL;
9940 length = PyUnicode_GET_LENGTH(self);
9941 kind = PyUnicode_KIND(self);
9942 data = PyUnicode_DATA(self);
9943
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (length == 1)
9946 return PyBool_FromLong(
9947 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009949 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009952
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 for (i = 0; i < length; i++) {
9955 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009956
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9958 return PyBool_FromLong(0);
9959 else if (!cased && Py_UNICODE_ISUPPER(ch))
9960 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009962 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963}
9964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009965PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009968Return True if S is a titlecased string and there is at least one\n\
9969character in S, i.e. upper- and titlecase characters may only\n\
9970follow uncased characters and lowercase characters only cased ones.\n\
9971Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
9973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009974unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 Py_ssize_t i, length;
9977 int kind;
9978 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 int cased, previous_is_cased;
9980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 if (PyUnicode_READY(self) == -1)
9982 return NULL;
9983 length = PyUnicode_GET_LENGTH(self);
9984 kind = PyUnicode_KIND(self);
9985 data = PyUnicode_DATA(self);
9986
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (length == 1) {
9989 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9990 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9991 (Py_UNICODE_ISUPPER(ch) != 0));
9992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009994 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009997
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998 cased = 0;
9999 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 for (i = 0; i < length; i++) {
10001 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010002
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10004 if (previous_is_cased)
10005 return PyBool_FromLong(0);
10006 previous_is_cased = 1;
10007 cased = 1;
10008 }
10009 else if (Py_UNICODE_ISLOWER(ch)) {
10010 if (!previous_is_cased)
10011 return PyBool_FromLong(0);
10012 previous_is_cased = 1;
10013 cased = 1;
10014 }
10015 else
10016 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010018 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019}
10020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010021PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010024Return True if all characters in S are whitespace\n\
10025and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
10027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010028unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 Py_ssize_t i, length;
10031 int kind;
10032 void *data;
10033
10034 if (PyUnicode_READY(self) == -1)
10035 return NULL;
10036 length = PyUnicode_GET_LENGTH(self);
10037 kind = PyUnicode_KIND(self);
10038 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 if (length == 1)
10042 return PyBool_FromLong(
10043 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010045 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 for (i = 0; i < length; i++) {
10050 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010051 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010052 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010054 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055}
10056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010057PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010059\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010060Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010061and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010062
10063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010064unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 Py_ssize_t i, length;
10067 int kind;
10068 void *data;
10069
10070 if (PyUnicode_READY(self) == -1)
10071 return NULL;
10072 length = PyUnicode_GET_LENGTH(self);
10073 kind = PyUnicode_KIND(self);
10074 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010075
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010076 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (length == 1)
10078 return PyBool_FromLong(
10079 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080
10081 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 for (i = 0; i < length; i++) {
10086 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010089 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010090}
10091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010092PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010093 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010095Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010096and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010097
10098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010099unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 int kind;
10102 void *data;
10103 Py_ssize_t len, i;
10104
10105 if (PyUnicode_READY(self) == -1)
10106 return NULL;
10107
10108 kind = PyUnicode_KIND(self);
10109 data = PyUnicode_DATA(self);
10110 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010111
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010112 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (len == 1) {
10114 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10115 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10116 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010117
10118 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010120 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 for (i = 0; i < len; i++) {
10123 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010124 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010126 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010127 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010128}
10129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010130PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010133Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010134False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
10136static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010137unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 Py_ssize_t i, length;
10140 int kind;
10141 void *data;
10142
10143 if (PyUnicode_READY(self) == -1)
10144 return NULL;
10145 length = PyUnicode_GET_LENGTH(self);
10146 kind = PyUnicode_KIND(self);
10147 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (length == 1)
10151 return PyBool_FromLong(
10152 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010154 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 for (i = 0; i < length; i++) {
10159 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010162 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163}
10164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010165PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010168Return True if all characters in S are digits\n\
10169and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
10171static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010172unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 Py_ssize_t i, length;
10175 int kind;
10176 void *data;
10177
10178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 length = PyUnicode_GET_LENGTH(self);
10181 kind = PyUnicode_KIND(self);
10182 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (length == 1) {
10186 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10187 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010190 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 for (i = 0; i < length; i++) {
10195 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010198 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199}
10200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010201PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010204Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010205False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
10207static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010208unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 Py_ssize_t i, length;
10211 int kind;
10212 void *data;
10213
10214 if (PyUnicode_READY(self) == -1)
10215 return NULL;
10216 length = PyUnicode_GET_LENGTH(self);
10217 kind = PyUnicode_KIND(self);
10218 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (length == 1)
10222 return PyBool_FromLong(
10223 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010225 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 for (i = 0; i < length; i++) {
10230 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010233 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Martin v. Löwis47383402007-08-15 07:32:56 +000010236int
10237PyUnicode_IsIdentifier(PyObject *self)
10238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 int kind;
10240 void *data;
10241 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010242 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (PyUnicode_READY(self) == -1) {
10245 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 }
10248
10249 /* Special case for empty strings */
10250 if (PyUnicode_GET_LENGTH(self) == 0)
10251 return 0;
10252 kind = PyUnicode_KIND(self);
10253 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010254
10255 /* PEP 3131 says that the first character must be in
10256 XID_Start and subsequent characters in XID_Continue,
10257 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010258 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010259 letters, digits, underscore). However, given the current
10260 definition of XID_Start and XID_Continue, it is sufficient
10261 to check just for these, except that _ must be allowed
10262 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010264 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010265 return 0;
10266
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010267 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010270 return 1;
10271}
10272
10273PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010275\n\
10276Return True if S is a valid identifier according\n\
10277to the language definition.");
10278
10279static PyObject*
10280unicode_isidentifier(PyObject *self)
10281{
10282 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10283}
10284
Georg Brandl559e5d72008-06-11 18:37:52 +000010285PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010287\n\
10288Return True if all characters in S are considered\n\
10289printable in repr() or S is empty, False otherwise.");
10290
10291static PyObject*
10292unicode_isprintable(PyObject *self)
10293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 Py_ssize_t i, length;
10295 int kind;
10296 void *data;
10297
10298 if (PyUnicode_READY(self) == -1)
10299 return NULL;
10300 length = PyUnicode_GET_LENGTH(self);
10301 kind = PyUnicode_KIND(self);
10302 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010303
10304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (length == 1)
10306 return PyBool_FromLong(
10307 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 for (i = 0; i < length; i++) {
10310 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010311 Py_RETURN_FALSE;
10312 }
10313 }
10314 Py_RETURN_TRUE;
10315}
10316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010317PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010318 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319\n\
10320Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010321iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322
10323static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010324unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010326 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Martin v. Löwis18e16552006-02-15 17:27:45 +000010329static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330unicode_length(PyUnicodeObject *self)
10331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (PyUnicode_READY(self) == -1)
10333 return -1;
10334 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335}
10336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010337PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010340Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010341done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342
10343static PyObject *
10344unicode_ljust(PyUnicodeObject *self, PyObject *args)
10345{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010346 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 Py_UCS4 fillchar = ' ';
10348
10349 if (PyUnicode_READY(self) == -1)
10350 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010351
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010352 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353 return NULL;
10354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356 Py_INCREF(self);
10357 return (PyObject*) self;
10358 }
10359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361}
10362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010363PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010364 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010366Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367
10368static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010369unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371 return fixup(self, fixlower);
10372}
10373
/* Strip modes.  Each value doubles as the index of the matching entry in
   stripformat below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
10382
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010383/* externally visible for str.strip(unicode) */
10384PyObject *
10385_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 void *data;
10388 int kind;
10389 Py_ssize_t i, j, len;
10390 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10393 return NULL;
10394
10395 kind = PyUnicode_KIND(self);
10396 data = PyUnicode_DATA(self);
10397 len = PyUnicode_GET_LENGTH(self);
10398 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10399 PyUnicode_DATA(sepobj),
10400 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010401
Benjamin Peterson14339b62009-01-31 16:36:08 +000010402 i = 0;
10403 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 while (i < len &&
10405 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010406 i++;
10407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010408 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010409
Benjamin Peterson14339b62009-01-31 16:36:08 +000010410 j = len;
10411 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010412 do {
10413 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 } while (j >= i &&
10415 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010417 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010418
Benjamin Peterson14339b62009-01-31 16:36:08 +000010419 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 Py_INCREF(self);
10421 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010422 }
10423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010425}
10426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427/* Assumes an already ready self string. */
10428
10429static PyObject *
10430substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10431{
10432 const int kind = PyUnicode_KIND(self);
10433 void *data = PyUnicode_DATA(self);
10434 Py_UCS4 maxchar = 0;
10435 Py_ssize_t i;
10436 PyObject *unicode;
10437
10438 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10439 PyErr_BadInternalCall();
10440 return NULL;
10441 }
10442
10443 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10444 Py_INCREF(self);
10445 return (PyObject*)self;
10446 }
10447
10448 for (i = 0; i < len; ++i) {
10449 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10450 if (ch > maxchar)
10451 maxchar = ch;
10452 }
10453
10454 unicode = PyUnicode_New(len, maxchar);
10455 if (unicode == NULL)
10456 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010457 if (PyUnicode_CopyCharacters(unicode, 0,
10458 (PyObject*)self, start, len) < 0)
10459 {
10460 Py_DECREF(unicode);
10461 return NULL;
10462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 return unicode;
10464}
10465
10466PyObject*
10467PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10468{
10469 unsigned char *data;
10470 int kind;
10471
10472 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10473 && PyUnicode_CheckExact(self))
10474 {
10475 Py_INCREF(self);
10476 return (PyObject *)self;
10477 }
10478
10479 if ((end - start) == 1)
10480 return unicode_getitem((PyUnicodeObject*)self, start);
10481
10482 if (PyUnicode_READY(self) == -1)
10483 return NULL;
10484 kind = PyUnicode_KIND(self);
10485 data = PyUnicode_1BYTE_DATA(self);
10486 return PyUnicode_FromKindAndData(kind, data + PyUnicode_KIND_SIZE(kind, start),
10487 end-start);
10488}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
10490static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010491do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 int kind;
10494 void *data;
10495 Py_ssize_t len, i, j;
10496
10497 if (PyUnicode_READY(self) == -1)
10498 return NULL;
10499
10500 kind = PyUnicode_KIND(self);
10501 data = PyUnicode_DATA(self);
10502 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010503
Benjamin Peterson14339b62009-01-31 16:36:08 +000010504 i = 0;
10505 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010507 i++;
10508 }
10509 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010510
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511 j = len;
10512 if (striptype != LEFTSTRIP) {
10513 do {
10514 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 j++;
10517 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010518
Benjamin Peterson14339b62009-01-31 16:36:08 +000010519 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10520 Py_INCREF(self);
10521 return (PyObject*)self;
10522 }
10523 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010527
10528static PyObject *
10529do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10530{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010531 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532
Benjamin Peterson14339b62009-01-31 16:36:08 +000010533 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10534 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010535
Benjamin Peterson14339b62009-01-31 16:36:08 +000010536 if (sep != NULL && sep != Py_None) {
10537 if (PyUnicode_Check(sep))
10538 return _PyUnicode_XStrip(self, striptype, sep);
10539 else {
10540 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 "%s arg must be None or str",
10542 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010543 return NULL;
10544 }
10545 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010546
Benjamin Peterson14339b62009-01-31 16:36:08 +000010547 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010548}
10549
10550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010551PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553\n\
10554Return a copy of the string S with leading and trailing\n\
10555whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010556If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010557
10558static PyObject *
10559unicode_strip(PyUnicodeObject *self, PyObject *args)
10560{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 if (PyTuple_GET_SIZE(args) == 0)
10562 return do_strip(self, BOTHSTRIP); /* Common case */
10563 else
10564 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010565}
10566
10567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010568PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010570\n\
10571Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010572If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010573
10574static PyObject *
10575unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10576{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 if (PyTuple_GET_SIZE(args) == 0)
10578 return do_strip(self, LEFTSTRIP); /* Common case */
10579 else
10580 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010581}
10582
10583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010586\n\
10587Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010588If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010589
10590static PyObject *
10591unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10592{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010593 if (PyTuple_GET_SIZE(args) == 0)
10594 return do_strip(self, RIGHTSTRIP); /* Common case */
10595 else
10596 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010597}
10598
10599
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010601unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602{
10603 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 Py_ssize_t nchars, n;
10605 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
Georg Brandl222de0f2009-04-12 12:01:50 +000010607 if (len < 1) {
10608 Py_INCREF(unicode_empty);
10609 return (PyObject *)unicode_empty;
10610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Tim Peters7a29bd52001-09-12 03:03:31 +000010612 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 /* no repeat, return original string */
10614 Py_INCREF(str);
10615 return (PyObject*) str;
10616 }
Tim Peters8f422462000-09-09 06:13:41 +000010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (PyUnicode_READY(str) == -1)
10619 return NULL;
10620
Tim Peters8f422462000-09-09 06:13:41 +000010621 /* ensure # of chars needed doesn't overflow int and # of bytes
10622 * needed doesn't overflow size_t
10623 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 nchars = len * PyUnicode_GET_LENGTH(str);
10625 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010626 PyErr_SetString(PyExc_OverflowError,
10627 "repeated string is too long");
10628 return NULL;
10629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 char_size = PyUnicode_CHARACTER_SIZE(str);
10631 nbytes = (nchars + 1) * char_size;
10632 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010633 PyErr_SetString(PyExc_OverflowError,
10634 "repeated string is too long");
10635 return NULL;
10636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 if (!u)
10639 return NULL;
10640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (PyUnicode_GET_LENGTH(str) == 1) {
10642 const int kind = PyUnicode_KIND(str);
10643 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10644 void *to = PyUnicode_DATA(u);
10645 for (n = 0; n < len; ++n)
10646 PyUnicode_WRITE(kind, to, n, fill_char);
10647 }
10648 else {
10649 /* number of characters copied this far */
10650 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10651 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10652 char *to = (char *) PyUnicode_DATA(u);
10653 Py_MEMCPY(to, PyUnicode_DATA(str),
10654 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 n = (done <= nchars-done) ? done : nchars-done;
10657 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 }
10661
10662 return (PyObject*) u;
10663}
10664
Alexander Belopolsky40018472011-02-26 01:02:56 +000010665PyObject *
10666PyUnicode_Replace(PyObject *obj,
10667 PyObject *subobj,
10668 PyObject *replobj,
10669 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670{
10671 PyObject *self;
10672 PyObject *str1;
10673 PyObject *str2;
10674 PyObject *result;
10675
10676 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 Py_DECREF(self);
10682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
10684 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 Py_DECREF(self);
10687 Py_DECREF(str1);
10688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 Py_DECREF(self);
10692 Py_DECREF(str1);
10693 Py_DECREF(str2);
10694 return result;
10695}
10696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010697PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010698 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699\n\
10700Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010701old replaced by new. If the optional argument count is\n\
10702given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703
10704static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 PyObject *str1;
10708 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010709 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710 PyObject *result;
10711
Martin v. Löwis18e16552006-02-15 17:27:45 +000010712 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 str1 = PyUnicode_FromObject(str1);
10717 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10718 return NULL;
10719 str2 = PyUnicode_FromObject(str2);
10720 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 Py_DECREF(str1);
10722 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724
10725 result = replace(self, str1, str2, maxcount);
10726
10727 Py_DECREF(str1);
10728 Py_DECREF(str2);
10729 return result;
10730}
10731
Alexander Belopolsky40018472011-02-26 01:02:56 +000010732static PyObject *
10733unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010735 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 Py_ssize_t isize;
10737 Py_ssize_t osize, squote, dquote, i, o;
10738 Py_UCS4 max, quote;
10739 int ikind, okind;
10740 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010743 return NULL;
10744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 isize = PyUnicode_GET_LENGTH(unicode);
10746 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 /* Compute length of output, quote characters, and
10749 maximum character */
10750 osize = 2; /* quotes */
10751 max = 127;
10752 squote = dquote = 0;
10753 ikind = PyUnicode_KIND(unicode);
10754 for (i = 0; i < isize; i++) {
10755 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10756 switch (ch) {
10757 case '\'': squote++; osize++; break;
10758 case '"': dquote++; osize++; break;
10759 case '\\': case '\t': case '\r': case '\n':
10760 osize += 2; break;
10761 default:
10762 /* Fast-path ASCII */
10763 if (ch < ' ' || ch == 0x7f)
10764 osize += 4; /* \xHH */
10765 else if (ch < 0x7f)
10766 osize++;
10767 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10768 osize++;
10769 max = ch > max ? ch : max;
10770 }
10771 else if (ch < 0x100)
10772 osize += 4; /* \xHH */
10773 else if (ch < 0x10000)
10774 osize += 6; /* \uHHHH */
10775 else
10776 osize += 10; /* \uHHHHHHHH */
10777 }
10778 }
10779
10780 quote = '\'';
10781 if (squote) {
10782 if (dquote)
10783 /* Both squote and dquote present. Use squote,
10784 and escape them */
10785 osize += squote;
10786 else
10787 quote = '"';
10788 }
10789
10790 repr = PyUnicode_New(osize, max);
10791 if (repr == NULL)
10792 return NULL;
10793 okind = PyUnicode_KIND(repr);
10794 odata = PyUnicode_DATA(repr);
10795
10796 PyUnicode_WRITE(okind, odata, 0, quote);
10797 PyUnicode_WRITE(okind, odata, osize-1, quote);
10798
10799 for (i = 0, o = 1; i < isize; i++) {
10800 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010801
10802 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if ((ch == quote) || (ch == '\\')) {
10804 PyUnicode_WRITE(okind, odata, o++, '\\');
10805 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010806 continue;
10807 }
10808
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010810 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 PyUnicode_WRITE(okind, odata, o++, '\\');
10812 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010813 }
10814 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 PyUnicode_WRITE(okind, odata, o++, '\\');
10816 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010817 }
10818 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 PyUnicode_WRITE(okind, odata, o++, '\\');
10820 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010821 }
10822
10823 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010824 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 PyUnicode_WRITE(okind, odata, o++, '\\');
10826 PyUnicode_WRITE(okind, odata, o++, 'x');
10827 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10828 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010829 }
10830
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 /* Copy ASCII characters as-is */
10832 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010834 }
10835
Benjamin Peterson29060642009-01-31 22:14:21 +000010836 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010837 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010838 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010839 (categories Z* and C* except ASCII space)
10840 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010842 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (ch <= 0xff) {
10844 PyUnicode_WRITE(okind, odata, o++, '\\');
10845 PyUnicode_WRITE(okind, odata, o++, 'x');
10846 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10847 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010848 }
10849 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 else if (ch >= 0x10000) {
10851 PyUnicode_WRITE(okind, odata, o++, '\\');
10852 PyUnicode_WRITE(okind, odata, o++, 'U');
10853 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10854 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10855 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10856 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10857 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010861 }
10862 /* Map 16-bit characters to '\uxxxx' */
10863 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 PyUnicode_WRITE(okind, odata, o++, '\\');
10865 PyUnicode_WRITE(okind, odata, o++, 'u');
10866 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10867 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10868 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10869 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010870 }
10871 }
10872 /* Copy characters as-is */
10873 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010875 }
10876 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010879 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880}
10881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010882PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884\n\
10885Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010886such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887arguments start and end are interpreted as in slice notation.\n\
10888\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010889Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
10891static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893{
Jesus Ceaac451502011-04-20 17:09:23 +020010894 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010895 Py_ssize_t start;
10896 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010897 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Jesus Ceaac451502011-04-20 17:09:23 +020010899 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10900 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (PyUnicode_READY(self) == -1)
10904 return NULL;
10905 if (PyUnicode_READY(substring) == -1)
10906 return NULL;
10907
10908 result = any_find_slice(
10909 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10910 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 if (result == -2)
10916 return NULL;
10917
Christian Heimes217cfd12007-12-02 14:31:20 +000010918 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919}
10920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010922 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010924Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928{
Jesus Ceaac451502011-04-20 17:09:23 +020010929 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010930 Py_ssize_t start;
10931 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
Jesus Ceaac451502011-04-20 17:09:23 +020010934 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10935 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (PyUnicode_READY(self) == -1)
10939 return NULL;
10940 if (PyUnicode_READY(substring) == -1)
10941 return NULL;
10942
10943 result = any_find_slice(
10944 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10945 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (result == -2)
10951 return NULL;
10952
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 if (result < 0) {
10954 PyErr_SetString(PyExc_ValueError, "substring not found");
10955 return NULL;
10956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957
Christian Heimes217cfd12007-12-02 14:31:20 +000010958 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959}
10960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010964Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010965done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject *
10968unicode_rjust(PyUnicodeObject *self, PyObject *args)
10969{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010970 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 Py_UCS4 fillchar = ' ';
10972
10973 if (PyUnicode_READY(self) == -1)
10974 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010975
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010976 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return NULL;
10978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 Py_INCREF(self);
10981 return (PyObject*) self;
10982 }
10983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985}
10986
Alexander Belopolsky40018472011-02-26 01:02:56 +000010987PyObject *
10988PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
10990 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010991
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 s = PyUnicode_FromObject(s);
10993 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010994 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 if (sep != NULL) {
10996 sep = PyUnicode_FromObject(sep);
10997 if (sep == NULL) {
10998 Py_DECREF(s);
10999 return NULL;
11000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 }
11002
11003 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11004
11005 Py_DECREF(s);
11006 Py_XDECREF(sep);
11007 return result;
11008}
11009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011010PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012\n\
11013Return a list of the words in S, using sep as the\n\
11014delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011015splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011016whitespace string is a separator and empty strings are\n\
11017removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
11019static PyObject*
11020unicode_split(PyUnicodeObject *self, PyObject *args)
11021{
11022 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011023 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
Martin v. Löwis18e16552006-02-15 17:27:45 +000011025 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 return NULL;
11027
11028 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034}
11035
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036PyObject *
11037PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11038{
11039 PyObject* str_obj;
11040 PyObject* sep_obj;
11041 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 int kind1, kind2, kind;
11043 void *buf1 = NULL, *buf2 = NULL;
11044 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011045
11046 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011049 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011051 Py_DECREF(str_obj);
11052 return NULL;
11053 }
11054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 kind1 = PyUnicode_KIND(str_in);
11056 kind2 = PyUnicode_KIND(sep_obj);
11057 kind = kind1 > kind2 ? kind1 : kind2;
11058 buf1 = PyUnicode_DATA(str_in);
11059 if (kind1 != kind)
11060 buf1 = _PyUnicode_AsKind(str_in, kind);
11061 if (!buf1)
11062 goto onError;
11063 buf2 = PyUnicode_DATA(sep_obj);
11064 if (kind2 != kind)
11065 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11066 if (!buf2)
11067 goto onError;
11068 len1 = PyUnicode_GET_LENGTH(str_obj);
11069 len2 = PyUnicode_GET_LENGTH(sep_obj);
11070
11071 switch(PyUnicode_KIND(str_in)) {
11072 case PyUnicode_1BYTE_KIND:
11073 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11074 break;
11075 case PyUnicode_2BYTE_KIND:
11076 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11077 break;
11078 case PyUnicode_4BYTE_KIND:
11079 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11080 break;
11081 default:
11082 assert(0);
11083 out = 0;
11084 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011085
11086 Py_DECREF(sep_obj);
11087 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 if (kind1 != kind)
11089 PyMem_Free(buf1);
11090 if (kind2 != kind)
11091 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
11093 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 onError:
11095 Py_DECREF(sep_obj);
11096 Py_DECREF(str_obj);
11097 if (kind1 != kind && buf1)
11098 PyMem_Free(buf1);
11099 if (kind2 != kind && buf2)
11100 PyMem_Free(buf2);
11101 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011102}
11103
11104
11105PyObject *
11106PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11107{
11108 PyObject* str_obj;
11109 PyObject* sep_obj;
11110 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 int kind1, kind2, kind;
11112 void *buf1 = NULL, *buf2 = NULL;
11113 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011114
11115 str_obj = PyUnicode_FromObject(str_in);
11116 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011118 sep_obj = PyUnicode_FromObject(sep_in);
11119 if (!sep_obj) {
11120 Py_DECREF(str_obj);
11121 return NULL;
11122 }
11123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 kind1 = PyUnicode_KIND(str_in);
11125 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011126 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 buf1 = PyUnicode_DATA(str_in);
11128 if (kind1 != kind)
11129 buf1 = _PyUnicode_AsKind(str_in, kind);
11130 if (!buf1)
11131 goto onError;
11132 buf2 = PyUnicode_DATA(sep_obj);
11133 if (kind2 != kind)
11134 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11135 if (!buf2)
11136 goto onError;
11137 len1 = PyUnicode_GET_LENGTH(str_obj);
11138 len2 = PyUnicode_GET_LENGTH(sep_obj);
11139
11140 switch(PyUnicode_KIND(str_in)) {
11141 case PyUnicode_1BYTE_KIND:
11142 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11143 break;
11144 case PyUnicode_2BYTE_KIND:
11145 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11146 break;
11147 case PyUnicode_4BYTE_KIND:
11148 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11149 break;
11150 default:
11151 assert(0);
11152 out = 0;
11153 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011154
11155 Py_DECREF(sep_obj);
11156 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (kind1 != kind)
11158 PyMem_Free(buf1);
11159 if (kind2 != kind)
11160 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011161
11162 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 onError:
11164 Py_DECREF(sep_obj);
11165 Py_DECREF(str_obj);
11166 if (kind1 != kind && buf1)
11167 PyMem_Free(buf1);
11168 if (kind2 != kind && buf2)
11169 PyMem_Free(buf2);
11170 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171}
11172
11173PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011176Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011177the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011178found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
11180static PyObject*
11181unicode_partition(PyUnicodeObject *self, PyObject *separator)
11182{
11183 return PyUnicode_Partition((PyObject *)self, separator);
11184}
11185
11186PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011187 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011189Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011190the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011191separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011192
11193static PyObject*
11194unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11195{
11196 return PyUnicode_RPartition((PyObject *)self, separator);
11197}
11198
Alexander Belopolsky40018472011-02-26 01:02:56 +000011199PyObject *
11200PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011201{
11202 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011203
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011204 s = PyUnicode_FromObject(s);
11205 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011206 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011207 if (sep != NULL) {
11208 sep = PyUnicode_FromObject(sep);
11209 if (sep == NULL) {
11210 Py_DECREF(s);
11211 return NULL;
11212 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011213 }
11214
11215 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11216
11217 Py_DECREF(s);
11218 Py_XDECREF(sep);
11219 return result;
11220}
11221
11222PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011224\n\
11225Return a list of the words in S, using sep as the\n\
11226delimiter string, starting at the end of the string and\n\
11227working to the front. If maxsplit is given, at most maxsplit\n\
11228splits are done. If sep is not specified, any whitespace string\n\
11229is a separator.");
11230
11231static PyObject*
11232unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11233{
11234 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011235 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011236
Martin v. Löwis18e16552006-02-15 17:27:45 +000011237 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011238 return NULL;
11239
11240 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011242 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011244 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011246}
11247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011248PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250\n\
11251Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011252Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
11255static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011256unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011258 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011259 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011261 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11262 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 return NULL;
11264
Guido van Rossum86662912000-04-11 15:38:46 +000011265 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266}
11267
11268static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011269PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270{
Walter Dörwald346737f2007-05-31 10:44:43 +000011271 if (PyUnicode_CheckExact(self)) {
11272 Py_INCREF(self);
11273 return self;
11274 } else
11275 /* Subtype -- return genuine unicode string with the same value. */
11276 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
11277 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278}
11279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282\n\
11283Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011284and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
11286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011287unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 return fixup(self, fixswapcase);
11290}
11291
Georg Brandlceee0772007-11-27 23:48:05 +000011292PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011294\n\
11295Return a translation table usable for str.translate().\n\
11296If there is only one argument, it must be a dictionary mapping Unicode\n\
11297ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011298Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011299If there are two arguments, they must be strings of equal length, and\n\
11300in the resulting dictionary, each character in x will be mapped to the\n\
11301character at the same position in y. If there is a third argument, it\n\
11302must be a string, whose characters will be mapped to None in the result.");
11303
11304static PyObject*
11305unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11306{
11307 PyObject *x, *y = NULL, *z = NULL;
11308 PyObject *new = NULL, *key, *value;
11309 Py_ssize_t i = 0;
11310 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011311
Georg Brandlceee0772007-11-27 23:48:05 +000011312 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11313 return NULL;
11314 new = PyDict_New();
11315 if (!new)
11316 return NULL;
11317 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 int x_kind, y_kind, z_kind;
11319 void *x_data, *y_data, *z_data;
11320
Georg Brandlceee0772007-11-27 23:48:05 +000011321 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011322 if (!PyUnicode_Check(x)) {
11323 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11324 "be a string if there is a second argument");
11325 goto err;
11326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011328 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11329 "arguments must have equal length");
11330 goto err;
11331 }
11332 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 x_kind = PyUnicode_KIND(x);
11334 y_kind = PyUnicode_KIND(y);
11335 x_data = PyUnicode_DATA(x);
11336 y_data = PyUnicode_DATA(y);
11337 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11338 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11339 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011340 if (!key || !value)
11341 goto err;
11342 res = PyDict_SetItem(new, key, value);
11343 Py_DECREF(key);
11344 Py_DECREF(value);
11345 if (res < 0)
11346 goto err;
11347 }
11348 /* create entries for deleting chars in z */
11349 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 z_kind = PyUnicode_KIND(z);
11351 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011352 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011354 if (!key)
11355 goto err;
11356 res = PyDict_SetItem(new, key, Py_None);
11357 Py_DECREF(key);
11358 if (res < 0)
11359 goto err;
11360 }
11361 }
11362 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 int kind;
11364 void *data;
11365
Georg Brandlceee0772007-11-27 23:48:05 +000011366 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011367 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011368 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11369 "to maketrans it must be a dict");
11370 goto err;
11371 }
11372 /* copy entries into the new dict, converting string keys to int keys */
11373 while (PyDict_Next(x, &i, &key, &value)) {
11374 if (PyUnicode_Check(key)) {
11375 /* convert string keys to integer keys */
11376 PyObject *newkey;
11377 if (PyUnicode_GET_SIZE(key) != 1) {
11378 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11379 "table must be of length 1");
11380 goto err;
11381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 kind = PyUnicode_KIND(key);
11383 data = PyUnicode_DATA(key);
11384 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011385 if (!newkey)
11386 goto err;
11387 res = PyDict_SetItem(new, newkey, value);
11388 Py_DECREF(newkey);
11389 if (res < 0)
11390 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011391 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011392 /* just keep integer keys */
11393 if (PyDict_SetItem(new, key, value) < 0)
11394 goto err;
11395 } else {
11396 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11397 "be strings or integers");
11398 goto err;
11399 }
11400 }
11401 }
11402 return new;
11403 err:
11404 Py_DECREF(new);
11405 return NULL;
11406}
11407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011408PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410\n\
11411Return a copy of the string S, where all characters have been mapped\n\
11412through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011413Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011414Unmapped characters are left untouched. Characters mapped to None\n\
11415are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
11417static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421}
11422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
11428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011429unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 return fixup(self, fixupper);
11432}
11433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011437Pad a numeric string S with zeros on the left, to fill a field\n\
11438of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
11441unicode_zfill(PyUnicodeObject *self, PyObject *args)
11442{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011443 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011445 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 int kind;
11447 void *data;
11448 Py_UCS4 chr;
11449
11450 if (PyUnicode_READY(self) == -1)
11451 return NULL;
11452
Martin v. Löwis18e16552006-02-15 17:27:45 +000011453 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 return NULL;
11455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011457 if (PyUnicode_CheckExact(self)) {
11458 Py_INCREF(self);
11459 return (PyObject*) self;
11460 }
11461 else
11462 return PyUnicode_FromUnicode(
11463 PyUnicode_AS_UNICODE(self),
11464 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 }
11467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
11470 u = pad(self, fill, 0, '0');
11471
Walter Dörwald068325e2002-04-15 13:36:47 +000011472 if (u == NULL)
11473 return NULL;
11474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 kind = PyUnicode_KIND(u);
11476 data = PyUnicode_DATA(u);
11477 chr = PyUnicode_READ(kind, data, fill);
11478
11479 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 PyUnicode_WRITE(kind, data, 0, chr);
11482 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 }
11484
11485 return (PyObject*) u;
11486}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011499Return True if S starts with the specified prefix, False otherwise.\n\
11500With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011501With optional end, stop comparing S at that position.\n\
11502prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
11504static PyObject *
11505unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011508 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011510 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011511 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011512 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Jesus Ceaac451502011-04-20 17:09:23 +020011514 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011516 if (PyTuple_Check(subobj)) {
11517 Py_ssize_t i;
11518 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11519 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011521 if (substring == NULL)
11522 return NULL;
11523 result = tailmatch(self, substring, start, end, -1);
11524 Py_DECREF(substring);
11525 if (result) {
11526 Py_RETURN_TRUE;
11527 }
11528 }
11529 /* nothing matched */
11530 Py_RETURN_FALSE;
11531 }
11532 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011533 if (substring == NULL) {
11534 if (PyErr_ExceptionMatches(PyExc_TypeError))
11535 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11536 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011538 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011539 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011541 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
11544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011545PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011548Return True if S ends with the specified suffix, False otherwise.\n\
11549With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011550With optional end, stop comparing S at that position.\n\
11551suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
11553static PyObject *
11554unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011557 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011559 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011560 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011561 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
Jesus Ceaac451502011-04-20 17:09:23 +020011563 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011565 if (PyTuple_Check(subobj)) {
11566 Py_ssize_t i;
11567 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11568 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011570 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011572 result = tailmatch(self, substring, start, end, +1);
11573 Py_DECREF(substring);
11574 if (result) {
11575 Py_RETURN_TRUE;
11576 }
11577 }
11578 Py_RETURN_FALSE;
11579 }
11580 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011581 if (substring == NULL) {
11582 if (PyErr_ExceptionMatches(PyExc_TypeError))
11583 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11584 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011586 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011587 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011589 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590}
11591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011593
11594PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011596\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011597Return a formatted version of S, using substitutions from args and kwargs.\n\
11598The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011599
Eric Smith27bbca62010-11-04 17:06:58 +000011600PyDoc_STRVAR(format_map__doc__,
11601 "S.format_map(mapping) -> str\n\
11602\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011603Return a formatted version of S, using substitutions from mapping.\n\
11604The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011605
Eric Smith4a7d76d2008-05-30 18:10:19 +000011606static PyObject *
11607unicode__format__(PyObject* self, PyObject* args)
11608{
11609 PyObject *format_spec;
11610
11611 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11612 return NULL;
11613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11615 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011616}
11617
Eric Smith8c663262007-08-25 02:26:07 +000011618PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011620\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011621Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011622
11623static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011624unicode__sizeof__(PyUnicodeObject *v)
11625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_ssize_t size;
11627
11628 /* If it's a compact object, account for base structure +
11629 character data. */
11630 if (PyUnicode_IS_COMPACT_ASCII(v))
11631 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11632 else if (PyUnicode_IS_COMPACT(v))
11633 size = sizeof(PyCompactUnicodeObject) +
11634 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11635 else {
11636 /* If it is a two-block object, account for base object, and
11637 for character block if present. */
11638 size = sizeof(PyUnicodeObject);
11639 if (v->data.any)
11640 size += (PyUnicode_GET_LENGTH(v) + 1) *
11641 PyUnicode_CHARACTER_SIZE(v);
11642 }
11643 /* If the wstr pointer is present, account for it unless it is shared
11644 with the data pointer. Since PyUnicode_DATA will crash if the object
11645 is not ready, check whether it's either not ready (in which case the
11646 data is entirely in wstr) or if the data is not shared. */
11647 if (_PyUnicode_WSTR(v) &&
11648 (!PyUnicode_IS_READY(v) ||
11649 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11650 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11651 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11652 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11653
11654 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011655}
11656
11657PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011659
11660static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011661unicode_getnewargs(PyUnicodeObject *v)
11662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 PyObject *copy;
11664 unsigned char *data;
11665 int kind;
11666 if (PyUnicode_READY(v) == -1)
11667 return NULL;
11668 kind = PyUnicode_KIND(v);
11669 data = PyUnicode_1BYTE_DATA(v);
11670 copy = PyUnicode_FromKindAndData(kind, data, PyUnicode_GET_LENGTH(v));
11671 if (!copy)
11672 return NULL;
11673 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011674}
11675
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676static PyMethodDef unicode_methods[] = {
11677
11678 /* Order is according to common usage: often used methods should
11679 appear first, since lookup is done sequentially. */
11680
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011681 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011682 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11683 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011684 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011685 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11686 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11687 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11688 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11689 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11690 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11691 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011692 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011693 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11694 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11695 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011697 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11698 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11699 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011701 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011702 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011703 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011704 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11705 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11706 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11707 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11708 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11709 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11710 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11711 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11712 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11713 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11714 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11715 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11716 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11717 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011718 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011719 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011720 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011721 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011722 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011723 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011724 {"maketrans", (PyCFunction) unicode_maketrans,
11725 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011726 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011727#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011728 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729#endif
11730
11731#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011732 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011733 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734#endif
11735
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 {NULL, NULL}
11738};
11739
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011740static PyObject *
11741unicode_mod(PyObject *v, PyObject *w)
11742{
Brian Curtindfc80e32011-08-10 20:28:54 -050011743 if (!PyUnicode_Check(v))
11744 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011746}
11747
11748static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 0, /*nb_add*/
11750 0, /*nb_subtract*/
11751 0, /*nb_multiply*/
11752 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011753};
11754
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 (lenfunc) unicode_length, /* sq_length */
11757 PyUnicode_Concat, /* sq_concat */
11758 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11759 (ssizeargfunc) unicode_getitem, /* sq_item */
11760 0, /* sq_slice */
11761 0, /* sq_ass_item */
11762 0, /* sq_ass_slice */
11763 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764};
11765
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011766static PyObject*
11767unicode_subscript(PyUnicodeObject* self, PyObject* item)
11768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011772 if (PyIndex_Check(item)) {
11773 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011774 if (i == -1 && PyErr_Occurred())
11775 return NULL;
11776 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011778 return unicode_getitem(self, i);
11779 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011780 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011782 Py_UNICODE* result_buf;
11783 PyObject* result;
11784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011787 return NULL;
11788 }
11789
11790 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 return PyUnicode_New(0, 0);
11792 } else if (start == 0 && step == 1 &&
11793 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011794 PyUnicode_CheckExact(self)) {
11795 Py_INCREF(self);
11796 return (PyObject *)self;
11797 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011799 } else {
11800 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011801 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11802 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 if (result_buf == NULL)
11805 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011806
11807 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11808 result_buf[i] = source_buf[cur];
11809 }
Tim Petersced69f82003-09-16 20:30:58 +000011810
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011811 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011812 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011813 return result;
11814 }
11815 } else {
11816 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11817 return NULL;
11818 }
11819}
11820
11821static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 (lenfunc)unicode_length, /* mp_length */
11823 (binaryfunc)unicode_subscript, /* mp_subscript */
11824 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011825};
11826
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828/* Helpers for PyUnicode_Format() */
11829
11830static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011831getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011833 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 (*p_argidx)++;
11836 if (arglen < 0)
11837 return args;
11838 else
11839 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 }
11841 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 return NULL;
11844}
11845
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011846/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011848static PyObject *
11849formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011851 char *p;
11852 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011854
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 x = PyFloat_AsDouble(v);
11856 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011857 return NULL;
11858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011861
Eric Smith0923d1d2009-04-16 20:16:10 +000011862 p = PyOS_double_to_string(x, type, prec,
11863 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011864 if (p == NULL)
11865 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011867 PyMem_Free(p);
11868 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869}
11870
Tim Peters38fd5b62000-09-21 05:43:11 +000011871static PyObject*
11872formatlong(PyObject *val, int flags, int prec, int type)
11873{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011874 char *buf;
11875 int len;
11876 PyObject *str; /* temporary string object. */
11877 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011878
Benjamin Peterson14339b62009-01-31 16:36:08 +000011879 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11880 if (!str)
11881 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011883 Py_DECREF(str);
11884 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011885}
11886
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011889 size_t buflen,
11890 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011892 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011893 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (PyUnicode_GET_LENGTH(v) == 1) {
11895 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 buf[1] = '\0';
11897 return 1;
11898 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 goto onError;
11900 }
11901 else {
11902 /* Integer input truncated to a character */
11903 long x;
11904 x = PyLong_AsLong(v);
11905 if (x == -1 && PyErr_Occurred())
11906 goto onError;
11907
11908 if (x < 0 || x > 0x10ffff) {
11909 PyErr_SetString(PyExc_OverflowError,
11910 "%c arg not in range(0x110000)");
11911 return -1;
11912 }
11913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011915 buf[1] = '\0';
11916 return 1;
11917 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011918
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011920 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011922 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011929
Alexander Belopolsky40018472011-02-26 01:02:56 +000011930PyObject *
11931PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 void *fmt;
11934 int fmtkind;
11935 PyObject *result;
11936 Py_UCS4 *res, *res0;
11937 Py_UCS4 max;
11938 int kind;
11939 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011943
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 PyErr_BadInternalCall();
11946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11949 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 fmt = PyUnicode_DATA(uformat);
11952 fmtkind = PyUnicode_KIND(uformat);
11953 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11954 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
11956 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11958 if (res0 == NULL) {
11959 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
11963 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 arglen = PyTuple_Size(args);
11965 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 }
11967 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 arglen = -1;
11969 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011971 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011972 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
11975 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 if (--rescnt < 0) {
11978 rescnt = fmtcnt + 100;
11979 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11981 if (res0 == NULL){
11982 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 }
11985 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011989 }
11990 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 /* Got a format specifier */
11992 int flags = 0;
11993 Py_ssize_t width = -1;
11994 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 Py_UCS4 c = '\0';
11996 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 int isnumok;
11998 PyObject *v = NULL;
11999 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 void *pbuf;
12001 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 Py_ssize_t len, len1;
12004 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 fmtpos++;
12007 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12008 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 Py_ssize_t keylen;
12010 PyObject *key;
12011 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012012
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 if (dict == NULL) {
12014 PyErr_SetString(PyExc_TypeError,
12015 "format requires a mapping");
12016 goto onError;
12017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 /* Skip over balanced parentheses */
12022 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 if (fmtcnt < 0 || pcount > 0) {
12031 PyErr_SetString(PyExc_ValueError,
12032 "incomplete format key");
12033 goto onError;
12034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 if (key == NULL)
12037 goto onError;
12038 if (args_owned) {
12039 Py_DECREF(args);
12040 args_owned = 0;
12041 }
12042 args = PyObject_GetItem(dict, key);
12043 Py_DECREF(key);
12044 if (args == NULL) {
12045 goto onError;
12046 }
12047 args_owned = 1;
12048 arglen = -1;
12049 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012050 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 case '-': flags |= F_LJUST; continue;
12054 case '+': flags |= F_SIGN; continue;
12055 case ' ': flags |= F_BLANK; continue;
12056 case '#': flags |= F_ALT; continue;
12057 case '0': flags |= F_ZERO; continue;
12058 }
12059 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012060 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 if (c == '*') {
12062 v = getnextarg(args, arglen, &argidx);
12063 if (v == NULL)
12064 goto onError;
12065 if (!PyLong_Check(v)) {
12066 PyErr_SetString(PyExc_TypeError,
12067 "* wants int");
12068 goto onError;
12069 }
12070 width = PyLong_AsLong(v);
12071 if (width == -1 && PyErr_Occurred())
12072 goto onError;
12073 if (width < 0) {
12074 flags |= F_LJUST;
12075 width = -width;
12076 }
12077 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 }
12080 else if (c >= '0' && c <= '9') {
12081 width = c - '0';
12082 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 if (c < '0' || c > '9')
12085 break;
12086 if ((width*10) / 10 != width) {
12087 PyErr_SetString(PyExc_ValueError,
12088 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 }
12091 width = width*10 + (c - '0');
12092 }
12093 }
12094 if (c == '.') {
12095 prec = 0;
12096 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 if (c == '*') {
12099 v = getnextarg(args, arglen, &argidx);
12100 if (v == NULL)
12101 goto onError;
12102 if (!PyLong_Check(v)) {
12103 PyErr_SetString(PyExc_TypeError,
12104 "* wants int");
12105 goto onError;
12106 }
12107 prec = PyLong_AsLong(v);
12108 if (prec == -1 && PyErr_Occurred())
12109 goto onError;
12110 if (prec < 0)
12111 prec = 0;
12112 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 }
12115 else if (c >= '0' && c <= '9') {
12116 prec = c - '0';
12117 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 if (c < '0' || c > '9')
12120 break;
12121 if ((prec*10) / 10 != prec) {
12122 PyErr_SetString(PyExc_ValueError,
12123 "prec too big");
12124 goto onError;
12125 }
12126 prec = prec*10 + (c - '0');
12127 }
12128 }
12129 } /* prec */
12130 if (fmtcnt >= 0) {
12131 if (c == 'h' || c == 'l' || c == 'L') {
12132 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 }
12135 }
12136 if (fmtcnt < 0) {
12137 PyErr_SetString(PyExc_ValueError,
12138 "incomplete format");
12139 goto onError;
12140 }
12141 if (c != '%') {
12142 v = getnextarg(args, arglen, &argidx);
12143 if (v == NULL)
12144 goto onError;
12145 }
12146 sign = 0;
12147 fill = ' ';
12148 switch (c) {
12149
12150 case '%':
12151 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 len = 1;
12156 break;
12157
12158 case 's':
12159 case 'r':
12160 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012161 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 temp = v;
12163 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 }
12165 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 if (c == 's')
12167 temp = PyObject_Str(v);
12168 else if (c == 'r')
12169 temp = PyObject_Repr(v);
12170 else
12171 temp = PyObject_ASCII(v);
12172 if (temp == NULL)
12173 goto onError;
12174 if (PyUnicode_Check(temp))
12175 /* nothing to do */;
12176 else {
12177 Py_DECREF(temp);
12178 PyErr_SetString(PyExc_TypeError,
12179 "%s argument has non-string str()");
12180 goto onError;
12181 }
12182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (PyUnicode_READY(temp) == -1) {
12184 Py_CLEAR(temp);
12185 goto onError;
12186 }
12187 pbuf = PyUnicode_DATA(temp);
12188 kind = PyUnicode_KIND(temp);
12189 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 if (prec >= 0 && len > prec)
12191 len = prec;
12192 break;
12193
12194 case 'i':
12195 case 'd':
12196 case 'u':
12197 case 'o':
12198 case 'x':
12199 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 isnumok = 0;
12201 if (PyNumber_Check(v)) {
12202 PyObject *iobj=NULL;
12203
12204 if (PyLong_Check(v)) {
12205 iobj = v;
12206 Py_INCREF(iobj);
12207 }
12208 else {
12209 iobj = PyNumber_Long(v);
12210 }
12211 if (iobj!=NULL) {
12212 if (PyLong_Check(iobj)) {
12213 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012214 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012215 Py_DECREF(iobj);
12216 if (!temp)
12217 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 if (PyUnicode_READY(temp) == -1) {
12219 Py_CLEAR(temp);
12220 goto onError;
12221 }
12222 pbuf = PyUnicode_DATA(temp);
12223 kind = PyUnicode_KIND(temp);
12224 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 sign = 1;
12226 }
12227 else {
12228 Py_DECREF(iobj);
12229 }
12230 }
12231 }
12232 if (!isnumok) {
12233 PyErr_Format(PyExc_TypeError,
12234 "%%%c format: a number is required, "
12235 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12236 goto onError;
12237 }
12238 if (flags & F_ZERO)
12239 fill = '0';
12240 break;
12241
12242 case 'e':
12243 case 'E':
12244 case 'f':
12245 case 'F':
12246 case 'g':
12247 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012248 temp = formatfloat(v, flags, prec, c);
12249 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 if (PyUnicode_READY(temp) == -1) {
12252 Py_CLEAR(temp);
12253 goto onError;
12254 }
12255 pbuf = PyUnicode_DATA(temp);
12256 kind = PyUnicode_KIND(temp);
12257 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 sign = 1;
12259 if (flags & F_ZERO)
12260 fill = '0';
12261 break;
12262
12263 case 'c':
12264 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012266 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 if (len < 0)
12268 goto onError;
12269 break;
12270
12271 default:
12272 PyErr_Format(PyExc_ValueError,
12273 "unsupported format character '%c' (0x%x) "
12274 "at index %zd",
12275 (31<=c && c<=126) ? (char)c : '?',
12276 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 goto onError;
12279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 /* pbuf is initialized here. */
12281 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12284 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12285 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 len--;
12287 }
12288 else if (flags & F_SIGN)
12289 sign = '+';
12290 else if (flags & F_BLANK)
12291 sign = ' ';
12292 else
12293 sign = 0;
12294 }
12295 if (width < len)
12296 width = len;
12297 if (rescnt - (sign != 0) < width) {
12298 reslen -= rescnt;
12299 rescnt = width + fmtcnt + 100;
12300 reslen += rescnt;
12301 if (reslen < 0) {
12302 Py_XDECREF(temp);
12303 PyErr_NoMemory();
12304 goto onError;
12305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12307 if (res0 == 0) {
12308 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 Py_XDECREF(temp);
12310 goto onError;
12311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 }
12314 if (sign) {
12315 if (fill != ' ')
12316 *res++ = sign;
12317 rescnt--;
12318 if (width > len)
12319 width--;
12320 }
12321 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12323 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12326 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 }
12328 rescnt -= 2;
12329 width -= 2;
12330 if (width < 0)
12331 width = 0;
12332 len -= 2;
12333 }
12334 if (width > len && !(flags & F_LJUST)) {
12335 do {
12336 --rescnt;
12337 *res++ = fill;
12338 } while (--width > len);
12339 }
12340 if (fill == ' ') {
12341 if (sign)
12342 *res++ = sign;
12343 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12345 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12346 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12347 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 }
12349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 /* Copy all characters, preserving len */
12351 len1 = len;
12352 while (len1--) {
12353 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12354 rescnt--;
12355 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 while (--width >= len) {
12357 --rescnt;
12358 *res++ = ' ';
12359 }
12360 if (dict && (argidx < arglen) && c != '%') {
12361 PyErr_SetString(PyExc_TypeError,
12362 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012363 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 goto onError;
12365 }
12366 Py_XDECREF(temp);
12367 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 } /* until end */
12369 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 PyErr_SetString(PyExc_TypeError,
12371 "not all arguments converted during string formatting");
12372 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373 }
12374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375
12376 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12377 if (*res > max)
12378 max = *res;
12379 result = PyUnicode_New(reslen - rescnt, max);
12380 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 kind = PyUnicode_KIND(result);
12383 for (res = res0; res < res0+reslen-rescnt; res++)
12384 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12385 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 }
12389 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 return (PyObject *)result;
12391
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 Py_DECREF(uformat);
12395 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 }
12398 return NULL;
12399}
12400
Jeremy Hylton938ace62002-07-17 16:30:39 +000012401static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012402unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12403
Tim Peters6d6c1a32001-08-02 04:15:00 +000012404static PyObject *
12405unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12406{
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 static char *kwlist[] = {"object", "encoding", "errors", 0};
12409 char *encoding = NULL;
12410 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012411
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 if (type != &PyUnicode_Type)
12413 return unicode_subtype_new(type, args, kwds);
12414 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012416 return NULL;
12417 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 if (encoding == NULL && errors == NULL)
12420 return PyObject_Str(x);
12421 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012423}
12424
Guido van Rossume023fe02001-08-30 03:12:59 +000012425static PyObject *
12426unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 PyUnicodeObject *tmp, *pnew;
12429 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012431
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12433 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12434 if (tmp == NULL)
12435 return NULL;
12436 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12438 // it seems kind of strange that tp_alloc gets passed the size
12439 // of the unicode string because there will follow another
12440 // malloc.
12441 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12442 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 if (pnew == NULL) {
12444 Py_DECREF(tmp);
12445 return NULL;
12446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12448 if (_PyUnicode_WSTR(pnew) == NULL) {
12449 err = PyErr_NoMemory();
12450 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12453 _PyUnicode_WSTR_LENGTH(pnew) = n;
12454 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12455 _PyUnicode_STATE(pnew).interned = 0;
12456 _PyUnicode_STATE(pnew).kind = 0;
12457 _PyUnicode_STATE(pnew).compact = 0;
12458 _PyUnicode_STATE(pnew).ready = 0;
12459 _PyUnicode_STATE(pnew).ascii = 0;
12460 pnew->data.any = NULL;
12461 _PyUnicode_LENGTH(pnew) = 0;
12462 pnew->_base.utf8 = NULL;
12463 pnew->_base.utf8_length = 0;
12464
12465 if (PyUnicode_READY(pnew) == -1) {
12466 PyObject_FREE(_PyUnicode_WSTR(pnew));
12467 goto onError;
12468 }
12469
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 Py_DECREF(tmp);
12471 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472
12473 onError:
12474 _Py_ForgetReference((PyObject *)pnew);
12475 PyObject_Del(pnew);
12476 Py_DECREF(tmp);
12477 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012478}
12479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012480PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012482\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012483Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012484encoding defaults to the current default string encoding.\n\
12485errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012486
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012487static PyObject *unicode_iter(PyObject *seq);
12488
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012490 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 "str", /* tp_name */
12492 sizeof(PyUnicodeObject), /* tp_size */
12493 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012495 (destructor)unicode_dealloc, /* tp_dealloc */
12496 0, /* tp_print */
12497 0, /* tp_getattr */
12498 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012499 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 unicode_repr, /* tp_repr */
12501 &unicode_as_number, /* tp_as_number */
12502 &unicode_as_sequence, /* tp_as_sequence */
12503 &unicode_as_mapping, /* tp_as_mapping */
12504 (hashfunc) unicode_hash, /* tp_hash*/
12505 0, /* tp_call*/
12506 (reprfunc) unicode_str, /* tp_str */
12507 PyObject_GenericGetAttr, /* tp_getattro */
12508 0, /* tp_setattro */
12509 0, /* tp_as_buffer */
12510 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012512 unicode_doc, /* tp_doc */
12513 0, /* tp_traverse */
12514 0, /* tp_clear */
12515 PyUnicode_RichCompare, /* tp_richcompare */
12516 0, /* tp_weaklistoffset */
12517 unicode_iter, /* tp_iter */
12518 0, /* tp_iternext */
12519 unicode_methods, /* tp_methods */
12520 0, /* tp_members */
12521 0, /* tp_getset */
12522 &PyBaseObject_Type, /* tp_base */
12523 0, /* tp_dict */
12524 0, /* tp_descr_get */
12525 0, /* tp_descr_set */
12526 0, /* tp_dictoffset */
12527 0, /* tp_init */
12528 0, /* tp_alloc */
12529 unicode_new, /* tp_new */
12530 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531};
12532
12533/* Initialize the Unicode implementation */
12534
Thomas Wouters78890102000-07-22 19:25:51 +000012535void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012537 int i;
12538
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012541 0x000A, /* LINE FEED */
12542 0x000D, /* CARRIAGE RETURN */
12543 0x001C, /* FILE SEPARATOR */
12544 0x001D, /* GROUP SEPARATOR */
12545 0x001E, /* RECORD SEPARATOR */
12546 0x0085, /* NEXT LINE */
12547 0x2028, /* LINE SEPARATOR */
12548 0x2029, /* PARAGRAPH SEPARATOR */
12549 };
12550
Fred Drakee4315f52000-05-09 19:53:39 +000012551 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012553 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012555
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012556 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012558 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560
12561 /* initialize the linebreak bloom filter */
12562 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012564 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012565
12566 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567}
12568
12569/* Finalize the Unicode implementation */
12570
/* Release cached str objects.
 *
 * This implementation releases nothing and always returns 0.
 */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12576
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577void
Thomas Wouters78890102000-07-22 19:25:51 +000012578_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012580 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012582 Py_XDECREF(unicode_empty);
12583 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012584
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012585 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012586 if (unicode_latin1[i]) {
12587 Py_DECREF(unicode_latin1[i]);
12588 unicode_latin1[i] = NULL;
12589 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012590 }
Christian Heimesa156e092008-02-16 07:38:31 +000012591 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012593
Walter Dörwald16807132007-05-25 13:52:07 +000012594void
12595PyUnicode_InternInPlace(PyObject **p)
12596{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12598 PyObject *t;
12599 if (s == NULL || !PyUnicode_Check(s))
12600 Py_FatalError(
12601 "PyUnicode_InternInPlace: unicode strings only please!");
12602 /* If it's a subclass, we don't really know what putting
12603 it in the interned dict might do. */
12604 if (!PyUnicode_CheckExact(s))
12605 return;
12606 if (PyUnicode_CHECK_INTERNED(s))
12607 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 if (PyUnicode_READY(s) == -1) {
12609 assert(0 && "ready fail in intern...");
12610 return;
12611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012612 if (interned == NULL) {
12613 interned = PyDict_New();
12614 if (interned == NULL) {
12615 PyErr_Clear(); /* Don't leave an exception */
12616 return;
12617 }
12618 }
12619 /* It might be that the GetItem call fails even
12620 though the key is present in the dictionary,
12621 namely when this happens during a stack overflow. */
12622 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012624 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012625
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 if (t) {
12627 Py_INCREF(t);
12628 Py_DECREF(*p);
12629 *p = t;
12630 return;
12631 }
Walter Dörwald16807132007-05-25 13:52:07 +000012632
Benjamin Peterson14339b62009-01-31 16:36:08 +000012633 PyThreadState_GET()->recursion_critical = 1;
12634 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12635 PyErr_Clear();
12636 PyThreadState_GET()->recursion_critical = 0;
12637 return;
12638 }
12639 PyThreadState_GET()->recursion_critical = 0;
12640 /* The two references in interned are not counted by refcnt.
12641 The deallocator will take care of this */
12642 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012644}
12645
12646void
12647PyUnicode_InternImmortal(PyObject **p)
12648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12650
Benjamin Peterson14339b62009-01-31 16:36:08 +000012651 PyUnicode_InternInPlace(p);
12652 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012654 Py_INCREF(*p);
12655 }
Walter Dörwald16807132007-05-25 13:52:07 +000012656}
12657
12658PyObject *
12659PyUnicode_InternFromString(const char *cp)
12660{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012661 PyObject *s = PyUnicode_FromString(cp);
12662 if (s == NULL)
12663 return NULL;
12664 PyUnicode_InternInPlace(&s);
12665 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012666}
12667
Alexander Belopolsky40018472011-02-26 01:02:56 +000012668void
12669_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012670{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 PyObject *keys;
12672 PyUnicodeObject *s;
12673 Py_ssize_t i, n;
12674 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012675
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 if (interned == NULL || !PyDict_Check(interned))
12677 return;
12678 keys = PyDict_Keys(interned);
12679 if (keys == NULL || !PyList_Check(keys)) {
12680 PyErr_Clear();
12681 return;
12682 }
Walter Dörwald16807132007-05-25 13:52:07 +000012683
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12685 detector, interned unicode strings are not forcibly deallocated;
12686 rather, we give them their stolen references back, and then clear
12687 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012688
Benjamin Peterson14339b62009-01-31 16:36:08 +000012689 n = PyList_GET_SIZE(keys);
12690 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 for (i = 0; i < n; i++) {
12693 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (PyUnicode_READY(s) == -1)
12695 fprintf(stderr, "could not ready string\n");
12696 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 case SSTATE_NOT_INTERNED:
12698 /* XXX Shouldn't happen */
12699 break;
12700 case SSTATE_INTERNED_IMMORTAL:
12701 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 break;
12704 case SSTATE_INTERNED_MORTAL:
12705 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 break;
12708 default:
12709 Py_FatalError("Inconsistent interned string state.");
12710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012712 }
12713 fprintf(stderr, "total size of all interned strings: "
12714 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12715 "mortal/immortal\n", mortal_size, immortal_size);
12716 Py_DECREF(keys);
12717 PyDict_Clear(interned);
12718 Py_DECREF(interned);
12719 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012720}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012721
12722
12723/********************* Unicode Iterator **************************/
12724
12725typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 PyObject_HEAD
12727 Py_ssize_t it_index;
12728 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012729} unicodeiterobject;
12730
12731static void
12732unicodeiter_dealloc(unicodeiterobject *it)
12733{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 _PyObject_GC_UNTRACK(it);
12735 Py_XDECREF(it->it_seq);
12736 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012737}
12738
12739static int
12740unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 Py_VISIT(it->it_seq);
12743 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012744}
12745
12746static PyObject *
12747unicodeiter_next(unicodeiterobject *it)
12748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012749 PyUnicodeObject *seq;
12750 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012751
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 assert(it != NULL);
12753 seq = it->it_seq;
12754 if (seq == NULL)
12755 return NULL;
12756 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12759 int kind = PyUnicode_KIND(seq);
12760 void *data = PyUnicode_DATA(seq);
12761 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12762 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 if (item != NULL)
12764 ++it->it_index;
12765 return item;
12766 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012767
Benjamin Peterson14339b62009-01-31 16:36:08 +000012768 Py_DECREF(seq);
12769 it->it_seq = NULL;
12770 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012771}
12772
12773static PyObject *
12774unicodeiter_len(unicodeiterobject *it)
12775{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776 Py_ssize_t len = 0;
12777 if (it->it_seq)
12778 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12779 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012780}
12781
12782PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12783
12784static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012788};
12789
12790PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012791 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12792 "str_iterator", /* tp_name */
12793 sizeof(unicodeiterobject), /* tp_basicsize */
12794 0, /* tp_itemsize */
12795 /* methods */
12796 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12797 0, /* tp_print */
12798 0, /* tp_getattr */
12799 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012800 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 0, /* tp_repr */
12802 0, /* tp_as_number */
12803 0, /* tp_as_sequence */
12804 0, /* tp_as_mapping */
12805 0, /* tp_hash */
12806 0, /* tp_call */
12807 0, /* tp_str */
12808 PyObject_GenericGetAttr, /* tp_getattro */
12809 0, /* tp_setattro */
12810 0, /* tp_as_buffer */
12811 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12812 0, /* tp_doc */
12813 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12814 0, /* tp_clear */
12815 0, /* tp_richcompare */
12816 0, /* tp_weaklistoffset */
12817 PyObject_SelfIter, /* tp_iter */
12818 (iternextfunc)unicodeiter_next, /* tp_iternext */
12819 unicodeiter_methods, /* tp_methods */
12820 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012821};
12822
12823static PyObject *
12824unicode_iter(PyObject *seq)
12825{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012827
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 if (!PyUnicode_Check(seq)) {
12829 PyErr_BadInternalCall();
12830 return NULL;
12831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 if (PyUnicode_READY(seq) == -1)
12833 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012834 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12835 if (it == NULL)
12836 return NULL;
12837 it->it_index = 0;
12838 Py_INCREF(seq);
12839 it->it_seq = (PyUnicodeObject *)seq;
12840 _PyObject_GC_TRACK(it);
12841 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012842}
12843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844#define UNIOP(x) Py_UNICODE_##x
12845#define UNIOP_t Py_UNICODE
12846#include "uniops.h"
12847#undef UNIOP
12848#undef UNIOP_t
12849#define UNIOP(x) Py_UCS4_##x
12850#define UNIOP_t Py_UCS4
12851#include "uniops.h"
12852#undef UNIOP
12853#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012854
Victor Stinner71133ff2010-09-01 23:43:53 +000012855Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012856PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012857{
12858 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12859 Py_UNICODE *copy;
12860 Py_ssize_t size;
12861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862 if (!PyUnicode_Check(unicode)) {
12863 PyErr_BadArgument();
12864 return NULL;
12865 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012866 /* Ensure we won't overflow the size. */
12867 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12868 PyErr_NoMemory();
12869 return NULL;
12870 }
12871 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12872 size *= sizeof(Py_UNICODE);
12873 copy = PyMem_Malloc(size);
12874 if (copy == NULL) {
12875 PyErr_NoMemory();
12876 return NULL;
12877 }
12878 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12879 return copy;
12880}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012881
Georg Brandl66c221e2010-10-14 07:04:07 +000012882/* A _string module, to export formatter_parser and formatter_field_name_split
12883 to the string.Formatter class implemented in Python. */
12884
12885static PyMethodDef _string_methods[] = {
12886 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12887 METH_O, PyDoc_STR("split the argument as a field name")},
12888 {"formatter_parser", (PyCFunction) formatter_parser,
12889 METH_O, PyDoc_STR("parse the argument as a format string")},
12890 {NULL, NULL}
12891};
12892
12893static struct PyModuleDef _string_module = {
12894 PyModuleDef_HEAD_INIT,
12895 "_string",
12896 PyDoc_STR("string helper module"),
12897 0,
12898 _string_methods,
12899 NULL,
12900 NULL,
12901 NULL,
12902 NULL
12903};
12904
12905PyMODINIT_FUNC
12906PyInit__string(void)
12907{
12908 return PyModule_Create(&_string_module);
12909}
12910
12911
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012912#ifdef __cplusplus
12913}
12914#endif