blob: 810ac1e9e0fa0feca6e7b444f2c47cf0fed27e9d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Element-wise widening/narrowing copy between character buffers.

   from_type and to_type must be valid type names.  begin and end delimit
   the source range [begin, end) and should be of type "from_type *".
   to points at the destination buffer, which receives one "to_type"
   element per source element (each value is converted with a plain cast).
   Note that the end argument is re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
107
/* Internal field accessors.  Most of them expand to lvalues so that they
   can be used on either side of an assignment. */

/* Pointer to the UTF-8 data: for compact ASCII strings this is the memory
   that immediately follows the PyASCIIObject header, otherwise it is the
   utf8 field of the object. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)

/* Length of the UTF-8 data: the string length for compact ASCII strings,
   otherwise the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Raw struct fields. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Same as the kind/length fields above, but asserting first that op
   really is a unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
137 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000138 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters: a 128-entry
   lookup table indexed by ASCII code, 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Fast detection of ASCII line break characters: a 128-entry lookup table
   indexed by ASCII code, 1 meaning "line break". */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each unicode character (as many as are needed to
   index BLOOM_WIDTH bits) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of usable bits in a BLOOM_MASK; it is the
   largest of 32, 64 or 128 that fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM yields non-zero if the bit for ch is set in mask.  The mask
   argument is parenthesized so that compound expressions such as
   "a | b" are tested as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   used as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* Membership test for chr in the string object str: the bloom mask is
   checked first, and only on a hit is the string actually searched with
   PyUnicode_FindChar().  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
286substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
287
288static PyObject *
289fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
375/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000376 Ux0000 terminated; some code (e.g. new_identifier)
377 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378
379 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381
382*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
/* Number of calls to PyUnicode_New() that went past the empty-string
   shortcut (debugging aid). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the _PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return _PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact/ascii flags and the candidate
   data pointers to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n",
           ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n",
           ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Valid surrogate pairs are combined into a single code point; every
   other code unit (lone surrogates included) is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   The characters in [begin, end) are written to the 4-byte data buffer
   of `unicode`, which must already have the 4-byte kind and exactly the
   resulting length.  This function assumes that unicode can hold one
   more code point than wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* high surrogate followed, within range, by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
Victor Stinnerb1536152011-09-30 02:26:10 +0200629 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
630 PyErr_BadInternalCall();
631 return -1;
632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633
634 if (PyUnicode_READY(from))
635 return -1;
636 if (PyUnicode_READY(to))
637 return -1;
638
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200639 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200640 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
641 PyErr_Format(PyExc_ValueError,
642 "Cannot write %zi characters at %zi "
643 "in a string of %zi characters",
644 how_many, to_start, PyUnicode_GET_LENGTH(to));
645 return -1;
646 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200647 if (how_many == 0)
648 return 0;
649
650 if (Py_REFCNT(to) != 1) {
651 PyErr_SetString(PyExc_ValueError,
652 "Cannot modify a string having more than 1 reference");
653 return -1;
654 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200655 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200660 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661
662 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200666 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 + PyUnicode_KIND_SIZE(from_kind, from_start),
668 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 else if (from_kind == PyUnicode_1BYTE_KIND
671 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200680 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200681 && to_kind == PyUnicode_4BYTE_KIND)
682 {
683 _PyUnicode_CONVERT_BYTES(
684 Py_UCS1, Py_UCS4,
685 PyUnicode_1BYTE_DATA(from) + from_start,
686 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
687 PyUnicode_4BYTE_DATA(to) + to_start
688 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200689 }
690 else if (from_kind == PyUnicode_2BYTE_KIND
691 && to_kind == PyUnicode_4BYTE_KIND)
692 {
693 _PyUnicode_CONVERT_BYTES(
694 Py_UCS2, Py_UCS4,
695 PyUnicode_2BYTE_DATA(from) + from_start,
696 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
697 PyUnicode_4BYTE_DATA(to) + to_start
698 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200699 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200700 else {
701 int invalid_kinds;
702 if (from_kind > to_kind) {
703 /* slow path to check for character overflow */
704 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
705 Py_UCS4 ch, maxchar;
706 Py_ssize_t i;
707
708 maxchar = 0;
709 invalid_kinds = 0;
710 for (i=0; i < how_many; i++) {
711 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
712 if (ch > maxchar) {
713 maxchar = ch;
714 if (maxchar > to_maxchar) {
715 invalid_kinds = 1;
716 break;
717 }
718 }
719 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
720 }
721 }
722 else
723 invalid_kinds = 1;
724 if (invalid_kinds) {
725 PyErr_Format(PyExc_ValueError,
726 "Cannot copy UCS%u characters "
727 "into a string of UCS%u characters",
728 1 << (from_kind - 1),
729 1 << (to_kind -1));
730 return -1;
731 }
732 }
733 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734}
735
Victor Stinner17222162011-09-28 22:15:37 +0200736/* Find the maximum code point and count the number of surrogate pairs so a
737 correct string length can be computed before converting a string to UCS4.
738 This function counts single surrogates as a character and not as a pair.
739
740 Return 0 on success, or -1 on error. */
741static int
742find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
743 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744{
745 const wchar_t *iter;
746
747 if (num_surrogates == NULL || maxchar == NULL) {
748 PyErr_SetString(PyExc_SystemError,
749 "unexpected NULL arguments to "
750 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
751 return -1;
752 }
753
754 *num_surrogates = 0;
755 *maxchar = 0;
756
757 for (iter = begin; iter < end; ) {
758 if (*iter > *maxchar)
759 *maxchar = *iter;
760#if SIZEOF_WCHAR_T == 2
761 if (*iter >= 0xD800 && *iter <= 0xDBFF
762 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
763 {
764 Py_UCS4 surrogate_val;
765 surrogate_val = (((iter[0] & 0x3FF)<<10)
766 | (iter[1] & 0x3FF)) + 0x10000;
767 ++(*num_surrogates);
768 if (surrogate_val > *maxchar)
769 *maxchar = surrogate_val;
770 iter += 2;
771 }
772 else
773 iter++;
774#else
775 iter++;
776#endif
777 }
778 return 0;
779}
780
781#ifdef Py_DEBUG
782int unicode_ready_calls = 0;
783#endif
784
785int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200786_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200788 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789 wchar_t *end;
790 Py_UCS4 maxchar = 0;
791 Py_ssize_t num_surrogates;
792#if SIZEOF_WCHAR_T == 2
793 Py_ssize_t length_wo_surrogates;
794#endif
795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200797 strings were created using _PyObject_New() and where no canonical
798 representation (the str field) has been set yet aka strings
799 which are not yet ready. */
800 assert(PyUnicode_Check(obj));
801 assert(!PyUnicode_IS_READY(obj));
802 assert(!PyUnicode_IS_COMPACT(obj));
803 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200804 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200805 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200807 /* Actually, it should neither be interned nor be anything else: */
808 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
810#ifdef Py_DEBUG
811 ++unicode_ready_calls;
812#endif
813
814 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200815 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200816 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818
819 if (maxchar < 256) {
820 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
821 if (!unicode->data.any) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200825 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826 _PyUnicode_WSTR(unicode), end,
827 PyUnicode_1BYTE_DATA(unicode));
828 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
829 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
830 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
831 if (maxchar < 128) {
832 unicode->_base.utf8 = unicode->data.any;
833 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
834 }
835 else {
836 unicode->_base.utf8 = NULL;
837 unicode->_base.utf8_length = 0;
838 }
839 PyObject_FREE(_PyUnicode_WSTR(unicode));
840 _PyUnicode_WSTR(unicode) = NULL;
841 _PyUnicode_WSTR_LENGTH(unicode) = 0;
842 }
843 /* In this case we might have to convert down from 4-byte native
844 wchar_t to 2-byte unicode. */
845 else if (maxchar < 65536) {
846 assert(num_surrogates == 0 &&
847 "FindMaxCharAndNumSurrogatePairs() messed up");
848
Victor Stinner506f5922011-09-28 22:34:18 +0200849#if SIZEOF_WCHAR_T == 2
850 /* We can share representations and are done. */
851 unicode->data.any = _PyUnicode_WSTR(unicode);
852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
855 unicode->_base.utf8 = NULL;
856 unicode->_base.utf8_length = 0;
857#else
858 /* sizeof(wchar_t) == 4 */
859 unicode->data.any = PyObject_MALLOC(
860 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
861 if (!unicode->data.any) {
862 PyErr_NoMemory();
863 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 }
Victor Stinner506f5922011-09-28 22:34:18 +0200865 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
866 _PyUnicode_WSTR(unicode), end,
867 PyUnicode_2BYTE_DATA(unicode));
868 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
869 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
870 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
871 unicode->_base.utf8 = NULL;
872 unicode->_base.utf8_length = 0;
873 PyObject_FREE(_PyUnicode_WSTR(unicode));
874 _PyUnicode_WSTR(unicode) = NULL;
875 _PyUnicode_WSTR_LENGTH(unicode) = 0;
876#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
878 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
879 else {
880#if SIZEOF_WCHAR_T == 2
881 /* in case the native representation is 2-bytes, we need to allocate a
882 new normalized 4-byte version. */
883 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
884 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
885 if (!unicode->data.any) {
886 PyErr_NoMemory();
887 return -1;
888 }
889 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891 unicode->_base.utf8 = NULL;
892 unicode->_base.utf8_length = 0;
893 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
894 unicode) < 0) {
895 assert(0 && "ConvertWideCharToUCS4 failed");
896 return -1;
897 }
898 PyObject_FREE(_PyUnicode_WSTR(unicode));
899 _PyUnicode_WSTR(unicode) = NULL;
900 _PyUnicode_WSTR_LENGTH(unicode) = 0;
901#else
902 assert(num_surrogates == 0);
903
904 unicode->data.any = _PyUnicode_WSTR(unicode);
905 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
906 unicode->_base.utf8 = NULL;
907 unicode->_base.utf8_length = 0;
908 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
909#endif
910 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
911 }
912 _PyUnicode_STATE(unicode).ready = 1;
913 return 0;
914}
915
Alexander Belopolsky40018472011-02-26 01:02:56 +0000916static void
917unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000918{
Walter Dörwald16807132007-05-25 13:52:07 +0000919 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_NOT_INTERNED:
921 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000922
Benjamin Peterson29060642009-01-31 22:14:21 +0000923 case SSTATE_INTERNED_MORTAL:
924 /* revive dead object temporarily for DelItem */
925 Py_REFCNT(unicode) = 3;
926 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
927 Py_FatalError(
928 "deletion of interned string failed");
929 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 case SSTATE_INTERNED_IMMORTAL:
932 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933
Benjamin Peterson29060642009-01-31 22:14:21 +0000934 default:
935 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000936 }
937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 if (_PyUnicode_WSTR(unicode) &&
939 (!PyUnicode_IS_READY(unicode) ||
940 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
941 PyObject_DEL(_PyUnicode_WSTR(unicode));
942 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
943 PyObject_DEL(unicode->_base.utf8);
944
945 if (PyUnicode_IS_COMPACT(unicode)) {
946 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 }
948 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 if (unicode->data.any)
950 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000951 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 }
953}
954
Alexander Belopolsky40018472011-02-26 01:02:56 +0000955static int
956_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957{
958 register PyUnicodeObject *v;
959
960 /* Argument checks */
961 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000962 PyErr_BadInternalCall();
963 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000964 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000965 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
967 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
971
972 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 possible since these are being shared.
974 The same goes for new-representation unicode objects or objects which
975 have already been readied.
976 For these, we simply return a fresh copy with the same Unicode content.
977 */
978 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
979 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
980 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000981 PyUnicodeObject *w = _PyUnicode_New(length);
982 if (w == NULL)
983 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
985 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 Py_DECREF(*unicode);
987 *unicode = w;
988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000989 }
990
991 /* Note that we don't have to modify *unicode for unshared Unicode
992 objects, since we can modify them in-place. */
993 return unicode_resize(v, length);
994}
995
Alexander Belopolsky40018472011-02-26 01:02:56 +0000996int
997PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000998{
999 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1000}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002static PyObject*
1003get_latin1_char(unsigned char ch)
1004{
1005 PyUnicodeObject *unicode = unicode_latin1[ch];
1006 if (!unicode) {
1007 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1008 if (!unicode)
1009 return NULL;
1010 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1011 unicode_latin1[ch] = unicode;
1012 }
1013 Py_INCREF(unicode);
1014 return (PyObject *)unicode;
1015}
1016
Alexander Belopolsky40018472011-02-26 01:02:56 +00001017PyObject *
1018PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019{
1020 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 Py_UCS4 maxchar = 0;
1022 Py_ssize_t num_surrogates;
1023
1024 if (u == NULL)
1025 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027 /* If the Unicode data is known at construction time, we can apply
1028 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 /* Optimization for empty strings */
1031 if (size == 0 && unicode_empty != NULL) {
1032 Py_INCREF(unicode_empty);
1033 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001034 }
Tim Petersced69f82003-09-16 20:30:58 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Single character Unicode objects in the Latin-1 range are
1037 shared when using this constructor */
1038 if (size == 1 && *u < 256)
1039 return get_latin1_char((unsigned char)*u);
1040
1041 /* If not empty and not single character, copy the Unicode data
1042 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001043 if (find_maxchar_surrogates(u, u + size,
1044 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 return NULL;
1046
1047 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1048 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 if (!unicode)
1050 return NULL;
1051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 switch (PyUnicode_KIND(unicode)) {
1053 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1056 break;
1057 case PyUnicode_2BYTE_KIND:
1058#if Py_UNICODE_SIZE == 2
1059 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1060#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001061 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1063#endif
1064 break;
1065 case PyUnicode_4BYTE_KIND:
1066#if SIZEOF_WCHAR_T == 2
1067 /* This is the only case which has to process surrogates, thus
1068 a simple copy loop is not enough and we need a function. */
1069 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1070 Py_DECREF(unicode);
1071 return NULL;
1072 }
1073#else
1074 assert(num_surrogates == 0);
1075 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1076#endif
1077 break;
1078 default:
1079 assert(0 && "Impossible state");
1080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081
1082 return (PyObject *)unicode;
1083}
1084
Alexander Belopolsky40018472011-02-26 01:02:56 +00001085PyObject *
1086PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001087{
1088 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001089
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 if (size < 0) {
1091 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001093 return NULL;
1094 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001097 some optimizations which share commonly used objects.
1098 Also, this means the input must be UTF-8, so fall back to the
1099 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001100 if (u != NULL) {
1101
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 /* Optimization for empty strings */
1103 if (size == 0 && unicode_empty != NULL) {
1104 Py_INCREF(unicode_empty);
1105 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001107
1108 /* Single characters are shared when using this constructor.
1109 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 if (size == 1 && Py_CHARMASK(*u) < 128)
1111 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112
1113 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 }
1115
Walter Dörwald55507312007-05-18 13:12:10 +00001116 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 if (!unicode)
1118 return NULL;
1119
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 return (PyObject *)unicode;
1121}
1122
Alexander Belopolsky40018472011-02-26 01:02:56 +00001123PyObject *
1124PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001125{
1126 size_t size = strlen(u);
1127 if (size > PY_SSIZE_T_MAX) {
1128 PyErr_SetString(PyExc_OverflowError, "input too long");
1129 return NULL;
1130 }
1131
1132 return PyUnicode_FromStringAndSize(u, size);
1133}
1134
Victor Stinnere57b1c02011-09-28 22:20:48 +02001135static PyObject*
1136_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 PyObject *res;
1139 unsigned char max = 127;
1140 Py_ssize_t i;
1141 for (i = 0; i < size; i++) {
1142 if (u[i] & 0x80) {
1143 max = 255;
1144 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001145 }
1146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 res = PyUnicode_New(size, max);
1148 if (!res)
1149 return NULL;
1150 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1151 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152}
1153
Victor Stinnere57b1c02011-09-28 22:20:48 +02001154static PyObject*
1155_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
1157 PyObject *res;
1158 Py_UCS2 max = 0;
1159 Py_ssize_t i;
1160 for (i = 0; i < size; i++)
1161 if (u[i] > max)
1162 max = u[i];
1163 res = PyUnicode_New(size, max);
1164 if (!res)
1165 return NULL;
1166 if (max >= 256)
1167 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1168 else
1169 for (i = 0; i < size; i++)
1170 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1171 return res;
1172}
1173
Victor Stinnere57b1c02011-09-28 22:20:48 +02001174static PyObject*
1175_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176{
1177 PyObject *res;
1178 Py_UCS4 max = 0;
1179 Py_ssize_t i;
1180 for (i = 0; i < size; i++)
1181 if (u[i] > max)
1182 max = u[i];
1183 res = PyUnicode_New(size, max);
1184 if (!res)
1185 return NULL;
1186 if (max >= 0x10000)
1187 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1188 else {
1189 int kind = PyUnicode_KIND(res);
1190 void *data = PyUnicode_DATA(res);
1191 for (i = 0; i < size; i++)
1192 PyUnicode_WRITE(kind, data, i, u[i]);
1193 }
1194 return res;
1195}
1196
1197PyObject*
1198PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1199{
1200 switch(kind) {
1201 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001204 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001206 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
1208 assert(0);
1209 return NULL;
1210}
1211
Victor Stinner034f6cf2011-09-30 02:26:44 +02001212PyObject*
1213PyUnicode_Copy(PyObject *unicode)
1214{
1215 if (!PyUnicode_Check(unicode)) {
1216 PyErr_BadInternalCall();
1217 return NULL;
1218 }
1219 if (PyUnicode_READY(unicode))
1220 return NULL;
1221 return PyUnicode_FromKindAndData(PyUnicode_KIND(unicode),
1222 PyUnicode_DATA(unicode),
1223 PyUnicode_GET_LENGTH(unicode));
1224}
1225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226
1227/* Widen Unicode objects to larger buffers.
1228 Return NULL if the string is too wide already. */
1229
1230void*
1231_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1232{
1233 Py_ssize_t i;
1234 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1235 void *d = PyUnicode_DATA(s);
1236 unsigned int skind = PyUnicode_KIND(s);
1237 if (PyUnicode_KIND(s) >= kind) {
1238 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1239 return NULL;
1240 }
1241 switch(kind) {
1242 case PyUnicode_2BYTE_KIND: {
1243 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1244 if (!result) {
1245 PyErr_NoMemory();
1246 return 0;
1247 }
1248 for (i = 0; i < len; i++)
1249 result[i] = ((Py_UCS1*)d)[i];
1250 return result;
1251 }
1252 case PyUnicode_4BYTE_KIND: {
1253 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1254 if (!result) {
1255 PyErr_NoMemory();
1256 return 0;
1257 }
1258 for (i = 0; i < len; i++)
1259 result[i] = PyUnicode_READ(skind, d, i);
1260 return result;
1261 }
1262 }
1263 Py_FatalError("invalid kind");
1264 return NULL;
1265}
1266
1267static Py_UCS4*
1268as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1269 int copy_null)
1270{
1271 int kind;
1272 void *data;
1273 Py_ssize_t len, targetlen;
1274 if (PyUnicode_READY(string) == -1)
1275 return NULL;
1276 kind = PyUnicode_KIND(string);
1277 data = PyUnicode_DATA(string);
1278 len = PyUnicode_GET_LENGTH(string);
1279 targetlen = len;
1280 if (copy_null)
1281 targetlen++;
1282 if (!target) {
1283 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1284 PyErr_NoMemory();
1285 return NULL;
1286 }
1287 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1288 if (!target) {
1289 PyErr_NoMemory();
1290 return NULL;
1291 }
1292 }
1293 else {
1294 if (targetsize < targetlen) {
1295 PyErr_Format(PyExc_SystemError,
1296 "string is longer than the buffer");
1297 if (copy_null && 0 < targetsize)
1298 target[0] = 0;
1299 return NULL;
1300 }
1301 }
1302 if (kind != PyUnicode_4BYTE_KIND) {
1303 Py_ssize_t i;
1304 for (i = 0; i < len; i++)
1305 target[i] = PyUnicode_READ(kind, data, i);
1306 }
1307 else
1308 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1309 if (copy_null)
1310 target[len] = 0;
1311 return target;
1312}
1313
1314Py_UCS4*
1315PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1316 int copy_null)
1317{
1318 if (target == NULL || targetsize < 1) {
1319 PyErr_BadInternalCall();
1320 return NULL;
1321 }
1322 return as_ucs4(string, target, targetsize, copy_null);
1323}
1324
1325Py_UCS4*
1326PyUnicode_AsUCS4Copy(PyObject *string)
1327{
1328 return as_ucs4(string, NULL, 0, 1);
1329}
1330
1331#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001332
Alexander Belopolsky40018472011-02-26 01:02:56 +00001333PyObject *
1334PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001337 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001339 PyErr_BadInternalCall();
1340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 }
1342
Martin v. Löwis790465f2008-04-05 20:41:37 +00001343 if (size == -1) {
1344 size = wcslen(w);
1345 }
1346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348}
1349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001351
Walter Dörwald346737f2007-05-31 10:44:43 +00001352static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001353makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1354 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001355{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001356 *fmt++ = '%';
1357 if (width) {
1358 if (zeropad)
1359 *fmt++ = '0';
1360 fmt += sprintf(fmt, "%d", width);
1361 }
1362 if (precision)
1363 fmt += sprintf(fmt, ".%d", precision);
1364 if (longflag)
1365 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001366 else if (longlongflag) {
1367 /* longlongflag should only ever be nonzero on machines with
1368 HAVE_LONG_LONG defined */
1369#ifdef HAVE_LONG_LONG
1370 char *f = PY_FORMAT_LONG_LONG;
1371 while (*f)
1372 *fmt++ = *f++;
1373#else
1374 /* we shouldn't ever get here */
1375 assert(0);
1376 *fmt++ = 'l';
1377#endif
1378 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001379 else if (size_tflag) {
1380 char *f = PY_FORMAT_SIZE_T;
1381 while (*f)
1382 *fmt++ = *f++;
1383 }
1384 *fmt++ = c;
1385 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001386}
1387
/* Helper for PyUnicode_FromFormatV(): parse the width, precision and
   length-modifier part of a conversion specification.

   `f` must point at the '%' that starts the specification.  The values
   found are stored through every p_* argument that is not NULL.  The return
   value points at the conversion character, or, for the malformed inputs
   handled below, at the character just before the offending one. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* A length modifier is only consumed when an integer conversion
       ('d', 'u' or 'i') follows it directly. */
    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1452
/* Upper bound on the number of characters produced by %ld.  21 characters
   are enough for a 64-bit integer written in decimal, including an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1460
Walter Dörwaldd2034312007-05-18 16:29:38 +00001461PyObject *
1462PyUnicode_FromFormatV(const char *format, va_list vargs)
1463{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001464 va_list count;
1465 Py_ssize_t callcount = 0;
1466 PyObject **callresults = NULL;
1467 PyObject **callresult = NULL;
1468 Py_ssize_t n = 0;
1469 int width = 0;
1470 int precision = 0;
1471 int zeropad;
1472 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001474 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001475 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1477 Py_UCS4 argmaxchar;
1478 Py_ssize_t numbersize = 0;
1479 char *numberresults = NULL;
1480 char *numberresult = NULL;
1481 Py_ssize_t i;
1482 int kind;
1483 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001484
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001485 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001486 /* step 1: count the number of %S/%R/%A/%s format specifications
1487 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1488 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 * result in an array)
1490 * also esimate a upper bound for all the number formats in the string,
1491 * numbers will be formated in step 3 and be keept in a '\0'-separated
1492 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001493 for (f = format; *f; f++) {
1494 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001495 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1497 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1498 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1499 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001502#ifdef HAVE_LONG_LONG
1503 if (longlongflag) {
1504 if (width < MAX_LONG_LONG_CHARS)
1505 width = MAX_LONG_LONG_CHARS;
1506 }
1507 else
1508#endif
1509 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1510 including sign. Decimal takes the most space. This
1511 isn't enough for octal. If a width is specified we
1512 need more (which we allocate later). */
1513 if (width < MAX_LONG_CHARS)
1514 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515
1516 /* account for the size + '\0' to separate numbers
1517 inside of the numberresults buffer */
1518 numbersize += (width + 1);
1519 }
1520 }
1521 else if ((unsigned char)*f > 127) {
1522 PyErr_Format(PyExc_ValueError,
1523 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1524 "string, got a non-ASCII byte: 0x%02x",
1525 (unsigned char)*f);
1526 return NULL;
1527 }
1528 }
1529 /* step 2: allocate memory for the results of
1530 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1531 if (callcount) {
1532 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1533 if (!callresults) {
1534 PyErr_NoMemory();
1535 return NULL;
1536 }
1537 callresult = callresults;
1538 }
1539 /* step 2.5: allocate memory for the results of formating numbers */
1540 if (numbersize) {
1541 numberresults = PyObject_Malloc(numbersize);
1542 if (!numberresults) {
1543 PyErr_NoMemory();
1544 goto fail;
1545 }
1546 numberresult = numberresults;
1547 }
1548
1549 /* step 3: format numbers and figure out how large a buffer we need */
1550 for (f = format; *f; f++) {
1551 if (*f == '%') {
1552 const char* p;
1553 int longflag;
1554 int longlongflag;
1555 int size_tflag;
1556 int numprinted;
1557
1558 p = f;
1559 zeropad = (f[1] == '0');
1560 f = parse_format_flags(f, &width, &precision,
1561 &longflag, &longlongflag, &size_tflag);
1562 switch (*f) {
1563 case 'c':
1564 {
1565 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001566 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 n++;
1568 break;
1569 }
1570 case '%':
1571 n++;
1572 break;
1573 case 'i':
1574 case 'd':
1575 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1576 width, precision, *f);
1577 if (longflag)
1578 numprinted = sprintf(numberresult, fmt,
1579 va_arg(count, long));
1580#ifdef HAVE_LONG_LONG
1581 else if (longlongflag)
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, PY_LONG_LONG));
1584#endif
1585 else if (size_tflag)
1586 numprinted = sprintf(numberresult, fmt,
1587 va_arg(count, Py_ssize_t));
1588 else
1589 numprinted = sprintf(numberresult, fmt,
1590 va_arg(count, int));
1591 n += numprinted;
1592 /* advance by +1 to skip over the '\0' */
1593 numberresult += (numprinted + 1);
1594 assert(*(numberresult - 1) == '\0');
1595 assert(*(numberresult - 2) != '\0');
1596 assert(numprinted >= 0);
1597 assert(numberresult <= numberresults + numbersize);
1598 break;
1599 case 'u':
1600 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1601 width, precision, 'u');
1602 if (longflag)
1603 numprinted = sprintf(numberresult, fmt,
1604 va_arg(count, unsigned long));
1605#ifdef HAVE_LONG_LONG
1606 else if (longlongflag)
1607 numprinted = sprintf(numberresult, fmt,
1608 va_arg(count, unsigned PY_LONG_LONG));
1609#endif
1610 else if (size_tflag)
1611 numprinted = sprintf(numberresult, fmt,
1612 va_arg(count, size_t));
1613 else
1614 numprinted = sprintf(numberresult, fmt,
1615 va_arg(count, unsigned int));
1616 n += numprinted;
1617 numberresult += (numprinted + 1);
1618 assert(*(numberresult - 1) == '\0');
1619 assert(*(numberresult - 2) != '\0');
1620 assert(numprinted >= 0);
1621 assert(numberresult <= numberresults + numbersize);
1622 break;
1623 case 'x':
1624 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1625 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1626 n += numprinted;
1627 numberresult += (numprinted + 1);
1628 assert(*(numberresult - 1) == '\0');
1629 assert(*(numberresult - 2) != '\0');
1630 assert(numprinted >= 0);
1631 assert(numberresult <= numberresults + numbersize);
1632 break;
1633 case 'p':
1634 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1635 /* %p is ill-defined: ensure leading 0x. */
1636 if (numberresult[1] == 'X')
1637 numberresult[1] = 'x';
1638 else if (numberresult[1] != 'x') {
1639 memmove(numberresult + 2, numberresult,
1640 strlen(numberresult) + 1);
1641 numberresult[0] = '0';
1642 numberresult[1] = 'x';
1643 numprinted += 2;
1644 }
1645 n += numprinted;
1646 numberresult += (numprinted + 1);
1647 assert(*(numberresult - 1) == '\0');
1648 assert(*(numberresult - 2) != '\0');
1649 assert(numprinted >= 0);
1650 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001651 break;
1652 case 's':
1653 {
1654 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001655 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001656 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1657 if (!str)
1658 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 /* since PyUnicode_DecodeUTF8 returns already flexible
1660 unicode objects, there is no need to call ready on them */
1661 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001662 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001664 /* Remember the str and switch to the next slot */
1665 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001666 break;
1667 }
1668 case 'U':
1669 {
1670 PyObject *obj = va_arg(count, PyObject *);
1671 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (PyUnicode_READY(obj) == -1)
1673 goto fail;
1674 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001675 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001677 break;
1678 }
1679 case 'V':
1680 {
1681 PyObject *obj = va_arg(count, PyObject *);
1682 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001683 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 assert(obj || str);
1685 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001686 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 if (PyUnicode_READY(obj) == -1)
1688 goto fail;
1689 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001690 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001692 *callresult++ = NULL;
1693 }
1694 else {
1695 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1696 if (!str_obj)
1697 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001699 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001701 *callresult++ = str_obj;
1702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001703 break;
1704 }
1705 case 'S':
1706 {
1707 PyObject *obj = va_arg(count, PyObject *);
1708 PyObject *str;
1709 assert(obj);
1710 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001712 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001714 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001716 /* Remember the str and switch to the next slot */
1717 *callresult++ = str;
1718 break;
1719 }
1720 case 'R':
1721 {
1722 PyObject *obj = va_arg(count, PyObject *);
1723 PyObject *repr;
1724 assert(obj);
1725 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001727 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001729 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001731 /* Remember the repr and switch to the next slot */
1732 *callresult++ = repr;
1733 break;
1734 }
1735 case 'A':
1736 {
1737 PyObject *obj = va_arg(count, PyObject *);
1738 PyObject *ascii;
1739 assert(obj);
1740 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001744 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001746 /* Remember the repr and switch to the next slot */
1747 *callresult++ = ascii;
1748 break;
1749 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 default:
1751 /* if we stumble upon an unknown
1752 formatting code, copy the rest of
1753 the format string to the output
1754 string. (we cannot just skip the
1755 code, since there's no way to know
1756 what's in the argument list) */
1757 n += strlen(p);
1758 goto expand;
1759 }
1760 } else
1761 n++;
1762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001763 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001766 we don't have to resize the string.
1767 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 if (!string)
1770 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 kind = PyUnicode_KIND(string);
1772 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001778 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001779
1780 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1782 /* checking for == because the last argument could be a empty
1783 string, which causes i to point to end, the assert at the end of
1784 the loop */
1785 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001786
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 switch (*f) {
1788 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001789 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 const int ordinal = va_arg(vargs, int);
1791 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001793 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001794 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001796 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 case 'p':
1799 /* unused, since we already have the result */
1800 if (*f == 'p')
1801 (void) va_arg(vargs, void *);
1802 else
1803 (void) va_arg(vargs, int);
1804 /* extract the result from numberresults and append. */
1805 for (; *numberresult; ++i, ++numberresult)
1806 PyUnicode_WRITE(kind, data, i, *numberresult);
1807 /* skip over the separating '\0' */
1808 assert(*numberresult == '\0');
1809 numberresult++;
1810 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001811 break;
1812 case 's':
1813 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001814 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001816 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 size = PyUnicode_GET_LENGTH(*callresult);
1818 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001819 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1820 *callresult, 0,
1821 size) < 0)
1822 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001824 /* We're done with the unicode()/repr() => forget it */
1825 Py_DECREF(*callresult);
1826 /* switch to next unicode()/repr() result */
1827 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 break;
1829 }
1830 case 'U':
1831 {
1832 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 Py_ssize_t size;
1834 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1835 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001836 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1837 obj, 0,
1838 size) < 0)
1839 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001841 break;
1842 }
1843 case 'V':
1844 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001847 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001848 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 size = PyUnicode_GET_LENGTH(obj);
1850 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001851 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1852 obj, 0,
1853 size) < 0)
1854 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001856 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 size = PyUnicode_GET_LENGTH(*callresult);
1858 assert(PyUnicode_KIND(*callresult) <=
1859 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001860 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1861 *callresult,
1862 0, size) < 0)
1863 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001865 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001867 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 break;
1869 }
1870 case 'S':
1871 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001872 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 /* unused, since we already have the result */
1875 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001877 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1878 *callresult, 0,
1879 PyUnicode_GET_LENGTH(*callresult)) < 0)
1880 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001882 /* We're done with the unicode()/repr() => forget it */
1883 Py_DECREF(*callresult);
1884 /* switch to next unicode()/repr() result */
1885 ++callresult;
1886 break;
1887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 break;
1891 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 for (; *p; ++p, ++i)
1893 PyUnicode_WRITE(kind, data, i, *p);
1894 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 goto end;
1896 }
Victor Stinner1205f272010-09-11 00:54:47 +00001897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 else {
1899 assert(i < PyUnicode_GET_LENGTH(string));
1900 PyUnicode_WRITE(kind, data, i++, *f);
1901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001904
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 if (callresults)
1907 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 if (numberresults)
1909 PyObject_Free(numberresults);
1910 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 if (callresults) {
1913 PyObject **callresult2 = callresults;
1914 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001915 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001916 ++callresult2;
1917 }
1918 PyObject_Free(callresults);
1919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 if (numberresults)
1921 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001922 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001923}
1924
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925PyObject *
1926PyUnicode_FromFormat(const char *format, ...)
1927{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 PyObject* ret;
1929 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001930
1931#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001932 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001933#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001934 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001935#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001936 ret = PyUnicode_FromFormatV(format, vargs);
1937 va_end(vargs);
1938 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001939}
1940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941#ifdef HAVE_WCHAR_H
1942
Victor Stinner5593d8a2010-10-02 11:11:27 +00001943/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1944 convert a Unicode object to a wide character string.
1945
Victor Stinnerd88d9832011-09-06 02:00:05 +02001946 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001947 character) required to convert the unicode object. Ignore size argument.
1948
Victor Stinnerd88d9832011-09-06 02:00:05 +02001949 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001950 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001951 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001952static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001953unicode_aswidechar(PyUnicodeObject *unicode,
1954 wchar_t *w,
1955 Py_ssize_t size)
1956{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001957 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 const wchar_t *wstr;
1959
1960 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1961 if (wstr == NULL)
1962 return -1;
1963
Victor Stinner5593d8a2010-10-02 11:11:27 +00001964 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001965 if (size > res)
1966 size = res + 1;
1967 else
1968 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001970 return res;
1971 }
1972 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001974}
1975
1976Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001977PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001978 wchar_t *w,
1979 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980{
1981 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001982 PyErr_BadInternalCall();
1983 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001985 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
Victor Stinner137c34c2010-09-29 10:25:54 +00001988wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001989PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001990 Py_ssize_t *size)
1991{
1992 wchar_t* buffer;
1993 Py_ssize_t buflen;
1994
1995 if (unicode == NULL) {
1996 PyErr_BadInternalCall();
1997 return NULL;
1998 }
1999
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002000 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 if (buflen == -1)
2002 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002003 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002004 PyErr_NoMemory();
2005 return NULL;
2006 }
2007
Victor Stinner137c34c2010-09-29 10:25:54 +00002008 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2009 if (buffer == NULL) {
2010 PyErr_NoMemory();
2011 return NULL;
2012 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002013 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 if (buflen == -1)
2015 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002016 if (size != NULL)
2017 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002018 return buffer;
2019}
2020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Alexander Belopolsky40018472011-02-26 01:02:56 +00002023PyObject *
2024PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002027 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002028 PyErr_SetString(PyExc_ValueError,
2029 "chr() arg not in range(0x110000)");
2030 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002031 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (ordinal < 256)
2034 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 v = PyUnicode_New(1, ordinal);
2037 if (v == NULL)
2038 return NULL;
2039 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2040 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002041}
2042
Alexander Belopolsky40018472011-02-26 01:02:56 +00002043PyObject *
2044PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002046 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002048 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 Py_INCREF(obj);
2050 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002051 }
2052 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 /* For a Unicode subtype that's not a Unicode object,
2054 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002055 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002056 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002057 PyErr_Format(PyExc_TypeError,
2058 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002059 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002060 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002065 const char *encoding,
2066 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002067{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002068 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002069 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002070
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002072 PyErr_BadInternalCall();
2073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002075
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002076 /* Decoding bytes objects is the most common case and should be fast */
2077 if (PyBytes_Check(obj)) {
2078 if (PyBytes_GET_SIZE(obj) == 0) {
2079 Py_INCREF(unicode_empty);
2080 v = (PyObject *) unicode_empty;
2081 }
2082 else {
2083 v = PyUnicode_Decode(
2084 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2085 encoding, errors);
2086 }
2087 return v;
2088 }
2089
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002090 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 PyErr_SetString(PyExc_TypeError,
2092 "decoding str is not supported");
2093 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002094 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002095
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002096 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2097 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2098 PyErr_Format(PyExc_TypeError,
2099 "coercing to str: need bytes, bytearray "
2100 "or buffer-like object, %.80s found",
2101 Py_TYPE(obj)->tp_name);
2102 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002103 }
Tim Petersced69f82003-09-16 20:30:58 +00002104
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002105 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002106 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002107 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 }
Tim Petersced69f82003-09-16 20:30:58 +00002109 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002110 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002111
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002112 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002113 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114}
2115
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   become lower case and '_' becomes '-', so that spellings such as UTF_8
   are recognized.  `lower_len` is the capacity of `lower`, terminator
   included.

   Returns 1 on success (the output is null-terminated), or 0 if the
   encoding name is longer than lower_len-1 characters; in that case the
   output is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output, reserved for the terminator. */
    char *const last = &lower[lower_len - 1];

    for (; *src; src++) {
        char ch;

        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2148
Alexander Belopolsky40018472011-02-26 01:02:56 +00002149PyObject *
2150PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002151 Py_ssize_t size,
2152 const char *encoding,
2153 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002154{
2155 PyObject *buffer = NULL, *unicode;
2156 Py_buffer info;
2157 char lower[11]; /* Enough for any encoding shortcut */
2158
2159 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002160 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002161
2162 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002163 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002164 if ((strcmp(lower, "utf-8") == 0) ||
2165 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002166 return PyUnicode_DecodeUTF8(s, size, errors);
2167 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002168 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002169 (strcmp(lower, "iso-8859-1") == 0))
2170 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002171#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002172 else if (strcmp(lower, "mbcs") == 0)
2173 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002174#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002175 else if (strcmp(lower, "ascii") == 0)
2176 return PyUnicode_DecodeASCII(s, size, errors);
2177 else if (strcmp(lower, "utf-16") == 0)
2178 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2179 else if (strcmp(lower, "utf-32") == 0)
2180 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182
2183 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002184 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002185 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002186 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002187 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 if (buffer == NULL)
2189 goto onError;
2190 unicode = PyCodec_Decode(buffer, encoding, errors);
2191 if (unicode == NULL)
2192 goto onError;
2193 if (!PyUnicode_Check(unicode)) {
2194 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002195 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002196 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 Py_DECREF(unicode);
2198 goto onError;
2199 }
2200 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 if (PyUnicode_READY(unicode)) {
2202 Py_DECREF(unicode);
2203 return NULL;
2204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002206
Benjamin Peterson29060642009-01-31 22:14:21 +00002207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208 Py_XDECREF(buffer);
2209 return NULL;
2210}
2211
Alexander Belopolsky40018472011-02-26 01:02:56 +00002212PyObject *
2213PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002214 const char *encoding,
2215 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002216{
2217 PyObject *v;
2218
2219 if (!PyUnicode_Check(unicode)) {
2220 PyErr_BadArgument();
2221 goto onError;
2222 }
2223
2224 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002225 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002226
2227 /* Decode via the codec registry */
2228 v = PyCodec_Decode(unicode, encoding, errors);
2229 if (v == NULL)
2230 goto onError;
2231 return v;
2232
Benjamin Peterson29060642009-01-31 22:14:21 +00002233 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002234 return NULL;
2235}
2236
Alexander Belopolsky40018472011-02-26 01:02:56 +00002237PyObject *
2238PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002239 const char *encoding,
2240 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002241{
2242 PyObject *v;
2243
2244 if (!PyUnicode_Check(unicode)) {
2245 PyErr_BadArgument();
2246 goto onError;
2247 }
2248
2249 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002250 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002251
2252 /* Decode via the codec registry */
2253 v = PyCodec_Decode(unicode, encoding, errors);
2254 if (v == NULL)
2255 goto onError;
2256 if (!PyUnicode_Check(v)) {
2257 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002258 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002259 Py_TYPE(v)->tp_name);
2260 Py_DECREF(v);
2261 goto onError;
2262 }
2263 return v;
2264
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002266 return NULL;
2267}
2268
Alexander Belopolsky40018472011-02-26 01:02:56 +00002269PyObject *
2270PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002271 Py_ssize_t size,
2272 const char *encoding,
2273 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274{
2275 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002276
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 unicode = PyUnicode_FromUnicode(s, size);
2278 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2281 Py_DECREF(unicode);
2282 return v;
2283}
2284
Alexander Belopolsky40018472011-02-26 01:02:56 +00002285PyObject *
2286PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002287 const char *encoding,
2288 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002289{
2290 PyObject *v;
2291
2292 if (!PyUnicode_Check(unicode)) {
2293 PyErr_BadArgument();
2294 goto onError;
2295 }
2296
2297 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002299
2300 /* Encode via the codec registry */
2301 v = PyCodec_Encode(unicode, encoding, errors);
2302 if (v == NULL)
2303 goto onError;
2304 return v;
2305
Benjamin Peterson29060642009-01-31 22:14:21 +00002306 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002307 return NULL;
2308}
2309
Victor Stinnerad158722010-10-27 00:25:46 +00002310PyObject *
2311PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002312{
Victor Stinner99b95382011-07-04 14:23:54 +02002313#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002314 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2315 PyUnicode_GET_SIZE(unicode),
2316 NULL);
2317#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002319#else
Victor Stinner793b5312011-04-27 00:24:21 +02002320 PyInterpreterState *interp = PyThreadState_GET()->interp;
2321 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2322 cannot use it to encode and decode filenames before it is loaded. Load
2323 the Python codec requires to encode at least its own filename. Use the C
2324 version of the locale codec until the codec registry is initialized and
2325 the Python codec is loaded.
2326
2327 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2328 cannot only rely on it: check also interp->fscodec_initialized for
2329 subinterpreters. */
2330 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002331 return PyUnicode_AsEncodedString(unicode,
2332 Py_FileSystemDefaultEncoding,
2333 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002334 }
2335 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002336 /* locale encoding with surrogateescape */
2337 wchar_t *wchar;
2338 char *bytes;
2339 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002340 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002341
2342 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2343 if (wchar == NULL)
2344 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002345 bytes = _Py_wchar2char(wchar, &error_pos);
2346 if (bytes == NULL) {
2347 if (error_pos != (size_t)-1) {
2348 char *errmsg = strerror(errno);
2349 PyObject *exc = NULL;
2350 if (errmsg == NULL)
2351 errmsg = "Py_wchar2char() failed";
2352 raise_encode_exception(&exc,
2353 "filesystemencoding",
2354 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2355 error_pos, error_pos+1,
2356 errmsg);
2357 Py_XDECREF(exc);
2358 }
2359 else
2360 PyErr_NoMemory();
2361 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002362 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002363 }
2364 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002365
2366 bytes_obj = PyBytes_FromString(bytes);
2367 PyMem_Free(bytes);
2368 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002369 }
Victor Stinnerad158722010-10-27 00:25:46 +00002370#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002371}
2372
Alexander Belopolsky40018472011-02-26 01:02:56 +00002373PyObject *
2374PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002375 const char *encoding,
2376 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377{
2378 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002379 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002380
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381 if (!PyUnicode_Check(unicode)) {
2382 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 }
Fred Drakee4315f52000-05-09 19:53:39 +00002385
Victor Stinner2f283c22011-03-02 01:21:46 +00002386 if (encoding == NULL) {
2387 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002389 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002391 }
Fred Drakee4315f52000-05-09 19:53:39 +00002392
2393 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002394 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002395 if ((strcmp(lower, "utf-8") == 0) ||
2396 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002397 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002398 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002400 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002401 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002402 }
Victor Stinner37296e82010-06-10 13:36:23 +00002403 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002404 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002405 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002407#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002408 else if (strcmp(lower, "mbcs") == 0)
2409 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2410 PyUnicode_GET_SIZE(unicode),
2411 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002412#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002413 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416
2417 /* Encode via the codec registry */
2418 v = PyCodec_Encode(unicode, encoding, errors);
2419 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002420 return NULL;
2421
2422 /* The normal path */
2423 if (PyBytes_Check(v))
2424 return v;
2425
2426 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002427 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002428 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002429 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002430
2431 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2432 "encoder %s returned bytearray instead of bytes",
2433 encoding);
2434 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002435 Py_DECREF(v);
2436 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002437 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002438
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002439 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2440 Py_DECREF(v);
2441 return b;
2442 }
2443
2444 PyErr_Format(PyExc_TypeError,
2445 "encoder did not return a bytes object (type=%.400s)",
2446 Py_TYPE(v)->tp_name);
2447 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002448 return NULL;
2449}
2450
Alexander Belopolsky40018472011-02-26 01:02:56 +00002451PyObject *
2452PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002453 const char *encoding,
2454 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002455{
2456 PyObject *v;
2457
2458 if (!PyUnicode_Check(unicode)) {
2459 PyErr_BadArgument();
2460 goto onError;
2461 }
2462
2463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002465
2466 /* Encode via the codec registry */
2467 v = PyCodec_Encode(unicode, encoding, errors);
2468 if (v == NULL)
2469 goto onError;
2470 if (!PyUnicode_Check(v)) {
2471 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002472 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002473 Py_TYPE(v)->tp_name);
2474 Py_DECREF(v);
2475 goto onError;
2476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002478
Benjamin Peterson29060642009-01-31 22:14:21 +00002479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 return NULL;
2481}
2482
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002483PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002484PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002485 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002486 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2487}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002488
Christian Heimes5894ba72007-11-04 11:43:14 +00002489PyObject*
2490PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2491{
Victor Stinner99b95382011-07-04 14:23:54 +02002492#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002493 return PyUnicode_DecodeMBCS(s, size, NULL);
2494#elif defined(__APPLE__)
2495 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2496#else
Victor Stinner793b5312011-04-27 00:24:21 +02002497 PyInterpreterState *interp = PyThreadState_GET()->interp;
2498 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2499 cannot use it to encode and decode filenames before it is loaded. Load
2500 the Python codec requires to encode at least its own filename. Use the C
2501 version of the locale codec until the codec registry is initialized and
2502 the Python codec is loaded.
2503
2504 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2505 cannot only rely on it: check also interp->fscodec_initialized for
2506 subinterpreters. */
2507 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002508 return PyUnicode_Decode(s, size,
2509 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002510 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002511 }
2512 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002513 /* locale encoding with surrogateescape */
2514 wchar_t *wchar;
2515 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002516 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002517
2518 if (s[size] != '\0' || size != strlen(s)) {
2519 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2520 return NULL;
2521 }
2522
Victor Stinner168e1172010-10-16 23:16:16 +00002523 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002524 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002525 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002526
Victor Stinner168e1172010-10-16 23:16:16 +00002527 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002528 PyMem_Free(wchar);
2529 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002530 }
Victor Stinnerad158722010-10-27 00:25:46 +00002531#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002532}
2533
Martin v. Löwis011e8422009-05-05 04:43:17 +00002534
2535int
2536PyUnicode_FSConverter(PyObject* arg, void* addr)
2537{
2538 PyObject *output = NULL;
2539 Py_ssize_t size;
2540 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002541 if (arg == NULL) {
2542 Py_DECREF(*(PyObject**)addr);
2543 return 1;
2544 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002545 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002546 output = arg;
2547 Py_INCREF(output);
2548 }
2549 else {
2550 arg = PyUnicode_FromObject(arg);
2551 if (!arg)
2552 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002553 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002554 Py_DECREF(arg);
2555 if (!output)
2556 return 0;
2557 if (!PyBytes_Check(output)) {
2558 Py_DECREF(output);
2559 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2560 return 0;
2561 }
2562 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002563 size = PyBytes_GET_SIZE(output);
2564 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002565 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002566 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002567 Py_DECREF(output);
2568 return 0;
2569 }
2570 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002571 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002572}
2573
2574
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002575int
2576PyUnicode_FSDecoder(PyObject* arg, void* addr)
2577{
2578 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002579 if (arg == NULL) {
2580 Py_DECREF(*(PyObject**)addr);
2581 return 1;
2582 }
2583 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 if (PyUnicode_READY(arg))
2585 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002586 output = arg;
2587 Py_INCREF(output);
2588 }
2589 else {
2590 arg = PyBytes_FromObject(arg);
2591 if (!arg)
2592 return 0;
2593 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2594 PyBytes_GET_SIZE(arg));
2595 Py_DECREF(arg);
2596 if (!output)
2597 return 0;
2598 if (!PyUnicode_Check(output)) {
2599 Py_DECREF(output);
2600 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2601 return 0;
2602 }
2603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2605 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002606 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2607 Py_DECREF(output);
2608 return 0;
2609 }
2610 *(PyObject**)addr = output;
2611 return Py_CLEANUP_SUPPORTED;
2612}
2613
2614
Martin v. Löwis5b222132007-06-10 09:51:05 +00002615char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002617{
Christian Heimesf3863112007-11-22 07:46:41 +00002618 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2620
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002621 if (!PyUnicode_Check(unicode)) {
2622 PyErr_BadArgument();
2623 return NULL;
2624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627
2628 if (_PyUnicode_UTF8(unicode) == NULL) {
2629 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2630 if (bytes == NULL)
2631 return NULL;
2632 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2633 if (u->_base.utf8 == NULL) {
2634 Py_DECREF(bytes);
2635 return NULL;
2636 }
2637 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2638 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2639 Py_DECREF(bytes);
2640 }
2641
2642 if (psize)
2643 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2644 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002645}
2646
2647char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2651}
2652
2653#ifdef Py_DEBUG
2654int unicode_as_unicode_calls = 0;
2655#endif
2656
2657
2658Py_UNICODE *
2659PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2660{
2661 PyUnicodeObject *u;
2662 const unsigned char *one_byte;
2663#if SIZEOF_WCHAR_T == 4
2664 const Py_UCS2 *two_bytes;
2665#else
2666 const Py_UCS4 *four_bytes;
2667 const Py_UCS4 *ucs4_end;
2668 Py_ssize_t num_surrogates;
2669#endif
2670 wchar_t *w;
2671 wchar_t *wchar_end;
2672
2673 if (!PyUnicode_Check(unicode)) {
2674 PyErr_BadArgument();
2675 return NULL;
2676 }
2677 u = (PyUnicodeObject*)unicode;
2678 if (_PyUnicode_WSTR(u) == NULL) {
2679 /* Non-ASCII compact unicode object */
2680 assert(_PyUnicode_KIND(u) != 0);
2681 assert(PyUnicode_IS_READY(u));
2682
2683#ifdef Py_DEBUG
2684 ++unicode_as_unicode_calls;
2685#endif
2686
2687 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2688#if SIZEOF_WCHAR_T == 2
2689 four_bytes = PyUnicode_4BYTE_DATA(u);
2690 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2691 num_surrogates = 0;
2692
2693 for (; four_bytes < ucs4_end; ++four_bytes) {
2694 if (*four_bytes > 0xFFFF)
2695 ++num_surrogates;
2696 }
2697
2698 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
2699 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
2700 if (!_PyUnicode_WSTR(u)) {
2701 PyErr_NoMemory();
2702 return NULL;
2703 }
2704 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
2705
2706 w = _PyUnicode_WSTR(u);
2707 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
2708 four_bytes = PyUnicode_4BYTE_DATA(u);
2709 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
2710 if (*four_bytes > 0xFFFF) {
2711 /* encode surrogate pair in this case */
2712 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
2713 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
2714 }
2715 else
2716 *w = *four_bytes;
2717
2718 if (w > wchar_end) {
2719 assert(0 && "Miscalculated string end");
2720 }
2721 }
2722 *w = 0;
2723#else
2724 /* sizeof(wchar_t) == 4 */
2725 Py_FatalError("Impossible unicode object state, wstr and str "
2726 "should share memory already.");
2727 return NULL;
2728#endif
2729 }
2730 else {
2731 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
2732 (_PyUnicode_LENGTH(u) + 1));
2733 if (!_PyUnicode_WSTR(u)) {
2734 PyErr_NoMemory();
2735 return NULL;
2736 }
2737 if (!PyUnicode_IS_COMPACT_ASCII(u))
2738 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
2739 w = _PyUnicode_WSTR(u);
2740 wchar_end = w + _PyUnicode_LENGTH(u);
2741
2742 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
2743 one_byte = PyUnicode_1BYTE_DATA(u);
2744 for (; w < wchar_end; ++one_byte, ++w)
2745 *w = *one_byte;
2746 /* null-terminate the wstr */
2747 *w = 0;
2748 }
2749 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
2750#if SIZEOF_WCHAR_T == 4
2751 two_bytes = PyUnicode_2BYTE_DATA(u);
2752 for (; w < wchar_end; ++two_bytes, ++w)
2753 *w = *two_bytes;
2754 /* null-terminate the wstr */
2755 *w = 0;
2756#else
2757 /* sizeof(wchar_t) == 2 */
2758 PyObject_FREE(_PyUnicode_WSTR(u));
2759 _PyUnicode_WSTR(u) = NULL;
2760 Py_FatalError("Impossible unicode object state, wstr "
2761 "and str should share memory already.");
2762 return NULL;
2763#endif
2764 }
2765 else {
2766 assert(0 && "This should never happen.");
2767 }
2768 }
2769 }
2770 if (size != NULL)
2771 *size = PyUnicode_WSTR_LENGTH(u);
2772 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00002773}
2774
Alexander Belopolsky40018472011-02-26 01:02:56 +00002775Py_UNICODE *
2776PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779}
2780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781
Alexander Belopolsky40018472011-02-26 01:02:56 +00002782Py_ssize_t
2783PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784{
2785 if (!PyUnicode_Check(unicode)) {
2786 PyErr_BadArgument();
2787 goto onError;
2788 }
2789 return PyUnicode_GET_SIZE(unicode);
2790
Benjamin Peterson29060642009-01-31 22:14:21 +00002791 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return -1;
2793}
2794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795Py_ssize_t
2796PyUnicode_GetLength(PyObject *unicode)
2797{
2798 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2799 PyErr_BadArgument();
2800 return -1;
2801 }
2802
2803 return PyUnicode_GET_LENGTH(unicode);
2804}
2805
2806Py_UCS4
2807PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2808{
2809 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2810 return PyErr_BadArgument();
2811 return (Py_UCS4)-1;
2812 }
2813 return PyUnicode_READ_CHAR(unicode, index);
2814}
2815
2816int
2817PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2818{
2819 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2820 return PyErr_BadArgument();
2821 return -1;
2822 }
2823
2824 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2825 index, ch);
2826 return 0;
2827}
2828
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2834
Victor Stinner554f3f02010-06-16 23:33:54 +00002835/* create or adjust a UnicodeDecodeError */
2836static void
2837make_decode_exception(PyObject **exceptionObject,
2838 const char *encoding,
2839 const char *input, Py_ssize_t length,
2840 Py_ssize_t startpos, Py_ssize_t endpos,
2841 const char *reason)
2842{
2843 if (*exceptionObject == NULL) {
2844 *exceptionObject = PyUnicodeDecodeError_Create(
2845 encoding, input, length, startpos, endpos, reason);
2846 }
2847 else {
2848 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2849 goto onError;
2850 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2851 goto onError;
2852 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2853 goto onError;
2854 }
2855 return;
2856
2857onError:
2858 Py_DECREF(*exceptionObject);
2859 *exceptionObject = NULL;
2860}
2861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002862/* error handling callback helper:
2863 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002864 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 and adjust various state variables.
2866 return 0 on success, -1 on error
2867*/
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869static int
2870unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding, const char *reason,
2872 const char **input, const char **inend, Py_ssize_t *startinpos,
2873 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2874 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002875{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002876 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002877
2878 PyObject *restuple = NULL;
2879 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002880 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002881 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002882 Py_ssize_t requiredsize;
2883 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002884 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002885 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002886 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002887 int res = -1;
2888
2889 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 *errorHandler = PyCodec_LookupError(errors);
2891 if (*errorHandler == NULL)
2892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002893 }
2894
Victor Stinner554f3f02010-06-16 23:33:54 +00002895 make_decode_exception(exceptionObject,
2896 encoding,
2897 *input, *inend - *input,
2898 *startinpos, *endinpos,
2899 reason);
2900 if (*exceptionObject == NULL)
2901 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002902
2903 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2904 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002907 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 }
2910 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002912
2913 /* Copy back the bytes variables, which might have been modified by the
2914 callback */
2915 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2916 if (!inputobj)
2917 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002918 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002920 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002921 *input = PyBytes_AS_STRING(inputobj);
2922 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002923 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002924 /* we can DECREF safely, as the exception has another reference,
2925 so the object won't go away. */
2926 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002930 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2932 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002933 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002934
2935 /* need more space? (at least enough for what we
2936 have+the replacement+the rest of the string (starting
2937 at the new input position), so we won't have to check space
2938 when there are no errors in the rest of the string) */
2939 repptr = PyUnicode_AS_UNICODE(repunicode);
2940 repsize = PyUnicode_GET_SIZE(repunicode);
2941 requiredsize = *outpos + repsize + insize-newpos;
2942 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 if (requiredsize<2*outsize)
2944 requiredsize = 2*outsize;
2945 if (_PyUnicode_Resize(output, requiredsize) < 0)
2946 goto onError;
2947 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 }
2949 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002950 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 Py_UNICODE_COPY(*outptr, repptr, repsize);
2952 *outptr += repsize;
2953 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002955 /* we made it! */
2956 res = 0;
2957
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 Py_XDECREF(restuple);
2960 return res;
2961}
2962
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002963/* --- UTF-7 Codec -------------------------------------------------------- */
2964
Antoine Pitrou244651a2009-05-04 18:56:13 +00002965/* See RFC2152 for details. We encode conservatively and decode liberally. */
2966
2967/* Three simple macros defining base-64. */
2968
2969/* Is c a base-64 character? */
2970
2971#define IS_BASE64(c) \
2972 (((c) >= 'A' && (c) <= 'Z') || \
2973 ((c) >= 'a' && (c) <= 'z') || \
2974 ((c) >= '0' && (c) <= '9') || \
2975 (c) == '+' || (c) == '/')
2976
2977/* given that c is a base-64 character, what is its base-64 value? */
2978
2979#define FROM_BASE64(c) \
2980 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
2981 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
2982 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
2983 (c) == '+' ? 62 : 63)
2984
2985/* What is the base-64 character of the bottom 6 bits of n? */
2986
2987#define TO_BASE64(n) \
2988 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2989
2990/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
2991 * decoded as itself. We are permissive on decoding; the only ASCII
2992 * byte not decoding to itself is the + which begins a base64
2993 * string. */
2994
2995#define DECODE_DIRECT(c) \
2996 ((c) <= 127 && (c) != '+')
2997
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only 1..127 can ever be direct (NUL and
 *            anything >= 128 are rejected before the table lookup).
 * directO  : non-zero if category-1 ("Set O") characters are direct.
 * directWS : non-zero if category-2 (whitespace) characters are direct.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their intended meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))

Alexander Belopolsky40018472011-02-26 01:02:56 +00003044PyObject *
3045PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003046 Py_ssize_t size,
3047 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003048{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003049 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3050}
3051
Antoine Pitrou244651a2009-05-04 18:56:13 +00003052/* The decoder. The only state we preserve is our read position,
3053 * i.e. how many characters we have consumed. So if we end in the
3054 * middle of a shift sequence we have to back off the read position
3055 * and the output to the beginning of the sequence, otherwise we lose
3056 * all the shift state (seen bits, number of bits seen, high
3057 * surrogate). */
3058
Alexander Belopolsky40018472011-02-26 01:02:56 +00003059PyObject *
3060PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003061 Py_ssize_t size,
3062 const char *errors,
3063 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003066 Py_ssize_t startinpos;
3067 Py_ssize_t endinpos;
3068 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003069 const char *e;
3070 PyUnicodeObject *unicode;
3071 Py_UNICODE *p;
3072 const char *errmsg = "";
3073 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003074 Py_UNICODE *shiftOutStart;
3075 unsigned int base64bits = 0;
3076 unsigned long base64buffer = 0;
3077 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 PyObject *errorHandler = NULL;
3079 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003080
3081 unicode = _PyUnicode_New(size);
3082 if (!unicode)
3083 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003084 if (size == 0) {
3085 if (consumed)
3086 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003087 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003088 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003091 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003092 e = s + size;
3093
3094 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003097 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003098
Antoine Pitrou244651a2009-05-04 18:56:13 +00003099 if (inShift) { /* in a base-64 section */
3100 if (IS_BASE64(ch)) { /* consume a base-64 character */
3101 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3102 base64bits += 6;
3103 s++;
3104 if (base64bits >= 16) {
3105 /* we have enough bits for a UTF-16 value */
3106 Py_UNICODE outCh = (Py_UNICODE)
3107 (base64buffer >> (base64bits-16));
3108 base64bits -= 16;
3109 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3110 if (surrogate) {
3111 /* expecting a second surrogate */
3112 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3113#ifdef Py_UNICODE_WIDE
3114 *p++ = (((surrogate & 0x3FF)<<10)
3115 | (outCh & 0x3FF)) + 0x10000;
3116#else
3117 *p++ = surrogate;
3118 *p++ = outCh;
3119#endif
3120 surrogate = 0;
3121 }
3122 else {
3123 surrogate = 0;
3124 errmsg = "second surrogate missing";
3125 goto utf7Error;
3126 }
3127 }
3128 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3129 /* first surrogate */
3130 surrogate = outCh;
3131 }
3132 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3133 errmsg = "unexpected second surrogate";
3134 goto utf7Error;
3135 }
3136 else {
3137 *p++ = outCh;
3138 }
3139 }
3140 }
3141 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003142 inShift = 0;
3143 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003144 if (surrogate) {
3145 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003146 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003148 if (base64bits > 0) { /* left-over bits */
3149 if (base64bits >= 6) {
3150 /* We've seen at least one base-64 character */
3151 errmsg = "partial character in shift sequence";
3152 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003153 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003154 else {
3155 /* Some bits remain; they should be zero */
3156 if (base64buffer != 0) {
3157 errmsg = "non-zero padding bits in shift sequence";
3158 goto utf7Error;
3159 }
3160 }
3161 }
3162 if (ch != '-') {
3163 /* '-' is absorbed; other terminating
3164 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003165 *p++ = ch;
3166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003167 }
3168 }
3169 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003171 s++; /* consume '+' */
3172 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003173 s++;
3174 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003175 }
3176 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003177 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003178 shiftOutStart = p;
3179 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003180 }
3181 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003182 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003183 *p++ = ch;
3184 s++;
3185 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003186 else {
3187 startinpos = s-starts;
3188 s++;
3189 errmsg = "unexpected special character";
3190 goto utf7Error;
3191 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003192 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003193utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194 outpos = p-PyUnicode_AS_UNICODE(unicode);
3195 endinpos = s-starts;
3196 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 errors, &errorHandler,
3198 "utf7", errmsg,
3199 &starts, &e, &startinpos, &endinpos, &exc, &s,
3200 &unicode, &outpos, &p))
3201 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003202 }
3203
Antoine Pitrou244651a2009-05-04 18:56:13 +00003204 /* end of string */
3205
3206 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3207 /* if we're in an inconsistent state, that's an error */
3208 if (surrogate ||
3209 (base64bits >= 6) ||
3210 (base64bits > 0 && base64buffer != 0)) {
3211 outpos = p-PyUnicode_AS_UNICODE(unicode);
3212 endinpos = size;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "utf7", "unterminated shift sequence",
3216 &starts, &e, &startinpos, &endinpos, &exc, &s,
3217 &unicode, &outpos, &p))
3218 goto onError;
3219 if (s < e)
3220 goto restart;
3221 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003222 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003223
3224 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003225 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003226 if (inShift) {
3227 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003228 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003229 }
3230 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003231 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003232 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003233 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003234
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003235 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003236 goto onError;
3237
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 Py_XDECREF(errorHandler);
3239 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003240 if (PyUnicode_READY(unicode) == -1) {
3241 Py_DECREF(unicode);
3242 return NULL;
3243 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003244 return (PyObject *)unicode;
3245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003249 Py_DECREF(unicode);
3250 return NULL;
3251}
3252
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 Py_ssize_t size,
3257 int base64SetO,
3258 int base64WhiteSpace,
3259 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003260{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003261 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003263 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003264 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003266 unsigned int base64bits = 0;
3267 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003268 char * out;
3269 char * start;
3270
3271 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003273
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003274 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003275 return PyErr_NoMemory();
3276
Antoine Pitrou244651a2009-05-04 18:56:13 +00003277 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003278 if (v == NULL)
3279 return NULL;
3280
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003281 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003282 for (;i < size; ++i) {
3283 Py_UNICODE ch = s[i];
3284
Antoine Pitrou244651a2009-05-04 18:56:13 +00003285 if (inShift) {
3286 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3287 /* shifting out */
3288 if (base64bits) { /* output remaining bits */
3289 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3290 base64buffer = 0;
3291 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003292 }
3293 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003294 /* Characters not in the BASE64 set implicitly unshift the sequence
3295 so no '-' is required, except if the character is itself a '-' */
3296 if (IS_BASE64(ch) || ch == '-') {
3297 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003299 *out++ = (char) ch;
3300 }
3301 else {
3302 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003303 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003304 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003305 else { /* not in a shift sequence */
3306 if (ch == '+') {
3307 *out++ = '+';
3308 *out++ = '-';
3309 }
3310 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3311 *out++ = (char) ch;
3312 }
3313 else {
3314 *out++ = '+';
3315 inShift = 1;
3316 goto encode_char;
3317 }
3318 }
3319 continue;
3320encode_char:
3321#ifdef Py_UNICODE_WIDE
3322 if (ch >= 0x10000) {
3323 /* code first surrogate */
3324 base64bits += 16;
3325 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3326 while (base64bits >= 6) {
3327 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3328 base64bits -= 6;
3329 }
3330 /* prepare second surrogate */
3331 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3332 }
3333#endif
3334 base64bits += 16;
3335 base64buffer = (base64buffer << 16) | ch;
3336 while (base64bits >= 6) {
3337 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3338 base64bits -= 6;
3339 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003340 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003341 if (base64bits)
3342 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3343 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003344 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003345 if (_PyBytes_Resize(&v, out - start) < 0)
3346 return NULL;
3347 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003348}
3349
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, so it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};

Alexander Belopolsky40018472011-02-26 01:02:56 +00003380PyObject *
3381PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003382 Py_ssize_t size,
3383 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384{
Walter Dörwald69652032004-09-07 20:24:22 +00003385 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3386}
3387
Antoine Pitrouab868312009-01-10 15:40:25 +00003388/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3389#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3390
3391/* Mask to quickly check whether a C 'long' contains a
3392 non-ASCII, UTF8-encoded char. */
3393#if (SIZEOF_LONG == 8)
3394# define ASCII_CHAR_MASK 0x8080808080808080L
3395#elif (SIZEOF_LONG == 4)
3396# define ASCII_CHAR_MASK 0x80808080L
3397#else
3398# error C 'long' size should be either 4 or 8!
3399#endif
3400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401/* Scans a UTF-8 string and returns the maximum character to be expected,
3402 the size of the decoded unicode string and if any major errors were
3403 encountered.
3404
3405 This function does check basic UTF-8 sanity, it does however NOT CHECK
3406 if the string contains surrogates, and if all continuation bytes are
3407 within the correct ranges, these checks are performed in
3408 PyUnicode_DecodeUTF8Stateful.
3409
3410 If it sets has_errors to 1, it means the value of unicode_size and max_char
3411 will be bogus and you should not rely on useful information in them.
3412 */
3413static Py_UCS4
3414utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3415 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3416 int *has_errors)
3417{
3418 Py_ssize_t n;
3419 Py_ssize_t char_count = 0;
3420 Py_UCS4 max_char = 127, new_max;
3421 Py_UCS4 upper_bound;
3422 const unsigned char *p = (const unsigned char *)s;
3423 const unsigned char *end = p + string_size;
3424 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3425 int err = 0;
3426
3427 for (; p < end && !err; ++p, ++char_count) {
3428 /* Only check value if it's not a ASCII char... */
3429 if (*p < 0x80) {
3430 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3431 an explanation. */
3432 if (!((size_t) p & LONG_PTR_MASK)) {
3433 /* Help register allocation */
3434 register const unsigned char *_p = p;
3435 while (_p < aligned_end) {
3436 unsigned long value = *(unsigned long *) _p;
3437 if (value & ASCII_CHAR_MASK)
3438 break;
3439 _p += SIZEOF_LONG;
3440 char_count += SIZEOF_LONG;
3441 }
3442 p = _p;
3443 if (p == end)
3444 break;
3445 }
3446 }
3447 if (*p >= 0x80) {
3448 n = utf8_code_length[*p];
3449 new_max = max_char;
3450 switch (n) {
3451 /* invalid start byte */
3452 case 0:
3453 err = 1;
3454 break;
3455 case 2:
3456 /* Code points between 0x00FF and 0x07FF inclusive.
3457 Approximate the upper bound of the code point,
3458 if this flips over 255 we can be sure it will be more
3459 than 255 and the string will need 2 bytes per code coint,
3460 if it stays under or equal to 255, we can be sure 1 byte
3461 is enough.
3462 ((*p & 0b00011111) << 6) | 0b00111111 */
3463 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3464 if (max_char < upper_bound)
3465 new_max = upper_bound;
3466 /* Ensure we track at least that we left ASCII space. */
3467 if (new_max < 128)
3468 new_max = 128;
3469 break;
3470 case 3:
3471 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3472 always > 255 and <= 65535 and will always need 2 bytes. */
3473 if (max_char < 65535)
3474 new_max = 65535;
3475 break;
3476 case 4:
3477 /* Code point will be above 0xFFFF for sure in this case. */
3478 new_max = 65537;
3479 break;
3480 /* Internal error, this should be caught by the first if */
3481 case 1:
3482 default:
3483 assert(0 && "Impossible case in utf8_max_char_and_size");
3484 err = 1;
3485 }
3486 /* Instead of number of overall bytes for this code point,
3487 n containts the number of following bytes: */
3488 --n;
3489 /* Check if the follow up chars are all valid continuation bytes */
3490 if (n >= 1) {
3491 const unsigned char *cont;
3492 if ((p + n) >= end) {
3493 if (consumed == 0)
3494 /* incomplete data, non-incremental decoding */
3495 err = 1;
3496 break;
3497 }
3498 for (cont = p + 1; cont < (p + n); ++cont) {
3499 if ((*cont & 0xc0) != 0x80) {
3500 err = 1;
3501 break;
3502 }
3503 }
3504 p += n;
3505 }
3506 else
3507 err = 1;
3508 max_char = new_max;
3509 }
3510 }
3511
3512 if (unicode_size)
3513 *unicode_size = char_count;
3514 if (has_errors)
3515 *has_errors = err;
3516 return max_char;
3517}
3518
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   kind  : one of PyUnicode_WCHAR_KIND, PyUnicode_1BYTE_KIND,
           PyUnicode_2BYTE_KIND; any other value is treated as the
           4-byte representation.  Evaluated exactly once.
   buf   : start of the character buffer; it is cast to the element
           type that matches `kind`.
   index : element index to store at.
   value : value to store; it is cast (and thereby possibly truncated)
           to the element type. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        if (k_ == PyUnicode_WCHAR_KIND)                                 \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else if (k_ == PyUnicode_1BYTE_KIND)                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND)                            \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        else                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
    } while (0)

Alexander Belopolsky40018472011-02-26 01:02:56 +00003534PyObject *
3535PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 Py_ssize_t size,
3537 const char *errors,
3538 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003542 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003543 Py_ssize_t startinpos;
3544 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003545 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003547 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 PyObject *errorHandler = NULL;
3549 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003550 Py_UCS4 maxchar = 0;
3551 Py_ssize_t unicode_size;
3552 Py_ssize_t i;
3553 int kind;
3554 void *data;
3555 int has_errors;
3556 Py_UNICODE *error_outptr;
3557#if SIZEOF_WCHAR_T == 2
3558 Py_ssize_t wchar_offset = 0;
3559#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560
Walter Dörwald69652032004-09-07 20:24:22 +00003561 if (size == 0) {
3562 if (consumed)
3563 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3567 consumed, &has_errors);
3568 if (has_errors) {
3569 unicode = _PyUnicode_New(size);
3570 if (!unicode)
3571 return NULL;
3572 kind = PyUnicode_WCHAR_KIND;
3573 data = PyUnicode_AS_UNICODE(unicode);
3574 assert(data != NULL);
3575 }
3576 else {
3577 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3578 if (!unicode)
3579 return NULL;
3580 /* When the string is ASCII only, just use memcpy and return.
3581 unicode_size may be != size if there is an incomplete UTF-8
3582 sequence at the end of the ASCII block. */
3583 if (maxchar < 128 && size == unicode_size) {
3584 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3585 return (PyObject *)unicode;
3586 }
3587 kind = PyUnicode_KIND(unicode);
3588 data = PyUnicode_DATA(unicode);
3589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003593 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594
3595 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003596 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
3598 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003599 /* Fast path for runs of ASCII characters. Given that common UTF-8
3600 input will consist of an overwhelming majority of ASCII
3601 characters, we try to optimize for this case by checking
3602 as many characters as a C 'long' can contain.
3603 First, check if we can do an aligned read, as most CPUs have
3604 a penalty for unaligned reads.
3605 */
3606 if (!((size_t) s & LONG_PTR_MASK)) {
3607 /* Help register allocation */
3608 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 while (_s < aligned_end) {
3611 /* Read a whole long at a time (either 4 or 8 bytes),
3612 and do a fast unrolled copy if it only contains ASCII
3613 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003614 unsigned long value = *(unsigned long *) _s;
3615 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003616 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003617 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3618 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3619 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3620 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003621#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003622 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3623 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3624 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3625 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003626#endif
3627 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003628 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003629 }
3630 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003631 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003632 if (s == e)
3633 break;
3634 ch = (unsigned char)*s;
3635 }
3636 }
3637
3638 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003639 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 s++;
3641 continue;
3642 }
3643
3644 n = utf8_code_length[ch];
3645
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003646 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 if (consumed)
3648 break;
3649 else {
3650 errmsg = "unexpected end of data";
3651 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003652 endinpos = startinpos+1;
3653 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3654 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 goto utf8Error;
3656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658
3659 switch (n) {
3660
3661 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003662 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003663 startinpos = s-starts;
3664 endinpos = startinpos+1;
3665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666
3667 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003668 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 startinpos = s-starts;
3670 endinpos = startinpos+1;
3671 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672
3673 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003674 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003675 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003677 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 goto utf8Error;
3679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003681 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 break;
3684
3685 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003686 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3687 will result in surrogates in range d800-dfff. Surrogates are
3688 not valid UTF-8 so they are rejected.
3689 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3690 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003691 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003692 (s[2] & 0xc0) != 0x80 ||
3693 ((unsigned char)s[0] == 0xE0 &&
3694 (unsigned char)s[1] < 0xA0) ||
3695 ((unsigned char)s[0] == 0xED &&
3696 (unsigned char)s[1] > 0x9F)) {
3697 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003699 endinpos = startinpos + 1;
3700
3701 /* if s[1] first two bits are 1 and 0, then the invalid
3702 continuation byte is s[2], so increment endinpos by 1,
3703 if not, s[1] is invalid and endinpos doesn't need to
3704 be incremented. */
3705 if ((s[1] & 0xC0) == 0x80)
3706 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 goto utf8Error;
3708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003710 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003712 break;
3713
3714 case 4:
3715 if ((s[1] & 0xc0) != 0x80 ||
3716 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003717 (s[3] & 0xc0) != 0x80 ||
3718 ((unsigned char)s[0] == 0xF0 &&
3719 (unsigned char)s[1] < 0x90) ||
3720 ((unsigned char)s[0] == 0xF4 &&
3721 (unsigned char)s[1] > 0x8F)) {
3722 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003724 endinpos = startinpos + 1;
3725 if ((s[1] & 0xC0) == 0x80) {
3726 endinpos++;
3727 if ((s[2] & 0xC0) == 0x80)
3728 endinpos++;
3729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 goto utf8Error;
3731 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003732 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003733 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3734 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 /* If the string is flexible or we have native UCS-4, write
3737 directly.. */
3738 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3739 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 else {
3742 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 /* translate from 10000..10FFFF to 0..FFFF */
3745 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 /* high surrogate = top 10 bits added to D800 */
3748 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3749 (Py_UNICODE)(0xD800 + (ch >> 10)));
3750
3751 /* low surrogate = bottom 10 bits added to DC00 */
3752 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3753 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3754 }
3755#if SIZEOF_WCHAR_T == 2
3756 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003757#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 }
3760 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003762
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 /* If this is not yet a resizable string, make it one.. */
3765 if (kind != PyUnicode_WCHAR_KIND) {
3766 const Py_UNICODE *u;
3767 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3768 if (!new_unicode)
3769 goto onError;
3770 u = PyUnicode_AsUnicode((PyObject *)unicode);
3771 if (!u)
3772 goto onError;
3773#if SIZEOF_WCHAR_T == 2
3774 i += wchar_offset;
3775#endif
3776 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3777 Py_DECREF(unicode);
3778 unicode = new_unicode;
3779 kind = 0;
3780 data = PyUnicode_AS_UNICODE(new_unicode);
3781 assert(data != NULL);
3782 }
3783 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 if (unicode_decode_call_errorhandler(
3785 errors, &errorHandler,
3786 "utf8", errmsg,
3787 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003789 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 /* Update data because unicode_decode_call_errorhandler might have
3791 re-created or resized the unicode object. */
3792 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 /* Ensure the unicode_size calculation above was correct: */
3796 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3797
Walter Dörwald69652032004-09-07 20:24:22 +00003798 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 /* Adjust length and ready string when it contained errors and
3802 is of the old resizable kind. */
3803 if (kind == PyUnicode_WCHAR_KIND) {
3804 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3805 PyUnicode_READY(unicode) == -1)
3806 goto onError;
3807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 if (PyUnicode_READY(unicode) == -1) {
3812 Py_DECREF(unicode);
3813 return NULL;
3814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return (PyObject *)unicode;
3816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 Py_XDECREF(errorHandler);
3819 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 Py_DECREF(unicode);
3821 return NULL;
3822}
3823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003825
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003826#ifdef __APPLE__
3827
3828/* Simplified UTF-8 decoder using surrogateescape error handler,
3829 used to decode the command line arguments on Mac OS X. */
3830
3831wchar_t*
3832_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3833{
3834 int n;
3835 const char *e;
3836 wchar_t *unicode, *p;
3837
3838 /* Note: size will always be longer than the resulting Unicode
3839 character count */
3840 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3841 PyErr_NoMemory();
3842 return NULL;
3843 }
3844 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3845 if (!unicode)
3846 return NULL;
3847
3848 /* Unpack UTF-8 encoded data */
3849 p = unicode;
3850 e = s + size;
3851 while (s < e) {
3852 Py_UCS4 ch = (unsigned char)*s;
3853
3854 if (ch < 0x80) {
3855 *p++ = (wchar_t)ch;
3856 s++;
3857 continue;
3858 }
3859
3860 n = utf8_code_length[ch];
3861 if (s + n > e) {
3862 goto surrogateescape;
3863 }
3864
3865 switch (n) {
3866 case 0:
3867 case 1:
3868 goto surrogateescape;
3869
3870 case 2:
3871 if ((s[1] & 0xc0) != 0x80)
3872 goto surrogateescape;
3873 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3874 assert ((ch > 0x007F) && (ch <= 0x07FF));
3875 *p++ = (wchar_t)ch;
3876 break;
3877
3878 case 3:
3879 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3880 will result in surrogates in range d800-dfff. Surrogates are
3881 not valid UTF-8 so they are rejected.
3882 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3883 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3884 if ((s[1] & 0xc0) != 0x80 ||
3885 (s[2] & 0xc0) != 0x80 ||
3886 ((unsigned char)s[0] == 0xE0 &&
3887 (unsigned char)s[1] < 0xA0) ||
3888 ((unsigned char)s[0] == 0xED &&
3889 (unsigned char)s[1] > 0x9F)) {
3890
3891 goto surrogateescape;
3892 }
3893 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3894 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003896 break;
3897
3898 case 4:
3899 if ((s[1] & 0xc0) != 0x80 ||
3900 (s[2] & 0xc0) != 0x80 ||
3901 (s[3] & 0xc0) != 0x80 ||
3902 ((unsigned char)s[0] == 0xF0 &&
3903 (unsigned char)s[1] < 0x90) ||
3904 ((unsigned char)s[0] == 0xF4 &&
3905 (unsigned char)s[1] > 0x8F)) {
3906 goto surrogateescape;
3907 }
3908 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3909 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3910 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3911
3912#if SIZEOF_WCHAR_T == 4
3913 *p++ = (wchar_t)ch;
3914#else
3915 /* compute and append the two surrogates: */
3916
3917 /* translate from 10000..10FFFF to 0..FFFF */
3918 ch -= 0x10000;
3919
3920 /* high surrogate = top 10 bits added to D800 */
3921 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3922
3923 /* low surrogate = bottom 10 bits added to DC00 */
3924 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3925#endif
3926 break;
3927 }
3928 s += n;
3929 continue;
3930
3931 surrogateescape:
3932 *p++ = 0xDC00 + ch;
3933 s++;
3934 }
3935 *p = L'\0';
3936 return unicode;
3937}
3938
3939#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941/* Primary internal function which creates utf8 encoded bytes objects.
3942
3943 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003944 and allocate exactly as much space needed at the end. Else allocate the
3945 maximum possible needed (4 result bytes per Unicode character), and return
3946 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003947*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003948PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950{
Tim Peters602f7402002-04-27 18:03:26 +00003951#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003952
Guido van Rossum98297ee2007-11-06 21:34:58 +00003953 Py_ssize_t i; /* index into s of next input byte */
3954 PyObject *result; /* result string object */
3955 char *p; /* next free byte in output buffer */
3956 Py_ssize_t nallocated; /* number of result bytes allocated */
3957 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003958 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003959 PyObject *errorHandler = NULL;
3960 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 int kind;
3962 void *data;
3963 Py_ssize_t size;
3964 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3965#if SIZEOF_WCHAR_T == 2
3966 Py_ssize_t wchar_offset = 0;
3967#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 if (!PyUnicode_Check(unicode)) {
3970 PyErr_BadArgument();
3971 return NULL;
3972 }
3973
3974 if (PyUnicode_READY(unicode) == -1)
3975 return NULL;
3976
3977 if (_PyUnicode_UTF8(unicode))
3978 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3979 _PyUnicode_UTF8_LENGTH(unicode));
3980
3981 kind = PyUnicode_KIND(unicode);
3982 data = PyUnicode_DATA(unicode);
3983 size = PyUnicode_GET_LENGTH(unicode);
3984
Tim Peters602f7402002-04-27 18:03:26 +00003985 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
Tim Peters602f7402002-04-27 18:03:26 +00003987 if (size <= MAX_SHORT_UNICHARS) {
3988 /* Write into the stack buffer; nallocated can't overflow.
3989 * At the end, we'll allocate exactly as much heap space as it
3990 * turns out we need.
3991 */
3992 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003993 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003994 p = stackbuf;
3995 }
3996 else {
3997 /* Overallocate on the heap, and give the excess back at the end. */
3998 nallocated = size * 4;
3999 if (nallocated / 4 != size) /* overflow! */
4000 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004001 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004002 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004003 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004004 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004005 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004006
Tim Peters602f7402002-04-27 18:03:26 +00004007 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004009
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004010 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004011 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004013
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004015 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004016 *p++ = (char)(0xc0 | (ch >> 6));
4017 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004018 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_ssize_t newpos;
4020 PyObject *rep;
4021 Py_ssize_t repsize, k, startpos;
4022 startpos = i-1;
4023#if SIZEOF_WCHAR_T == 2
4024 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004025#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 rep = unicode_encode_call_errorhandler(
4027 errors, &errorHandler, "utf-8", "surrogates not allowed",
4028 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4029 &exc, startpos, startpos+1, &newpos);
4030 if (!rep)
4031 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 if (PyBytes_Check(rep))
4034 repsize = PyBytes_GET_SIZE(rep);
4035 else
4036 repsize = PyUnicode_GET_SIZE(rep);
4037
4038 if (repsize > 4) {
4039 Py_ssize_t offset;
4040
4041 if (result == NULL)
4042 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004043 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4047 /* integer overflow */
4048 PyErr_NoMemory();
4049 goto error;
4050 }
4051 nallocated += repsize - 4;
4052 if (result != NULL) {
4053 if (_PyBytes_Resize(&result, nallocated) < 0)
4054 goto error;
4055 } else {
4056 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004057 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 goto error;
4059 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4060 }
4061 p = PyBytes_AS_STRING(result) + offset;
4062 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 if (PyBytes_Check(rep)) {
4065 char *prep = PyBytes_AS_STRING(rep);
4066 for(k = repsize; k > 0; k--)
4067 *p++ = *prep++;
4068 } else /* rep is unicode */ {
4069 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4070 Py_UNICODE c;
4071
4072 for(k=0; k<repsize; k++) {
4073 c = prep[k];
4074 if (0x80 <= c) {
4075 raise_encode_exception(&exc, "utf-8",
4076 PyUnicode_AS_UNICODE(unicode),
4077 size, i-1, i,
4078 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004079 goto error;
4080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004082 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004085 } else if (ch < 0x10000) {
4086 *p++ = (char)(0xe0 | (ch >> 12));
4087 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4088 *p++ = (char)(0x80 | (ch & 0x3f));
4089 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004090 /* Encode UCS4 Unicode ordinals */
4091 *p++ = (char)(0xf0 | (ch >> 18));
4092 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4093 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4094 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095#if SIZEOF_WCHAR_T == 2
4096 wchar_offset++;
4097#endif
Tim Peters602f7402002-04-27 18:03:26 +00004098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004100
Guido van Rossum98297ee2007-11-06 21:34:58 +00004101 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004102 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004103 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004104 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004105 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004106 }
4107 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004108 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004109 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004110 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004111 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004114 Py_XDECREF(errorHandler);
4115 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004116 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004117 error:
4118 Py_XDECREF(errorHandler);
4119 Py_XDECREF(exc);
4120 Py_XDECREF(result);
4121 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004122
Tim Peters602f7402002-04-27 18:03:26 +00004123#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124}
4125
Alexander Belopolsky40018472011-02-26 01:02:56 +00004126PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4128 Py_ssize_t size,
4129 const char *errors)
4130{
4131 PyObject *v, *unicode;
4132
4133 unicode = PyUnicode_FromUnicode(s, size);
4134 if (unicode == NULL)
4135 return NULL;
4136 v = _PyUnicode_AsUTF8String(unicode, errors);
4137 Py_DECREF(unicode);
4138 return v;
4139}
4140
4141PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145}
4146
Walter Dörwald41980ca2007-08-16 21:55:45 +00004147/* --- UTF-32 Codec ------------------------------------------------------- */
4148
4149PyObject *
4150PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 Py_ssize_t size,
4152 const char *errors,
4153 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004154{
4155 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4156}
4157
4158PyObject *
4159PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 Py_ssize_t size,
4161 const char *errors,
4162 int *byteorder,
4163 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004164{
4165 const char *starts = s;
4166 Py_ssize_t startinpos;
4167 Py_ssize_t endinpos;
4168 Py_ssize_t outpos;
4169 PyUnicodeObject *unicode;
4170 Py_UNICODE *p;
4171#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004172 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004173 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004174#else
4175 const int pairs = 0;
4176#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004177 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004178 int bo = 0; /* assume native ordering by default */
4179 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004180 /* Offsets from q for retrieving bytes in the right order. */
4181#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4182 int iorder[] = {0, 1, 2, 3};
4183#else
4184 int iorder[] = {3, 2, 1, 0};
4185#endif
4186 PyObject *errorHandler = NULL;
4187 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004188
Walter Dörwald41980ca2007-08-16 21:55:45 +00004189 q = (unsigned char *)s;
4190 e = q + size;
4191
4192 if (byteorder)
4193 bo = *byteorder;
4194
4195 /* Check for BOM marks (U+FEFF) in the input and adjust current
4196 byte order setting accordingly. In native mode, the leading BOM
4197 mark is skipped, in all other modes, it is copied to the output
4198 stream as-is (giving a ZWNBSP character). */
4199 if (bo == 0) {
4200 if (size >= 4) {
4201 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004203#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 if (bom == 0x0000FEFF) {
4205 q += 4;
4206 bo = -1;
4207 }
4208 else if (bom == 0xFFFE0000) {
4209 q += 4;
4210 bo = 1;
4211 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004212#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 if (bom == 0x0000FEFF) {
4214 q += 4;
4215 bo = 1;
4216 }
4217 else if (bom == 0xFFFE0000) {
4218 q += 4;
4219 bo = -1;
4220 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004221#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004222 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004223 }
4224
4225 if (bo == -1) {
4226 /* force LE */
4227 iorder[0] = 0;
4228 iorder[1] = 1;
4229 iorder[2] = 2;
4230 iorder[3] = 3;
4231 }
4232 else if (bo == 1) {
4233 /* force BE */
4234 iorder[0] = 3;
4235 iorder[1] = 2;
4236 iorder[2] = 1;
4237 iorder[3] = 0;
4238 }
4239
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004240 /* On narrow builds we split characters outside the BMP into two
4241 codepoints => count how much extra space we need. */
4242#ifndef Py_UNICODE_WIDE
4243 for (qq = q; qq < e; qq += 4)
4244 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4245 pairs++;
4246#endif
4247
4248 /* This might be one to much, because of a BOM */
4249 unicode = _PyUnicode_New((size+3)/4+pairs);
4250 if (!unicode)
4251 return NULL;
4252 if (size == 0)
4253 return (PyObject *)unicode;
4254
4255 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004257
Walter Dörwald41980ca2007-08-16 21:55:45 +00004258 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 Py_UCS4 ch;
4260 /* remaining bytes at the end? (size should be divisible by 4) */
4261 if (e-q<4) {
4262 if (consumed)
4263 break;
4264 errmsg = "truncated data";
4265 startinpos = ((const char *)q)-starts;
4266 endinpos = ((const char *)e)-starts;
4267 goto utf32Error;
4268 /* The remaining input chars are ignored if the callback
4269 chooses to skip the input */
4270 }
4271 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4272 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004273
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 if (ch >= 0x110000)
4275 {
4276 errmsg = "codepoint not in range(0x110000)";
4277 startinpos = ((const char *)q)-starts;
4278 endinpos = startinpos+4;
4279 goto utf32Error;
4280 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004281#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 if (ch >= 0x10000)
4283 {
4284 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4285 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4286 }
4287 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004288#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 *p++ = ch;
4290 q += 4;
4291 continue;
4292 utf32Error:
4293 outpos = p-PyUnicode_AS_UNICODE(unicode);
4294 if (unicode_decode_call_errorhandler(
4295 errors, &errorHandler,
4296 "utf32", errmsg,
4297 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4298 &unicode, &outpos, &p))
4299 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004300 }
4301
4302 if (byteorder)
4303 *byteorder = bo;
4304
4305 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004307
4308 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004309 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004310 goto onError;
4311
4312 Py_XDECREF(errorHandler);
4313 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004314 if (PyUnicode_READY(unicode) == -1) {
4315 Py_DECREF(unicode);
4316 return NULL;
4317 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004318 return (PyObject *)unicode;
4319
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004321 Py_DECREF(unicode);
4322 Py_XDECREF(errorHandler);
4323 Py_XDECREF(exc);
4324 return NULL;
4325}
4326
4327PyObject *
4328PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 Py_ssize_t size,
4330 const char *errors,
4331 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004332{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004333 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004335 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004336#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004337 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004338#else
4339 const int pairs = 0;
4340#endif
4341 /* Offsets from p for storing byte pairs in the right order. */
4342#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4343 int iorder[] = {0, 1, 2, 3};
4344#else
4345 int iorder[] = {3, 2, 1, 0};
4346#endif
4347
Benjamin Peterson29060642009-01-31 22:14:21 +00004348#define STORECHAR(CH) \
4349 do { \
4350 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4351 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4352 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4353 p[iorder[0]] = (CH) & 0xff; \
4354 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004355 } while(0)
4356
4357 /* In narrow builds we can output surrogate pairs as one codepoint,
4358 so we need less space. */
4359#ifndef Py_UNICODE_WIDE
4360 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4362 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4363 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004364#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004365 nsize = (size - pairs + (byteorder == 0));
4366 bytesize = nsize * 4;
4367 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004369 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004370 if (v == NULL)
4371 return NULL;
4372
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004373 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004374 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004376 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004377 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004378
4379 if (byteorder == -1) {
4380 /* force LE */
4381 iorder[0] = 0;
4382 iorder[1] = 1;
4383 iorder[2] = 2;
4384 iorder[3] = 3;
4385 }
4386 else if (byteorder == 1) {
4387 /* force BE */
4388 iorder[0] = 3;
4389 iorder[1] = 2;
4390 iorder[2] = 1;
4391 iorder[3] = 0;
4392 }
4393
4394 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004396#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4398 Py_UCS4 ch2 = *s;
4399 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4400 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4401 s++;
4402 size--;
4403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004404 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004405#endif
4406 STORECHAR(ch);
4407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004408
4409 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004410 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004411#undef STORECHAR
4412}
4413
Alexander Belopolsky40018472011-02-26 01:02:56 +00004414PyObject *
4415PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004416{
4417 if (!PyUnicode_Check(unicode)) {
4418 PyErr_BadArgument();
4419 return NULL;
4420 }
4421 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 PyUnicode_GET_SIZE(unicode),
4423 NULL,
4424 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004425}
4426
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427/* --- UTF-16 Codec ------------------------------------------------------- */
4428
Tim Peters772747b2001-08-09 22:21:55 +00004429PyObject *
4430PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 Py_ssize_t size,
4432 const char *errors,
4433 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Walter Dörwald69652032004-09-07 20:24:22 +00004435 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4436}
4437
Antoine Pitrouab868312009-01-10 15:40:25 +00004438/* Two masks for fast checking of whether a C 'long' may contain
4439 UTF16-encoded surrogate characters. This is an efficient heuristic,
4440 assuming that non-surrogate characters with a code point >= 0x8000 are
4441 rare in most input.
4442 FAST_CHAR_MASK is used when the input is in native byte ordering,
4443 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004444*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004445#if (SIZEOF_LONG == 8)
4446# define FAST_CHAR_MASK 0x8000800080008000L
4447# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4448#elif (SIZEOF_LONG == 4)
4449# define FAST_CHAR_MASK 0x80008000L
4450# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4451#else
4452# error C 'long' size should be either 4 or 8!
4453#endif
4454
Walter Dörwald69652032004-09-07 20:24:22 +00004455PyObject *
4456PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 Py_ssize_t size,
4458 const char *errors,
4459 int *byteorder,
4460 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004461{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004463 Py_ssize_t startinpos;
4464 Py_ssize_t endinpos;
4465 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 PyUnicodeObject *unicode;
4467 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004468 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004469 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004470 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004471 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004472 /* Offsets from q for retrieving byte pairs in the right order. */
4473#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4474 int ihi = 1, ilo = 0;
4475#else
4476 int ihi = 0, ilo = 1;
4477#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 PyObject *errorHandler = NULL;
4479 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480
4481 /* Note: size will always be longer than the resulting Unicode
4482 character count */
4483 unicode = _PyUnicode_New(size);
4484 if (!unicode)
4485 return NULL;
4486 if (size == 0)
4487 return (PyObject *)unicode;
4488
4489 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004491 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004492 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493
4494 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004495 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004497 /* Check for BOM marks (U+FEFF) in the input and adjust current
4498 byte order setting accordingly. In native mode, the leading BOM
4499 mark is skipped, in all other modes, it is copied to the output
4500 stream as-is (giving a ZWNBSP character). */
4501 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004502 if (size >= 2) {
4503 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004504#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 if (bom == 0xFEFF) {
4506 q += 2;
4507 bo = -1;
4508 }
4509 else if (bom == 0xFFFE) {
4510 q += 2;
4511 bo = 1;
4512 }
Tim Petersced69f82003-09-16 20:30:58 +00004513#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 if (bom == 0xFEFF) {
4515 q += 2;
4516 bo = 1;
4517 }
4518 else if (bom == 0xFFFE) {
4519 q += 2;
4520 bo = -1;
4521 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004522#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
Tim Peters772747b2001-08-09 22:21:55 +00004526 if (bo == -1) {
4527 /* force LE */
4528 ihi = 1;
4529 ilo = 0;
4530 }
4531 else if (bo == 1) {
4532 /* force BE */
4533 ihi = 0;
4534 ilo = 1;
4535 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004536#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4537 native_ordering = ilo < ihi;
4538#else
4539 native_ordering = ilo > ihi;
4540#endif
Tim Peters772747b2001-08-09 22:21:55 +00004541
Antoine Pitrouab868312009-01-10 15:40:25 +00004542 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004543 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004545 /* First check for possible aligned read of a C 'long'. Unaligned
4546 reads are more expensive, better to defer to another iteration. */
4547 if (!((size_t) q & LONG_PTR_MASK)) {
4548 /* Fast path for runs of non-surrogate chars. */
4549 register const unsigned char *_q = q;
4550 Py_UNICODE *_p = p;
4551 if (native_ordering) {
4552 /* Native ordering is simple: as long as the input cannot
4553 possibly contain a surrogate char, do an unrolled copy
4554 of several 16-bit code points to the target object.
4555 The non-surrogate check is done on several input bytes
4556 at a time (as many as a C 'long' can contain). */
4557 while (_q < aligned_end) {
4558 unsigned long data = * (unsigned long *) _q;
4559 if (data & FAST_CHAR_MASK)
4560 break;
4561 _p[0] = ((unsigned short *) _q)[0];
4562 _p[1] = ((unsigned short *) _q)[1];
4563#if (SIZEOF_LONG == 8)
4564 _p[2] = ((unsigned short *) _q)[2];
4565 _p[3] = ((unsigned short *) _q)[3];
4566#endif
4567 _q += SIZEOF_LONG;
4568 _p += SIZEOF_LONG / 2;
4569 }
4570 }
4571 else {
4572 /* Byteswapped ordering is similar, but we must decompose
4573 the copy bytewise, and take care of zero'ing out the
4574 upper bytes if the target object is in 32-bit units
4575 (that is, in UCS-4 builds). */
4576 while (_q < aligned_end) {
4577 unsigned long data = * (unsigned long *) _q;
4578 if (data & SWAPPED_FAST_CHAR_MASK)
4579 break;
4580 /* Zero upper bytes in UCS-4 builds */
4581#if (Py_UNICODE_SIZE > 2)
4582 _p[0] = 0;
4583 _p[1] = 0;
4584#if (SIZEOF_LONG == 8)
4585 _p[2] = 0;
4586 _p[3] = 0;
4587#endif
4588#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004589 /* Issue #4916; UCS-4 builds on big endian machines must
4590 fill the two last bytes of each 4-byte unit. */
4591#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4592# define OFF 2
4593#else
4594# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004595#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004596 ((unsigned char *) _p)[OFF + 1] = _q[0];
4597 ((unsigned char *) _p)[OFF + 0] = _q[1];
4598 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4599 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4600#if (SIZEOF_LONG == 8)
4601 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4602 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4603 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4604 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4605#endif
4606#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004607 _q += SIZEOF_LONG;
4608 _p += SIZEOF_LONG / 2;
4609 }
4610 }
4611 p = _p;
4612 q = _q;
4613 if (q >= e)
4614 break;
4615 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617
Benjamin Peterson14339b62009-01-31 16:36:08 +00004618 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004619
4620 if (ch < 0xD800 || ch > 0xDFFF) {
4621 *p++ = ch;
4622 continue;
4623 }
4624
4625 /* UTF-16 code pair: */
4626 if (q > e) {
4627 errmsg = "unexpected end of data";
4628 startinpos = (((const char *)q) - 2) - starts;
4629 endinpos = ((const char *)e) + 1 - starts;
4630 goto utf16Error;
4631 }
4632 if (0xD800 <= ch && ch <= 0xDBFF) {
4633 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4634 q += 2;
4635 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004636#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 *p++ = ch;
4638 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004639#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004641#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 continue;
4643 }
4644 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004645 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 startinpos = (((const char *)q)-4)-starts;
4647 endinpos = startinpos+2;
4648 goto utf16Error;
4649 }
4650
Benjamin Peterson14339b62009-01-31 16:36:08 +00004651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004652 errmsg = "illegal encoding";
4653 startinpos = (((const char *)q)-2)-starts;
4654 endinpos = startinpos+2;
4655 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004656
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 utf16Error:
4658 outpos = p - PyUnicode_AS_UNICODE(unicode);
4659 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004660 errors,
4661 &errorHandler,
4662 "utf16", errmsg,
4663 &starts,
4664 (const char **)&e,
4665 &startinpos,
4666 &endinpos,
4667 &exc,
4668 (const char **)&q,
4669 &unicode,
4670 &outpos,
4671 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004674 /* remaining byte at the end? (size should be even) */
4675 if (e == q) {
4676 if (!consumed) {
4677 errmsg = "truncated data";
4678 startinpos = ((const char *)q) - starts;
4679 endinpos = ((const char *)e) + 1 - starts;
4680 outpos = p - PyUnicode_AS_UNICODE(unicode);
4681 if (unicode_decode_call_errorhandler(
4682 errors,
4683 &errorHandler,
4684 "utf16", errmsg,
4685 &starts,
4686 (const char **)&e,
4687 &startinpos,
4688 &endinpos,
4689 &exc,
4690 (const char **)&q,
4691 &unicode,
4692 &outpos,
4693 &p))
4694 goto onError;
4695 /* The remaining input chars are ignored if the callback
4696 chooses to skip the input */
4697 }
4698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699
4700 if (byteorder)
4701 *byteorder = bo;
4702
Walter Dörwald69652032004-09-07 20:24:22 +00004703 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004705
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004707 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 goto onError;
4709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_XDECREF(errorHandler);
4711 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712 if (PyUnicode_READY(unicode) == -1) {
4713 Py_DECREF(unicode);
4714 return NULL;
4715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 return (PyObject *)unicode;
4717
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004720 Py_XDECREF(errorHandler);
4721 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 return NULL;
4723}
4724
Antoine Pitrouab868312009-01-10 15:40:25 +00004725#undef FAST_CHAR_MASK
4726#undef SWAPPED_FAST_CHAR_MASK
4727
Tim Peters772747b2001-08-09 22:21:55 +00004728PyObject *
4729PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 Py_ssize_t size,
4731 const char *errors,
4732 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004734 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004735 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004736 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004737#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004738 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004739#else
4740 const int pairs = 0;
4741#endif
Tim Peters772747b2001-08-09 22:21:55 +00004742 /* Offsets from p for storing byte pairs in the right order. */
4743#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4744 int ihi = 1, ilo = 0;
4745#else
4746 int ihi = 0, ilo = 1;
4747#endif
4748
Benjamin Peterson29060642009-01-31 22:14:21 +00004749#define STORECHAR(CH) \
4750 do { \
4751 p[ihi] = ((CH) >> 8) & 0xff; \
4752 p[ilo] = (CH) & 0xff; \
4753 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004754 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004756#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004757 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 if (s[i] >= 0x10000)
4759 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004760#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004761 /* 2 * (size + pairs + (byteorder == 0)) */
4762 if (size > PY_SSIZE_T_MAX ||
4763 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004765 nsize = size + pairs + (byteorder == 0);
4766 bytesize = nsize * 2;
4767 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004768 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 if (v == NULL)
4771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004776 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004777 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004778
4779 if (byteorder == -1) {
4780 /* force LE */
4781 ihi = 1;
4782 ilo = 0;
4783 }
4784 else if (byteorder == 1) {
4785 /* force BE */
4786 ihi = 0;
4787 ilo = 1;
4788 }
4789
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004790 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 Py_UNICODE ch = *s++;
4792 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004793#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 if (ch >= 0x10000) {
4795 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4796 ch = 0xD800 | ((ch-0x10000) >> 10);
4797 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004798#endif
Tim Peters772747b2001-08-09 22:21:55 +00004799 STORECHAR(ch);
4800 if (ch2)
4801 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004802 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004803
4804 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004805 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004806#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807}
4808
Alexander Belopolsky40018472011-02-26 01:02:56 +00004809PyObject *
4810PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
4812 if (!PyUnicode_Check(unicode)) {
4813 PyErr_BadArgument();
4814 return NULL;
4815 }
4816 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 PyUnicode_GET_SIZE(unicode),
4818 NULL,
4819 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820}
4821
4822/* --- Unicode Escape Codec ----------------------------------------------- */
4823
/* Helper function for PyUnicode_DecodeUnicodeEscape: pre-scans the escaped
   input to decide whether the decoded result is guaranteed to be pure
   ASCII, and if so how many characters it will contain.

   s    -- start of the escaped input
   size -- number of bytes at s

   Returns the number of characters the decoded string needs, or -1 when
   the result cannot be sized as plain ASCII here:
     - size is negative,
     - a byte above 127 occurs,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape occurs (these do not guarantee
       ASCII characters).
   Backslash + newline contributes zero characters; an unrecognized escape
   is kept verbatim and therefore counts as two characters. */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with it:
       forming a pointer in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one ASCII char */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4878
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   Stores `value` at position `index` of `buf`.  When `kind` is
   PyUnicode_WCHAR_KIND, `buf` is addressed as a Py_UNICODE array;
   for any other kind it is addressed as an unsigned char array and
   `value` is truncated to one byte.  Exactly one branch runs, so
   `index` and `value` are evaluated once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` at position `index` of `buf`, addressed as a Py_UNICODE
   array.  NOTE: this macro reads a variable named `kind` from the
   enclosing scope (it is not a macro parameter) and asserts that it is
   PyUnicode_WCHAR_KIND.  The whole expansion is parenthesized so that it
   behaves as one expression wherever it is used. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4892
4893
Fredrik Lundh06d12682001-01-24 07:59:11 +00004894static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004895
Alexander Belopolsky40018472011-02-26 01:02:56 +00004896PyObject *
4897PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004898 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 Py_ssize_t startinpos;
4903 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004908 char* message;
4909 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 PyObject *errorHandler = NULL;
4911 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912 Py_ssize_t ascii_length;
4913 Py_ssize_t i;
4914 int kind;
4915 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004917 ascii_length = length_of_escaped_ascii_string(s, size);
4918
4919 /* After length_of_escaped_ascii_string() there are two alternatives,
4920 either the string is pure ASCII with named escapes like \n, etc.
4921 and we determined it's exact size (common case)
4922 or it contains \x, \u, ... escape sequences. then we create a
4923 legacy wchar string and resize it at the end of this function. */
4924 if (ascii_length >= 0) {
4925 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4926 if (!v)
4927 goto onError;
4928 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4929 kind = PyUnicode_1BYTE_KIND;
4930 data = PyUnicode_DATA(v);
4931 }
4932 else {
4933 /* Escaped strings will always be longer than the resulting
4934 Unicode string, so we start with size here and then reduce the
4935 length after conversion to the true value.
4936 (but if the error callback returns a long replacement string
4937 we'll have to allocate more space) */
4938 v = _PyUnicode_New(size);
4939 if (!v)
4940 goto onError;
4941 kind = PyUnicode_WCHAR_KIND;
4942 data = PyUnicode_AS_UNICODE(v);
4943 }
4944
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945 if (size == 0)
4946 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 while (s < end) {
4951 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004952 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004955 if (kind == PyUnicode_WCHAR_KIND) {
4956 assert(i < _PyUnicode_WSTR_LENGTH(v));
4957 }
4958 else {
4959 /* The only case in which i == ascii_length is a backslash
4960 followed by a newline. */
4961 assert(i <= ascii_length);
4962 }
4963
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 /* Non-escape characters are interpreted as Unicode ordinals */
4965 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004966 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 continue;
4968 }
4969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971 /* \ - Escapes */
4972 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004973 c = *s++;
4974 if (s > end)
4975 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004976
4977 if (kind == PyUnicode_WCHAR_KIND) {
4978 assert(i < _PyUnicode_WSTR_LENGTH(v));
4979 }
4980 else {
4981 /* The only case in which i == ascii_length is a backslash
4982 followed by a newline. */
4983 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4984 }
4985
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004986 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004990 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4991 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4992 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4993 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4994 /* FF */
4995 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4996 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4997 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
4998 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
4999 /* VT */
5000 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5001 /* BEL, not classic C */
5002 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 case '0': case '1': case '2': case '3':
5006 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005007 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005008 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005009 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005010 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005011 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005013 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 break;
5015
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 /* hex escapes */
5017 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005019 digits = 2;
5020 message = "truncated \\xXX escape";
5021 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005025 digits = 4;
5026 message = "truncated \\uXXXX escape";
5027 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005030 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005031 digits = 8;
5032 message = "truncated \\UXXXXXXXX escape";
5033 hexescape:
5034 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005035 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 if (s+digits>end) {
5037 endinpos = size;
5038 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 errors, &errorHandler,
5040 "unicodeescape", "end of string in escape sequence",
5041 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005042 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005043 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 goto nextByte;
5046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 for (j = 0; j < digits; ++j) {
5048 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005049 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050 endinpos = (s+j+1)-starts;
5051 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 errors, &errorHandler,
5054 "unicodeescape", message,
5055 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005056 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005057 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005058 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005059 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005060 }
5061 chr = (chr<<4) & ~0xF;
5062 if (c >= '0' && c <= '9')
5063 chr += c - '0';
5064 else if (c >= 'a' && c <= 'f')
5065 chr += 10 + c - 'a';
5066 else
5067 chr += 10 + c - 'A';
5068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005070 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071 /* _decoding_error will have already written into the
5072 target buffer. */
5073 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005074 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005075 /* when we get here, chr is a 32-bit unicode character */
5076 if (chr <= 0xffff)
5077 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005078 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005079 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005080 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005081 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005082#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005083 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005084#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005085 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005086 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5087 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005088#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005089 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005091 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 errors, &errorHandler,
5094 "unicodeescape", "illegal Unicode character",
5095 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005097 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005098 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005099 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005100 break;
5101
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005103 case 'N':
5104 message = "malformed \\N character escape";
5105 if (ucnhash_CAPI == NULL) {
5106 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005107 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5108 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005109 if (ucnhash_CAPI == NULL)
5110 goto ucnhashError;
5111 }
5112 if (*s == '{') {
5113 const char *start = s+1;
5114 /* look for the closing brace */
5115 while (*s != '}' && s < end)
5116 s++;
5117 if (s > start && s < end && *s == '}') {
5118 /* found a name. look it up in the unicode database */
5119 message = "unknown Unicode character name";
5120 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005121 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5122 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005123 goto store;
5124 }
5125 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005127 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 errors, &errorHandler,
5130 "unicodeescape", message,
5131 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005132 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005133 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005135 break;
5136
5137 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005138 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005139 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 message = "\\ at end of string";
5141 s--;
5142 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005143 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 errors, &errorHandler,
5146 "unicodeescape", message,
5147 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005149 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005151 }
5152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5154 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005155 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005156 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 /* Ensure the length prediction worked in case of ASCII strings */
5162 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5163
5164 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5165 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005166 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005167 Py_XDECREF(errorHandler);
5168 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005170
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005172 PyErr_SetString(
5173 PyExc_UnicodeError,
5174 "\\N escapes not supported (can't load unicodedata module)"
5175 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005176 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 Py_XDECREF(errorHandler);
5178 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005179 return NULL;
5180
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 Py_XDECREF(errorHandler);
5184 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 return NULL;
5186}
5187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005188#undef WRITE_ASCII_OR_WSTR
5189#undef WRITE_WSTR
5190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191/* Return a Unicode-Escape string version of the Unicode object.
5192
5193 If quotes is true, the string is enclosed in u"" or u'' quotes as
5194 appropriate.
5195
5196*/
5197
Walter Dörwald79e913e2007-05-12 11:08:06 +00005198static const char *hexdigits = "0123456789abcdef";
5199
Alexander Belopolsky40018472011-02-26 01:02:56 +00005200PyObject *
5201PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005202 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005204 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005207#ifdef Py_UNICODE_WIDE
5208 const Py_ssize_t expandsize = 10;
5209#else
5210 const Py_ssize_t expandsize = 6;
5211#endif
5212
Thomas Wouters89f507f2006-12-13 04:49:30 +00005213 /* XXX(nnorwitz): rather than over-allocating, it would be
5214 better to choose a different scheme. Perhaps scan the
5215 first N-chars of the string and allocate based on that size.
5216 */
5217 /* Initial allocation is based on the longest-possible unichr
5218 escape.
5219
5220 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5221 unichr, so in this case it's the longest unichr escape. In
5222 narrow (UTF-16) builds this is five chars per source unichr
5223 since there are two unichrs in the surrogate pair, so in narrow
5224 (UTF-16) builds it's not the longest unichr escape.
5225
5226 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5227 so in the narrow (UTF-16) build case it's the longest unichr
5228 escape.
5229 */
5230
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 if (size == 0)
5232 return PyBytes_FromStringAndSize(NULL, 0);
5233
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005234 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005236
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005237 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 2
5239 + expandsize*size
5240 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 if (repr == NULL)
5242 return NULL;
5243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005244 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 while (size-- > 0) {
5247 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005248
Walter Dörwald79e913e2007-05-12 11:08:06 +00005249 /* Escape backslashes */
5250 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 *p++ = '\\';
5252 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005254 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005255
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005256#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005257 /* Map 21-bit characters to '\U00xxxxxx' */
5258 else if (ch >= 0x10000) {
5259 *p++ = '\\';
5260 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005261 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5262 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5263 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5264 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5265 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5266 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5267 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5268 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005270 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005271#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5273 else if (ch >= 0xD800 && ch < 0xDC00) {
5274 Py_UNICODE ch2;
5275 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005276
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 ch2 = *s++;
5278 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005279 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5281 *p++ = '\\';
5282 *p++ = 'U';
5283 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5284 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5285 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5286 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5287 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5288 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5289 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5290 *p++ = hexdigits[ucs & 0x0000000F];
5291 continue;
5292 }
5293 /* Fall through: isolated surrogates are copied as-is */
5294 s--;
5295 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005296 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005297#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005300 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 *p++ = '\\';
5302 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005303 *p++ = hexdigits[(ch >> 12) & 0x000F];
5304 *p++ = hexdigits[(ch >> 8) & 0x000F];
5305 *p++ = hexdigits[(ch >> 4) & 0x000F];
5306 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005308
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005309 /* Map special whitespace to '\t', \n', '\r' */
5310 else if (ch == '\t') {
5311 *p++ = '\\';
5312 *p++ = 't';
5313 }
5314 else if (ch == '\n') {
5315 *p++ = '\\';
5316 *p++ = 'n';
5317 }
5318 else if (ch == '\r') {
5319 *p++ = '\\';
5320 *p++ = 'r';
5321 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005322
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005323 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005324 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005326 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005327 *p++ = hexdigits[(ch >> 4) & 0x000F];
5328 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005329 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 /* Copy everything else as-is */
5332 else
5333 *p++ = (char) ch;
5334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005336 assert(p - PyBytes_AS_STRING(repr) > 0);
5337 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5338 return NULL;
5339 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340}
5341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
5343PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005345 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 if (!PyUnicode_Check(unicode)) {
5347 PyErr_BadArgument();
5348 return NULL;
5349 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005350 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5351 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005352 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353}
5354
5355/* --- Raw Unicode Escape Codec ------------------------------------------- */
5356
Alexander Belopolsky40018472011-02-26 01:02:56 +00005357PyObject *
5358PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005359 Py_ssize_t size,
5360 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363 Py_ssize_t startinpos;
5364 Py_ssize_t endinpos;
5365 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 const char *end;
5369 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 PyObject *errorHandler = NULL;
5371 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005372
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 /* Escaped strings will always be longer than the resulting
5374 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 length after conversion to the true value. (But decoding error
5376 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 v = _PyUnicode_New(size);
5378 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 end = s + size;
5384 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 unsigned char c;
5386 Py_UCS4 x;
5387 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005388 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 /* Non-escape characters are interpreted as Unicode ordinals */
5391 if (*s != '\\') {
5392 *p++ = (unsigned char)*s++;
5393 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005394 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 startinpos = s-starts;
5396
5397 /* \u-escapes are only interpreted iff the number of leading
5398 backslashes if odd */
5399 bs = s;
5400 for (;s < end;) {
5401 if (*s != '\\')
5402 break;
5403 *p++ = (unsigned char)*s++;
5404 }
5405 if (((s - bs) & 1) == 0 ||
5406 s >= end ||
5407 (*s != 'u' && *s != 'U')) {
5408 continue;
5409 }
5410 p--;
5411 count = *s=='u' ? 4 : 8;
5412 s++;
5413
5414 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5415 outpos = p-PyUnicode_AS_UNICODE(v);
5416 for (x = 0, i = 0; i < count; ++i, ++s) {
5417 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005418 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 endinpos = s-starts;
5420 if (unicode_decode_call_errorhandler(
5421 errors, &errorHandler,
5422 "rawunicodeescape", "truncated \\uXXXX",
5423 &starts, &end, &startinpos, &endinpos, &exc, &s,
5424 &v, &outpos, &p))
5425 goto onError;
5426 goto nextByte;
5427 }
5428 x = (x<<4) & ~0xF;
5429 if (c >= '0' && c <= '9')
5430 x += c - '0';
5431 else if (c >= 'a' && c <= 'f')
5432 x += 10 + c - 'a';
5433 else
5434 x += 10 + c - 'A';
5435 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005436 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 /* UCS-2 character */
5438 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005439 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 /* UCS-4 character. Either store directly, or as
5441 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005442#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005444#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 x -= 0x10000L;
5446 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5447 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005448#endif
5449 } else {
5450 endinpos = s-starts;
5451 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005452 if (unicode_decode_call_errorhandler(
5453 errors, &errorHandler,
5454 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 &starts, &end, &startinpos, &endinpos, &exc, &s,
5456 &v, &outpos, &p))
5457 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 nextByte:
5460 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005462 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 Py_XDECREF(errorHandler);
5465 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 if (PyUnicode_READY(v) == -1) {
5467 Py_DECREF(v);
5468 return NULL;
5469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005471
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 Py_XDECREF(errorHandler);
5475 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 return NULL;
5477}
5478
Alexander Belopolsky40018472011-02-26 01:02:56 +00005479PyObject *
5480PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 char *p;
5485 char *q;
5486
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005487#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005488 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005489#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005490 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005491#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005492
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005493 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005495
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005496 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 if (repr == NULL)
5498 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005499 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005500 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 while (size-- > 0) {
5504 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005505#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 /* Map 32-bit characters to '\Uxxxxxxxx' */
5507 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005508 *p++ = '\\';
5509 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005510 *p++ = hexdigits[(ch >> 28) & 0xf];
5511 *p++ = hexdigits[(ch >> 24) & 0xf];
5512 *p++ = hexdigits[(ch >> 20) & 0xf];
5513 *p++ = hexdigits[(ch >> 16) & 0xf];
5514 *p++ = hexdigits[(ch >> 12) & 0xf];
5515 *p++ = hexdigits[(ch >> 8) & 0xf];
5516 *p++ = hexdigits[(ch >> 4) & 0xf];
5517 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005518 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005519 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005520#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5522 if (ch >= 0xD800 && ch < 0xDC00) {
5523 Py_UNICODE ch2;
5524 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005525
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 ch2 = *s++;
5527 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005528 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5530 *p++ = '\\';
5531 *p++ = 'U';
5532 *p++ = hexdigits[(ucs >> 28) & 0xf];
5533 *p++ = hexdigits[(ucs >> 24) & 0xf];
5534 *p++ = hexdigits[(ucs >> 20) & 0xf];
5535 *p++ = hexdigits[(ucs >> 16) & 0xf];
5536 *p++ = hexdigits[(ucs >> 12) & 0xf];
5537 *p++ = hexdigits[(ucs >> 8) & 0xf];
5538 *p++ = hexdigits[(ucs >> 4) & 0xf];
5539 *p++ = hexdigits[ucs & 0xf];
5540 continue;
5541 }
5542 /* Fall through: isolated surrogates are copied as-is */
5543 s--;
5544 size++;
5545 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005546#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 /* Map 16-bit characters to '\uxxxx' */
5548 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 *p++ = '\\';
5550 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005551 *p++ = hexdigits[(ch >> 12) & 0xf];
5552 *p++ = hexdigits[(ch >> 8) & 0xf];
5553 *p++ = hexdigits[(ch >> 4) & 0xf];
5554 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* Copy everything else as-is */
5557 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 *p++ = (char) ch;
5559 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005560 size = p - q;
5561
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005562 assert(size > 0);
5563 if (_PyBytes_Resize(&repr, size) < 0)
5564 return NULL;
5565 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566}
5567
Alexander Belopolsky40018472011-02-26 01:02:56 +00005568PyObject *
5569PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005571 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005573 PyErr_BadArgument();
5574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005576 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5577 PyUnicode_GET_SIZE(unicode));
5578
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005579 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580}
5581
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005582/* --- Unicode Internal Codec ------------------------------------------- */
5583
Alexander Belopolsky40018472011-02-26 01:02:56 +00005584PyObject *
5585_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005586 Py_ssize_t size,
5587 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005588{
5589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005590 Py_ssize_t startinpos;
5591 Py_ssize_t endinpos;
5592 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005593 PyUnicodeObject *v;
5594 Py_UNICODE *p;
5595 const char *end;
5596 const char *reason;
5597 PyObject *errorHandler = NULL;
5598 PyObject *exc = NULL;
5599
Neal Norwitzd43069c2006-01-08 01:12:10 +00005600#ifdef Py_UNICODE_WIDE
5601 Py_UNICODE unimax = PyUnicode_GetMax();
5602#endif
5603
Thomas Wouters89f507f2006-12-13 04:49:30 +00005604 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005605 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5606 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5609 as string was created with the old API. */
5610 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005612 p = PyUnicode_AS_UNICODE(v);
5613 end = s + size;
5614
5615 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005616 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005617 /* We have to sanity check the raw data, otherwise doom looms for
5618 some malformed UCS-4 data. */
5619 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005620#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005621 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005622#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005623 end-s < Py_UNICODE_SIZE
5624 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005626 startinpos = s - starts;
5627 if (end-s < Py_UNICODE_SIZE) {
5628 endinpos = end-starts;
5629 reason = "truncated input";
5630 }
5631 else {
5632 endinpos = s - starts + Py_UNICODE_SIZE;
5633 reason = "illegal code point (> 0x10FFFF)";
5634 }
5635 outpos = p - PyUnicode_AS_UNICODE(v);
5636 if (unicode_decode_call_errorhandler(
5637 errors, &errorHandler,
5638 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005639 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005640 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005641 goto onError;
5642 }
5643 }
5644 else {
5645 p++;
5646 s += Py_UNICODE_SIZE;
5647 }
5648 }
5649
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005650 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005651 goto onError;
5652 Py_XDECREF(errorHandler);
5653 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 if (PyUnicode_READY(v) == -1) {
5655 Py_DECREF(v);
5656 return NULL;
5657 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005658 return (PyObject *)v;
5659
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005661 Py_XDECREF(v);
5662 Py_XDECREF(errorHandler);
5663 Py_XDECREF(exc);
5664 return NULL;
5665}
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667/* --- Latin-1 Codec ------------------------------------------------------ */
5668
Alexander Belopolsky40018472011-02-26 01:02:56 +00005669PyObject *
5670PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005671 Py_ssize_t size,
5672 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005675 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676}
5677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005679static void
5680make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005681 const char *encoding,
5682 const Py_UNICODE *unicode, Py_ssize_t size,
5683 Py_ssize_t startpos, Py_ssize_t endpos,
5684 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 *exceptionObject = PyUnicodeEncodeError_Create(
5688 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 }
5690 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5692 goto onError;
5693 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5694 goto onError;
5695 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5696 goto onError;
5697 return;
5698 onError:
5699 Py_DECREF(*exceptionObject);
5700 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 }
5702}
5703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005705static void
5706raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005707 const char *encoding,
5708 const Py_UNICODE *unicode, Py_ssize_t size,
5709 Py_ssize_t startpos, Py_ssize_t endpos,
5710 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711{
5712 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716}
5717
5718/* error handling callback helper:
5719 build arguments, call the callback and check the arguments,
5720 put the result into newpos and return the replacement string, which
5721 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005722static PyObject *
5723unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005724 PyObject **errorHandler,
5725 const char *encoding, const char *reason,
5726 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5727 Py_ssize_t startpos, Py_ssize_t endpos,
5728 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005730 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731
5732 PyObject *restuple;
5733 PyObject *resunicode;
5734
5735 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 }
5740
5741 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745
5746 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005751 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 Py_DECREF(restuple);
5753 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005755 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 &resunicode, newpos)) {
5757 Py_DECREF(restuple);
5758 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005760 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5761 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5762 Py_DECREF(restuple);
5763 return NULL;
5764 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005767 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5769 Py_DECREF(restuple);
5770 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005771 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 Py_INCREF(resunicode);
5773 Py_DECREF(restuple);
5774 return resunicode;
5775}
5776
Alexander Belopolsky40018472011-02-26 01:02:56 +00005777static PyObject *
5778unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005779 Py_ssize_t size,
5780 const char *errors,
5781 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782{
5783 /* output object */
5784 PyObject *res;
5785 /* pointers to the beginning and end+1 of input */
5786 const Py_UNICODE *startp = p;
5787 const Py_UNICODE *endp = p + size;
5788 /* pointer to the beginning of the unencodable characters */
5789 /* const Py_UNICODE *badp = NULL; */
5790 /* pointer into the output */
5791 char *str;
5792 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005794 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5795 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 PyObject *errorHandler = NULL;
5797 PyObject *exc = NULL;
5798 /* the following variable is used for caching string comparisons
5799 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5800 int known_errorHandler = -1;
5801
5802 /* allocate enough for a simple encoding without
5803 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005804 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005805 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005806 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005808 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005809 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 ressize = size;
5811
5812 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 /* can we encode this? */
5816 if (c<limit) {
5817 /* no overflow check, because we know that the space is enough */
5818 *str++ = (char)c;
5819 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005820 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 else {
5822 Py_ssize_t unicodepos = p-startp;
5823 Py_ssize_t requiredsize;
5824 PyObject *repunicode;
5825 Py_ssize_t repsize;
5826 Py_ssize_t newpos;
5827 Py_ssize_t respos;
5828 Py_UNICODE *uni2;
5829 /* startpos for collecting unencodable chars */
5830 const Py_UNICODE *collstart = p;
5831 const Py_UNICODE *collend = p;
5832 /* find all unecodable characters */
5833 while ((collend < endp) && ((*collend)>=limit))
5834 ++collend;
5835 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5836 if (known_errorHandler==-1) {
5837 if ((errors==NULL) || (!strcmp(errors, "strict")))
5838 known_errorHandler = 1;
5839 else if (!strcmp(errors, "replace"))
5840 known_errorHandler = 2;
5841 else if (!strcmp(errors, "ignore"))
5842 known_errorHandler = 3;
5843 else if (!strcmp(errors, "xmlcharrefreplace"))
5844 known_errorHandler = 4;
5845 else
5846 known_errorHandler = 0;
5847 }
5848 switch (known_errorHandler) {
5849 case 1: /* strict */
5850 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5851 goto onError;
5852 case 2: /* replace */
5853 while (collstart++<collend)
5854 *str++ = '?'; /* fall through */
5855 case 3: /* ignore */
5856 p = collend;
5857 break;
5858 case 4: /* xmlcharrefreplace */
5859 respos = str - PyBytes_AS_STRING(res);
5860 /* determine replacement size (temporarily (mis)uses p) */
5861 for (p = collstart, repsize = 0; p < collend; ++p) {
5862 if (*p<10)
5863 repsize += 2+1+1;
5864 else if (*p<100)
5865 repsize += 2+2+1;
5866 else if (*p<1000)
5867 repsize += 2+3+1;
5868 else if (*p<10000)
5869 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005870#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 else
5872 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005873#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 else if (*p<100000)
5875 repsize += 2+5+1;
5876 else if (*p<1000000)
5877 repsize += 2+6+1;
5878 else
5879 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005880#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 }
5882 requiredsize = respos+repsize+(endp-collend);
5883 if (requiredsize > ressize) {
5884 if (requiredsize<2*ressize)
5885 requiredsize = 2*ressize;
5886 if (_PyBytes_Resize(&res, requiredsize))
5887 goto onError;
5888 str = PyBytes_AS_STRING(res) + respos;
5889 ressize = requiredsize;
5890 }
5891 /* generate replacement (temporarily (mis)uses p) */
5892 for (p = collstart; p < collend; ++p) {
5893 str += sprintf(str, "&#%d;", (int)*p);
5894 }
5895 p = collend;
5896 break;
5897 default:
5898 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5899 encoding, reason, startp, size, &exc,
5900 collstart-startp, collend-startp, &newpos);
5901 if (repunicode == NULL)
5902 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005903 if (PyBytes_Check(repunicode)) {
5904 /* Directly copy bytes result to output. */
5905 repsize = PyBytes_Size(repunicode);
5906 if (repsize > 1) {
5907 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005908 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005909 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5910 Py_DECREF(repunicode);
5911 goto onError;
5912 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005913 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005914 ressize += repsize-1;
5915 }
5916 memcpy(str, PyBytes_AsString(repunicode), repsize);
5917 str += repsize;
5918 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005919 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005920 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005921 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* need more space? (at least enough for what we
5923 have+the replacement+the rest of the string, so
5924 we won't have to check space for encodable characters) */
5925 respos = str - PyBytes_AS_STRING(res);
5926 repsize = PyUnicode_GET_SIZE(repunicode);
5927 requiredsize = respos+repsize+(endp-collend);
5928 if (requiredsize > ressize) {
5929 if (requiredsize<2*ressize)
5930 requiredsize = 2*ressize;
5931 if (_PyBytes_Resize(&res, requiredsize)) {
5932 Py_DECREF(repunicode);
5933 goto onError;
5934 }
5935 str = PyBytes_AS_STRING(res) + respos;
5936 ressize = requiredsize;
5937 }
5938 /* check if there is anything unencodable in the replacement
5939 and copy it to the output */
5940 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5941 c = *uni2;
5942 if (c >= limit) {
5943 raise_encode_exception(&exc, encoding, startp, size,
5944 unicodepos, unicodepos+1, reason);
5945 Py_DECREF(repunicode);
5946 goto onError;
5947 }
5948 *str = (char)c;
5949 }
5950 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005951 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005952 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005953 }
5954 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005955 /* Resize if we allocated to much */
5956 size = str - PyBytes_AS_STRING(res);
5957 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005958 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 if (_PyBytes_Resize(&res, size) < 0)
5960 goto onError;
5961 }
5962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 Py_XDECREF(errorHandler);
5964 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005965 return res;
5966
5967 onError:
5968 Py_XDECREF(res);
5969 Py_XDECREF(errorHandler);
5970 Py_XDECREF(exc);
5971 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972}
5973
Alexander Belopolsky40018472011-02-26 01:02:56 +00005974PyObject *
5975PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005976 Py_ssize_t size,
5977 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980}
5981
Alexander Belopolsky40018472011-02-26 01:02:56 +00005982PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984{
5985 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 PyErr_BadArgument();
5987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989 if (PyUnicode_READY(unicode) == -1)
5990 return NULL;
5991 /* Fast path: if it is a one-byte string, construct
5992 bytes object directly. */
5993 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5994 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5995 PyUnicode_GET_LENGTH(unicode));
5996 /* Non-Latin-1 characters present. Defer to above function to
5997 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006000 errors);
6001}
6002
6003PyObject*
6004PyUnicode_AsLatin1String(PyObject *unicode)
6005{
6006 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007}
6008
6009/* --- 7-bit ASCII Codec -------------------------------------------------- */
6010
Alexander Belopolsky40018472011-02-26 01:02:56 +00006011PyObject *
6012PyUnicode_DecodeASCII(const char *s,
6013 Py_ssize_t size,
6014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 PyUnicodeObject *v;
6018 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 Py_ssize_t startinpos;
6020 Py_ssize_t endinpos;
6021 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006023 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 PyObject *errorHandler = NULL;
6025 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006026 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006027
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006029 if (size == 1 && *(unsigned char*)s < 128)
6030 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6031
6032 /* Fast path. Assume the input actually *is* ASCII, and allocate
6033 a single-block Unicode object with that assumption. If there is
6034 an error, drop the object and start over. */
6035 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6036 if (v == NULL)
6037 goto onError;
6038 d = PyUnicode_1BYTE_DATA(v);
6039 for (i = 0; i < size; i++) {
6040 unsigned char ch = ((unsigned char*)s)[i];
6041 if (ch < 128)
6042 d[i] = ch;
6043 else
6044 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006046 if (i == size)
6047 return (PyObject*)v;
6048 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006049
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 v = _PyUnicode_New(size);
6051 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 e = s + size;
6057 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 register unsigned char c = (unsigned char)*s;
6059 if (c < 128) {
6060 *p++ = c;
6061 ++s;
6062 }
6063 else {
6064 startinpos = s-starts;
6065 endinpos = startinpos + 1;
6066 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6067 if (unicode_decode_call_errorhandler(
6068 errors, &errorHandler,
6069 "ascii", "ordinal not in range(128)",
6070 &starts, &e, &startinpos, &endinpos, &exc, &s,
6071 &v, &outpos, &p))
6072 goto onError;
6073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006075 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6077 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 Py_XDECREF(errorHandler);
6079 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006080 if (PyUnicode_READY(v) == -1) {
6081 Py_DECREF(v);
6082 return NULL;
6083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006085
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088 Py_XDECREF(errorHandler);
6089 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 return NULL;
6091}
6092
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093PyObject *
6094PyUnicode_EncodeASCII(const Py_UNICODE *p,
6095 Py_ssize_t size,
6096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099}
6100
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
6104 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 PyErr_BadArgument();
6106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006108 if (PyUnicode_READY(unicode) == -1)
6109 return NULL;
6110 /* Fast path: if it is an ASCII-only string, construct bytes object
6111 directly. Else defer to above function to raise the exception. */
6112 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6113 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6114 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006117 errors);
6118}
6119
6120PyObject *
6121PyUnicode_AsASCIIString(PyObject *unicode)
6122{
6123 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124}
6125
Victor Stinner99b95382011-07-04 14:23:54 +02006126#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006127
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006128/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006129
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006130#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006131#define NEED_RETRY
6132#endif
6133
6134/* XXX This code is limited to "true" double-byte encodings, as
6135 a) it assumes an incomplete character consists of a single byte, and
6136 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006138
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte(); it is then accepted when
   CharPrev() cannot step back from it, when the previous character does
   not start with a lead byte, or when the previous character is two bytes
   wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6151/*
6152 * Decode MBCS string into unicode object. If 'final' is set, converts
6153 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6154 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006155static int
6156decode_mbcs(PyUnicodeObject **v,
6157 const char *s, /* MBCS string */
6158 int size, /* sizeof MBCS string */
6159 int final,
6160 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006161{
6162 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006163 Py_ssize_t n;
6164 DWORD usize;
6165 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006166
6167 assert(size >= 0);
6168
Victor Stinner554f3f02010-06-16 23:33:54 +00006169 /* check and handle 'errors' arg */
6170 if (errors==NULL || strcmp(errors, "strict")==0)
6171 flags = MB_ERR_INVALID_CHARS;
6172 else if (strcmp(errors, "ignore")==0)
6173 flags = 0;
6174 else {
6175 PyErr_Format(PyExc_ValueError,
6176 "mbcs encoding does not support errors='%s'",
6177 errors);
6178 return -1;
6179 }
6180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006181 /* Skip trailing lead-byte unless 'final' is set */
6182 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006184
6185 /* First get the size of the result */
6186 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006187 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6188 if (usize==0)
6189 goto mbcs_decode_error;
6190 } else
6191 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006192
6193 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 /* Create unicode object */
6195 *v = _PyUnicode_New(usize);
6196 if (*v == NULL)
6197 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006198 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006199 }
6200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* Extend unicode object */
6202 n = PyUnicode_GET_SIZE(*v);
6203 if (_PyUnicode_Resize(v, n + usize) < 0)
6204 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006205 }
6206
6207 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006208 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006210 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6211 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006213 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006214 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006215
6216mbcs_decode_error:
6217 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6218 we raise a UnicodeDecodeError - else it is a 'generic'
6219 windows error
6220 */
6221 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6222 /* Ideally, we should get reason from FormatMessage - this
6223 is the Windows 2000 English version of the message
6224 */
6225 PyObject *exc = NULL;
6226 const char *reason = "No mapping for the Unicode character exists "
6227 "in the target multi-byte code page.";
6228 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6229 if (exc != NULL) {
6230 PyCodec_StrictErrors(exc);
6231 Py_DECREF(exc);
6232 }
6233 } else {
6234 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6235 }
6236 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006237}
6238
Alexander Belopolsky40018472011-02-26 01:02:56 +00006239PyObject *
6240PyUnicode_DecodeMBCSStateful(const char *s,
6241 Py_ssize_t size,
6242 const char *errors,
6243 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006244{
6245 PyUnicodeObject *v = NULL;
6246 int done;
6247
6248 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006250
6251#ifdef NEED_RETRY
6252 retry:
6253 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006254 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006255 else
6256#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006257 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006258
6259 if (done < 0) {
6260 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006262 }
6263
6264 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006266
6267#ifdef NEED_RETRY
6268 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 s += done;
6270 size -= done;
6271 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006272 }
6273#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006274 if (PyUnicode_READY(v) == -1) {
6275 Py_DECREF(v);
6276 return NULL;
6277 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006278 return (PyObject *)v;
6279}
6280
Alexander Belopolsky40018472011-02-26 01:02:56 +00006281PyObject *
6282PyUnicode_DecodeMBCS(const char *s,
6283 Py_ssize_t size,
6284 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006285{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006286 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6287}
6288
6289/*
6290 * Convert unicode into string object (MBCS).
6291 * Returns 0 if succeed, -1 otherwise.
6292 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006293static int
6294encode_mbcs(PyObject **repr,
6295 const Py_UNICODE *p, /* unicode */
6296 int size, /* size of unicode */
6297 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006298{
Victor Stinner554f3f02010-06-16 23:33:54 +00006299 BOOL usedDefaultChar = FALSE;
6300 BOOL *pusedDefaultChar;
6301 int mbcssize;
6302 Py_ssize_t n;
6303 PyObject *exc = NULL;
6304 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006305
6306 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006307
Victor Stinner554f3f02010-06-16 23:33:54 +00006308 /* check and handle 'errors' arg */
6309 if (errors==NULL || strcmp(errors, "strict")==0) {
6310 flags = WC_NO_BEST_FIT_CHARS;
6311 pusedDefaultChar = &usedDefaultChar;
6312 } else if (strcmp(errors, "replace")==0) {
6313 flags = 0;
6314 pusedDefaultChar = NULL;
6315 } else {
6316 PyErr_Format(PyExc_ValueError,
6317 "mbcs encoding does not support errors='%s'",
6318 errors);
6319 return -1;
6320 }
6321
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006322 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006323 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006324 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6325 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 if (mbcssize == 0) {
6327 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6328 return -1;
6329 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006330 /* If we used a default char, then we failed! */
6331 if (pusedDefaultChar && *pusedDefaultChar)
6332 goto mbcs_encode_error;
6333 } else {
6334 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006335 }
6336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006337 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 /* Create string object */
6339 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6340 if (*repr == NULL)
6341 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006342 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006343 }
6344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 /* Extend string object */
6346 n = PyBytes_Size(*repr);
6347 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6348 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006349 }
6350
6351 /* Do the conversion */
6352 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006354 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6355 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6357 return -1;
6358 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006359 if (pusedDefaultChar && *pusedDefaultChar)
6360 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006361 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006362 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006363
6364mbcs_encode_error:
6365 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6366 Py_XDECREF(exc);
6367 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006368}
6369
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyObject *
6371PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6372 Py_ssize_t size,
6373 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006374{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006375 PyObject *repr = NULL;
6376 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006377
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006378#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006380 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006381 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006382 else
6383#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006384 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006385
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006386 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 Py_XDECREF(repr);
6388 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006389 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006390
6391#ifdef NEED_RETRY
6392 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 p += INT_MAX;
6394 size -= INT_MAX;
6395 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006396 }
6397#endif
6398
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006399 return repr;
6400}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006401
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402PyObject *
6403PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006404{
6405 if (!PyUnicode_Check(unicode)) {
6406 PyErr_BadArgument();
6407 return NULL;
6408 }
6409 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 PyUnicode_GET_SIZE(unicode),
6411 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006412}
6413
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006414#undef NEED_RETRY
6415
Victor Stinner99b95382011-07-04 14:23:54 +02006416#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418/* --- Character Mapping Codec -------------------------------------------- */
6419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
6421PyUnicode_DecodeCharmap(const char *s,
6422 Py_ssize_t size,
6423 PyObject *mapping,
6424 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t startinpos;
6428 Py_ssize_t endinpos;
6429 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 PyUnicodeObject *v;
6432 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006433 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 PyObject *errorHandler = NULL;
6435 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006436 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006437 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006438
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 /* Default to Latin-1 */
6440 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443 v = _PyUnicode_New(size);
6444 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006450 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 mapstring = PyUnicode_AS_UNICODE(mapping);
6452 maplen = PyUnicode_GET_SIZE(mapping);
6453 while (s < e) {
6454 unsigned char ch = *s;
6455 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 if (ch < maplen)
6458 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 if (x == 0xfffe) {
6461 /* undefined mapping */
6462 outpos = p-PyUnicode_AS_UNICODE(v);
6463 startinpos = s-starts;
6464 endinpos = startinpos+1;
6465 if (unicode_decode_call_errorhandler(
6466 errors, &errorHandler,
6467 "charmap", "character maps to <undefined>",
6468 &starts, &e, &startinpos, &endinpos, &exc, &s,
6469 &v, &outpos, &p)) {
6470 goto onError;
6471 }
6472 continue;
6473 }
6474 *p++ = x;
6475 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006476 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006477 }
6478 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 while (s < e) {
6480 unsigned char ch = *s;
6481 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6484 w = PyLong_FromLong((long)ch);
6485 if (w == NULL)
6486 goto onError;
6487 x = PyObject_GetItem(mapping, w);
6488 Py_DECREF(w);
6489 if (x == NULL) {
6490 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6491 /* No mapping found means: mapping is undefined. */
6492 PyErr_Clear();
6493 x = Py_None;
6494 Py_INCREF(x);
6495 } else
6496 goto onError;
6497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* Apply mapping */
6500 if (PyLong_Check(x)) {
6501 long value = PyLong_AS_LONG(x);
6502 if (value < 0 || value > 65535) {
6503 PyErr_SetString(PyExc_TypeError,
6504 "character mapping must be in range(65536)");
6505 Py_DECREF(x);
6506 goto onError;
6507 }
6508 *p++ = (Py_UNICODE)value;
6509 }
6510 else if (x == Py_None) {
6511 /* undefined mapping */
6512 outpos = p-PyUnicode_AS_UNICODE(v);
6513 startinpos = s-starts;
6514 endinpos = startinpos+1;
6515 if (unicode_decode_call_errorhandler(
6516 errors, &errorHandler,
6517 "charmap", "character maps to <undefined>",
6518 &starts, &e, &startinpos, &endinpos, &exc, &s,
6519 &v, &outpos, &p)) {
6520 Py_DECREF(x);
6521 goto onError;
6522 }
6523 Py_DECREF(x);
6524 continue;
6525 }
6526 else if (PyUnicode_Check(x)) {
6527 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006528
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 if (targetsize == 1)
6530 /* 1-1 mapping */
6531 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006532
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 else if (targetsize > 1) {
6534 /* 1-n mapping */
6535 if (targetsize > extrachars) {
6536 /* resize first */
6537 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6538 Py_ssize_t needed = (targetsize - extrachars) + \
6539 (targetsize << 2);
6540 extrachars += needed;
6541 /* XXX overflow detection missing */
6542 if (_PyUnicode_Resize(&v,
6543 PyUnicode_GET_SIZE(v) + needed) < 0) {
6544 Py_DECREF(x);
6545 goto onError;
6546 }
6547 p = PyUnicode_AS_UNICODE(v) + oldpos;
6548 }
6549 Py_UNICODE_COPY(p,
6550 PyUnicode_AS_UNICODE(x),
6551 targetsize);
6552 p += targetsize;
6553 extrachars -= targetsize;
6554 }
6555 /* 1-0 mapping: skip the character */
6556 }
6557 else {
6558 /* wrong return value */
6559 PyErr_SetString(PyExc_TypeError,
6560 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006561 Py_DECREF(x);
6562 goto onError;
6563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 Py_DECREF(x);
6565 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 }
6568 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6570 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006571 Py_XDECREF(errorHandler);
6572 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006573 if (PyUnicode_READY(v) == -1) {
6574 Py_DECREF(v);
6575 return NULL;
6576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006578
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 Py_XDECREF(v);
6583 return NULL;
6584}
6585
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006586/* Charmap encoding: the lookup table */
6587
Alexander Belopolsky40018472011-02-26 01:02:56 +00006588struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 PyObject_HEAD
6590 unsigned char level1[32];
6591 int count2, count3;
6592 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006593};
6594
6595static PyObject*
6596encoding_map_size(PyObject *obj, PyObject* args)
6597{
6598 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006599 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006601}
6602
6603static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 PyDoc_STR("Return the size (in bytes) of this object") },
6606 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006607};
6608
6609static void
6610encoding_map_dealloc(PyObject* o)
6611{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006612 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006613}
6614
6615static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 "EncodingMap", /*tp_name*/
6618 sizeof(struct encoding_map), /*tp_basicsize*/
6619 0, /*tp_itemsize*/
6620 /* methods */
6621 encoding_map_dealloc, /*tp_dealloc*/
6622 0, /*tp_print*/
6623 0, /*tp_getattr*/
6624 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006625 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 0, /*tp_repr*/
6627 0, /*tp_as_number*/
6628 0, /*tp_as_sequence*/
6629 0, /*tp_as_mapping*/
6630 0, /*tp_hash*/
6631 0, /*tp_call*/
6632 0, /*tp_str*/
6633 0, /*tp_getattro*/
6634 0, /*tp_setattro*/
6635 0, /*tp_as_buffer*/
6636 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6637 0, /*tp_doc*/
6638 0, /*tp_traverse*/
6639 0, /*tp_clear*/
6640 0, /*tp_richcompare*/
6641 0, /*tp_weaklistoffset*/
6642 0, /*tp_iter*/
6643 0, /*tp_iternext*/
6644 encoding_map_methods, /*tp_methods*/
6645 0, /*tp_members*/
6646 0, /*tp_getset*/
6647 0, /*tp_base*/
6648 0, /*tp_dict*/
6649 0, /*tp_descr_get*/
6650 0, /*tp_descr_set*/
6651 0, /*tp_dictoffset*/
6652 0, /*tp_init*/
6653 0, /*tp_alloc*/
6654 0, /*tp_new*/
6655 0, /*tp_free*/
6656 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006657};
6658
6659PyObject*
6660PyUnicode_BuildEncodingMap(PyObject* string)
6661{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006662 PyObject *result;
6663 struct encoding_map *mresult;
6664 int i;
6665 int need_dict = 0;
6666 unsigned char level1[32];
6667 unsigned char level2[512];
6668 unsigned char *mlevel1, *mlevel2, *mlevel3;
6669 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670 int kind;
6671 void *data;
6672 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006674 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006675 PyErr_BadArgument();
6676 return NULL;
6677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678 kind = PyUnicode_KIND(string);
6679 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006680 memset(level1, 0xFF, sizeof level1);
6681 memset(level2, 0xFF, sizeof level2);
6682
6683 /* If there isn't a one-to-one mapping of NULL to \0,
6684 or if there are non-BMP characters, we need to use
6685 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006687 need_dict = 1;
6688 for (i = 1; i < 256; i++) {
6689 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006690 ch = PyUnicode_READ(kind, data, i);
6691 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006692 need_dict = 1;
6693 break;
6694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006695 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006696 /* unmapped character */
6697 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006698 l1 = ch >> 11;
6699 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006700 if (level1[l1] == 0xFF)
6701 level1[l1] = count2++;
6702 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006704 }
6705
6706 if (count2 >= 0xFF || count3 >= 0xFF)
6707 need_dict = 1;
6708
6709 if (need_dict) {
6710 PyObject *result = PyDict_New();
6711 PyObject *key, *value;
6712 if (!result)
6713 return NULL;
6714 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006715 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006716 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006717 if (!key || !value)
6718 goto failed1;
6719 if (PyDict_SetItem(result, key, value) == -1)
6720 goto failed1;
6721 Py_DECREF(key);
6722 Py_DECREF(value);
6723 }
6724 return result;
6725 failed1:
6726 Py_XDECREF(key);
6727 Py_XDECREF(value);
6728 Py_DECREF(result);
6729 return NULL;
6730 }
6731
6732 /* Create a three-level trie */
6733 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6734 16*count2 + 128*count3 - 1);
6735 if (!result)
6736 return PyErr_NoMemory();
6737 PyObject_Init(result, &EncodingMapType);
6738 mresult = (struct encoding_map*)result;
6739 mresult->count2 = count2;
6740 mresult->count3 = count3;
6741 mlevel1 = mresult->level1;
6742 mlevel2 = mresult->level23;
6743 mlevel3 = mresult->level23 + 16*count2;
6744 memcpy(mlevel1, level1, 32);
6745 memset(mlevel2, 0xFF, 16*count2);
6746 memset(mlevel3, 0, 128*count3);
6747 count3 = 0;
6748 for (i = 1; i < 256; i++) {
6749 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006751 /* unmapped character */
6752 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006753 o1 = PyUnicode_READ(kind, data, i)>>11;
6754 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006755 i2 = 16*mlevel1[o1] + o2;
6756 if (mlevel2[i2] == 0xFF)
6757 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006758 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006759 i3 = 128*mlevel2[i2] + o3;
6760 mlevel3[i3] = i;
6761 }
6762 return result;
6763}
6764
6765static int
6766encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6767{
6768 struct encoding_map *map = (struct encoding_map*)mapping;
6769 int l1 = c>>11;
6770 int l2 = (c>>7) & 0xF;
6771 int l3 = c & 0x7F;
6772 int i;
6773
6774#ifdef Py_UNICODE_WIDE
6775 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006777 }
6778#endif
6779 if (c == 0)
6780 return 0;
6781 /* level 1*/
6782 i = map->level1[l1];
6783 if (i == 0xFF) {
6784 return -1;
6785 }
6786 /* level 2*/
6787 i = map->level23[16*i+l2];
6788 if (i == 0xFF) {
6789 return -1;
6790 }
6791 /* level 3 */
6792 i = map->level23[16*map->count2 + 128*i + l3];
6793 if (i == 0) {
6794 return -1;
6795 }
6796 return i;
6797}
6798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799/* Lookup the character ch in the mapping. If the character
6800 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006801 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006802static PyObject *
6803charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Christian Heimes217cfd12007-12-02 14:31:20 +00006805 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 PyObject *x;
6807
6808 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 x = PyObject_GetItem(mapping, w);
6811 Py_DECREF(w);
6812 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6814 /* No mapping found means: mapping is undefined. */
6815 PyErr_Clear();
6816 x = Py_None;
6817 Py_INCREF(x);
6818 return x;
6819 } else
6820 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006822 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006824 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 long value = PyLong_AS_LONG(x);
6826 if (value < 0 || value > 255) {
6827 PyErr_SetString(PyExc_TypeError,
6828 "character mapping must be in range(256)");
6829 Py_DECREF(x);
6830 return NULL;
6831 }
6832 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006834 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 /* wrong return value */
6838 PyErr_Format(PyExc_TypeError,
6839 "character mapping must return integer, bytes or None, not %.400s",
6840 x->ob_type->tp_name);
6841 Py_DECREF(x);
6842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 }
6844}
6845
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006846static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006847charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006849 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6850 /* exponentially overallocate to minimize reallocations */
6851 if (requiredsize < 2*outsize)
6852 requiredsize = 2*outsize;
6853 if (_PyBytes_Resize(outobj, requiredsize))
6854 return -1;
6855 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006856}
6857
Benjamin Peterson14339b62009-01-31 16:36:08 +00006858typedef enum charmapencode_result {
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
Alexander Belopolsky40018472011-02-26 01:02:56 +00006860} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006862 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 space is available. Return a new reference to the object that
6864 was put in the output buffer, or Py_None, if the mapping was undefined
6865 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006866 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867static charmapencode_result
6868charmapencode_output(Py_UNICODE c, PyObject *mapping,
6869 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006871 PyObject *rep;
6872 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006873 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874
Christian Heimes90aa7642007-12-19 02:45:37 +00006875 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006876 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006878 if (res == -1)
6879 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 if (outsize<requiredsize)
6881 if (charmapencode_resize(outobj, outpos, requiredsize))
6882 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006883 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 outstart[(*outpos)++] = (char)res;
6885 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006886 }
6887
6888 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006891 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 Py_DECREF(rep);
6893 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006894 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 if (PyLong_Check(rep)) {
6896 Py_ssize_t requiredsize = *outpos+1;
6897 if (outsize<requiredsize)
6898 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6899 Py_DECREF(rep);
6900 return enc_EXCEPTION;
6901 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006902 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006904 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 else {
6906 const char *repchars = PyBytes_AS_STRING(rep);
6907 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6908 Py_ssize_t requiredsize = *outpos+repsize;
6909 if (outsize<requiredsize)
6910 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6911 Py_DECREF(rep);
6912 return enc_EXCEPTION;
6913 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006914 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 memcpy(outstart + *outpos, repchars, repsize);
6916 *outpos += repsize;
6917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006918 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006919 Py_DECREF(rep);
6920 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921}
6922
6923/* handle an error in PyUnicode_EncodeCharmap
6924 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925static int
6926charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006929 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006930 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931{
6932 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006933 Py_ssize_t repsize;
6934 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 Py_UNICODE *uni2;
6936 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006937 Py_ssize_t collstartpos = *inpos;
6938 Py_ssize_t collendpos = *inpos+1;
6939 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940 char *encoding = "charmap";
6941 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006942 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 /* find all unencodable characters */
6945 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006946 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006947 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 int res = encoding_map_lookup(p[collendpos], mapping);
6949 if (res != -1)
6950 break;
6951 ++collendpos;
6952 continue;
6953 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006954
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 rep = charmapencode_lookup(p[collendpos], mapping);
6956 if (rep==NULL)
6957 return -1;
6958 else if (rep!=Py_None) {
6959 Py_DECREF(rep);
6960 break;
6961 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006962 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 }
6965 /* cache callback name lookup
6966 * (if not done yet, i.e. it's the first error) */
6967 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 if ((errors==NULL) || (!strcmp(errors, "strict")))
6969 *known_errorHandler = 1;
6970 else if (!strcmp(errors, "replace"))
6971 *known_errorHandler = 2;
6972 else if (!strcmp(errors, "ignore"))
6973 *known_errorHandler = 3;
6974 else if (!strcmp(errors, "xmlcharrefreplace"))
6975 *known_errorHandler = 4;
6976 else
6977 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 }
6979 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006980 case 1: /* strict */
6981 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6982 return -1;
6983 case 2: /* replace */
6984 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 x = charmapencode_output('?', mapping, res, respos);
6986 if (x==enc_EXCEPTION) {
6987 return -1;
6988 }
6989 else if (x==enc_FAILED) {
6990 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6991 return -1;
6992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006993 }
6994 /* fall through */
6995 case 3: /* ignore */
6996 *inpos = collendpos;
6997 break;
6998 case 4: /* xmlcharrefreplace */
6999 /* generate replacement (temporarily (mis)uses p) */
7000 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 char buffer[2+29+1+1];
7002 char *cp;
7003 sprintf(buffer, "&#%d;", (int)p[collpos]);
7004 for (cp = buffer; *cp; ++cp) {
7005 x = charmapencode_output(*cp, mapping, res, respos);
7006 if (x==enc_EXCEPTION)
7007 return -1;
7008 else if (x==enc_FAILED) {
7009 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7010 return -1;
7011 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007012 }
7013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 *inpos = collendpos;
7015 break;
7016 default:
7017 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 encoding, reason, p, size, exceptionObject,
7019 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007020 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007022 if (PyBytes_Check(repunicode)) {
7023 /* Directly copy bytes result to output. */
7024 Py_ssize_t outsize = PyBytes_Size(*res);
7025 Py_ssize_t requiredsize;
7026 repsize = PyBytes_Size(repunicode);
7027 requiredsize = *respos + repsize;
7028 if (requiredsize > outsize)
7029 /* Make room for all additional bytes. */
7030 if (charmapencode_resize(res, respos, requiredsize)) {
7031 Py_DECREF(repunicode);
7032 return -1;
7033 }
7034 memcpy(PyBytes_AsString(*res) + *respos,
7035 PyBytes_AsString(repunicode), repsize);
7036 *respos += repsize;
7037 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007038 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007039 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041 /* generate replacement */
7042 repsize = PyUnicode_GET_SIZE(repunicode);
7043 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 x = charmapencode_output(*uni2, mapping, res, respos);
7045 if (x==enc_EXCEPTION) {
7046 return -1;
7047 }
7048 else if (x==enc_FAILED) {
7049 Py_DECREF(repunicode);
7050 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7051 return -1;
7052 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007053 }
7054 *inpos = newpos;
7055 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 }
7057 return 0;
7058}
7059
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060PyObject *
7061PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7062 Py_ssize_t size,
7063 PyObject *mapping,
7064 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066 /* output object */
7067 PyObject *res = NULL;
7068 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007069 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007070 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007071 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 PyObject *errorHandler = NULL;
7073 PyObject *exc = NULL;
7074 /* the following variable is used for caching string comparisons
7075 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7076 * 3=ignore, 4=xmlcharrefreplace */
7077 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078
7079 /* Default to Latin-1 */
7080 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007083 /* allocate enough for a simple encoding without
7084 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007085 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 if (res == NULL)
7087 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007088 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 /* try to encode it */
7093 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7094 if (x==enc_EXCEPTION) /* error */
7095 goto onError;
7096 if (x==enc_FAILED) { /* unencodable character */
7097 if (charmap_encoding_error(p, size, &inpos, mapping,
7098 &exc,
7099 &known_errorHandler, &errorHandler, errors,
7100 &res, &respos)) {
7101 goto onError;
7102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 else
7105 /* done with this character => adjust input position */
7106 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007109 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007110 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007111 if (_PyBytes_Resize(&res, respos) < 0)
7112 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007114 Py_XDECREF(exc);
7115 Py_XDECREF(errorHandler);
7116 return res;
7117
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007119 Py_XDECREF(res);
7120 Py_XDECREF(exc);
7121 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return NULL;
7123}
7124
Alexander Belopolsky40018472011-02-26 01:02:56 +00007125PyObject *
7126PyUnicode_AsCharmapString(PyObject *unicode,
7127 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128{
7129 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 PyErr_BadArgument();
7131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 }
7133 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 PyUnicode_GET_SIZE(unicode),
7135 mapping,
7136 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137}
7138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007140static void
7141make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007143 Py_ssize_t startpos, Py_ssize_t endpos,
7144 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007147 *exceptionObject = _PyUnicodeTranslateError_Create(
7148 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 }
7150 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7152 goto onError;
7153 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7154 goto onError;
7155 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7156 goto onError;
7157 return;
7158 onError:
7159 Py_DECREF(*exceptionObject);
7160 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 }
7162}
7163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007164/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165static void
7166raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007167 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007168 Py_ssize_t startpos, Py_ssize_t endpos,
7169 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007170{
7171 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007173 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175}
7176
7177/* error handling callback helper:
7178 build arguments, call the callback and check the arguments,
7179 put the result into newpos and return the replacement string, which
7180 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007181static PyObject *
7182unicode_translate_call_errorhandler(const char *errors,
7183 PyObject **errorHandler,
7184 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007185 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007186 Py_ssize_t startpos, Py_ssize_t endpos,
7187 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007189 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007191 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 PyObject *restuple;
7193 PyObject *resunicode;
7194
7195 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007197 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 }
7200
7201 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007202 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007203 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205
7206 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007211 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 Py_DECREF(restuple);
7213 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214 }
7215 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 &resunicode, &i_newpos)) {
7217 Py_DECREF(restuple);
7218 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007219 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 else
7223 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7226 Py_DECREF(restuple);
7227 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007228 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 Py_INCREF(resunicode);
7230 Py_DECREF(restuple);
7231 return resunicode;
7232}
7233
7234/* Lookup the character ch in the mapping and put the result in result,
7235 which must be decrefed by the caller.
7236 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007237static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007238charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239{
Christian Heimes217cfd12007-12-02 14:31:20 +00007240 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241 PyObject *x;
7242
7243 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245 x = PyObject_GetItem(mapping, w);
7246 Py_DECREF(w);
7247 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7249 /* No mapping found means: use 1:1 mapping. */
7250 PyErr_Clear();
7251 *result = NULL;
7252 return 0;
7253 } else
7254 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255 }
7256 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 *result = x;
7258 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007260 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 long value = PyLong_AS_LONG(x);
7262 long max = PyUnicode_GetMax();
7263 if (value < 0 || value > max) {
7264 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007265 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 Py_DECREF(x);
7267 return -1;
7268 }
7269 *result = x;
7270 return 0;
7271 }
7272 else if (PyUnicode_Check(x)) {
7273 *result = x;
7274 return 0;
7275 }
7276 else {
7277 /* wrong return value */
7278 PyErr_SetString(PyExc_TypeError,
7279 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007280 Py_DECREF(x);
7281 return -1;
7282 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283}
7284/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 if not reallocate and adjust various state variables.
7286 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007287static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007288charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007292 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 /* exponentially overallocate to minimize reallocations */
7294 if (requiredsize < 2 * oldsize)
7295 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7297 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007299 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300 }
7301 return 0;
7302}
7303/* lookup the character, put the result in the output string and adjust
7304 various state variables. Return a new reference to the object that
7305 was put in the output buffer in *result, or Py_None, if the mapping was
7306 undefined (in which case no character was written).
7307 The called must decref result.
7308 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007309static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007310charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7311 PyObject *mapping, Py_UCS4 **output,
7312 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007313 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007315 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7316 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007321 }
7322 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007324 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007326 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007327 }
7328 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007329 Py_ssize_t repsize;
7330 if (PyUnicode_READY(*res) == -1)
7331 return -1;
7332 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 if (repsize==1) {
7334 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007335 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 }
7337 else if (repsize!=0) {
7338 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007339 Py_ssize_t requiredsize = *opos +
7340 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007342 Py_ssize_t i;
7343 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007345 for(i = 0; i < repsize; i++)
7346 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348 }
7349 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351 return 0;
7352}
7353
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007355_PyUnicode_TranslateCharmap(PyObject *input,
7356 PyObject *mapping,
7357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007359 /* input object */
7360 char *idata;
7361 Py_ssize_t size, i;
7362 int kind;
7363 /* output buffer */
7364 Py_UCS4 *output = NULL;
7365 Py_ssize_t osize;
7366 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 char *reason = "character maps to <undefined>";
7370 PyObject *errorHandler = NULL;
7371 PyObject *exc = NULL;
7372 /* the following variable is used for caching string comparisons
7373 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7374 * 3=ignore, 4=xmlcharrefreplace */
7375 int known_errorHandler = -1;
7376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 PyErr_BadArgument();
7379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007382 if (PyUnicode_READY(input) == -1)
7383 return NULL;
7384 idata = (char*)PyUnicode_DATA(input);
7385 kind = PyUnicode_KIND(input);
7386 size = PyUnicode_GET_LENGTH(input);
7387 i = 0;
7388
7389 if (size == 0) {
7390 Py_INCREF(input);
7391 return input;
7392 }
7393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 /* allocate enough for a simple 1:1 translation without
7395 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 osize = size;
7397 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7398 opos = 0;
7399 if (output == NULL) {
7400 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 /* try to encode it */
7406 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007407 if (charmaptranslate_output(input, i, mapping,
7408 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 Py_XDECREF(x);
7410 goto onError;
7411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007412 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007414 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 else { /* untranslatable character */
7416 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7417 Py_ssize_t repsize;
7418 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007419 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007421 Py_ssize_t collstart = i;
7422 Py_ssize_t collend = i+1;
7423 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007426 while (collend < size) {
7427 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 goto onError;
7429 Py_XDECREF(x);
7430 if (x!=Py_None)
7431 break;
7432 ++collend;
7433 }
7434 /* cache callback name lookup
7435 * (if not done yet, i.e. it's the first error) */
7436 if (known_errorHandler==-1) {
7437 if ((errors==NULL) || (!strcmp(errors, "strict")))
7438 known_errorHandler = 1;
7439 else if (!strcmp(errors, "replace"))
7440 known_errorHandler = 2;
7441 else if (!strcmp(errors, "ignore"))
7442 known_errorHandler = 3;
7443 else if (!strcmp(errors, "xmlcharrefreplace"))
7444 known_errorHandler = 4;
7445 else
7446 known_errorHandler = 0;
7447 }
7448 switch (known_errorHandler) {
7449 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007450 raise_translate_exception(&exc, input, collstart,
7451 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007452 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 case 2: /* replace */
7454 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007455 for (coll = collstart; coll<collend; coll++)
7456 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 /* fall through */
7458 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007459 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 break;
7461 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007462 /* generate replacement (temporarily (mis)uses i) */
7463 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 char buffer[2+29+1+1];
7465 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7467 if (charmaptranslate_makespace(&output, &osize,
7468 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 goto onError;
7470 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 break;
7475 default:
7476 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 reason, input, &exc,
7478 collstart, collend, &newpos);
7479 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 goto onError;
7481 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482 repsize = PyUnicode_GET_LENGTH(repunicode);
7483 if (charmaptranslate_makespace(&output, &osize,
7484 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 Py_DECREF(repunicode);
7486 goto onError;
7487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 for (uni2 = 0; repsize-->0; ++uni2)
7489 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7490 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007492 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007493 }
7494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7496 if (!res)
7497 goto onError;
7498 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499 Py_XDECREF(exc);
7500 Py_XDECREF(errorHandler);
7501 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007505 Py_XDECREF(exc);
7506 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 return NULL;
7508}
7509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007510/* Deprecated. Use PyUnicode_Translate instead. */
7511PyObject *
7512PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7513 Py_ssize_t size,
7514 PyObject *mapping,
7515 const char *errors)
7516{
7517 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7518 if (!unicode)
7519 return NULL;
7520 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7521}
7522
Alexander Belopolsky40018472011-02-26 01:02:56 +00007523PyObject *
7524PyUnicode_Translate(PyObject *str,
7525 PyObject *mapping,
7526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527{
7528 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007529
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 str = PyUnicode_FromObject(str);
7531 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007533 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 Py_DECREF(str);
7535 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007536
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 Py_XDECREF(str);
7539 return NULL;
7540}
Tim Petersced69f82003-09-16 20:30:58 +00007541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542static Py_UCS4
7543fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7544{
7545 /* No need to call PyUnicode_READY(self) because this function is only
7546 called as a callback from fixup() which does it already. */
7547 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7548 const int kind = PyUnicode_KIND(self);
7549 void *data = PyUnicode_DATA(self);
7550 Py_UCS4 maxchar = 0, ch, fixed;
7551 Py_ssize_t i;
7552
7553 for (i = 0; i < len; ++i) {
7554 ch = PyUnicode_READ(kind, data, i);
7555 fixed = 0;
7556 if (ch > 127) {
7557 if (Py_UNICODE_ISSPACE(ch))
7558 fixed = ' ';
7559 else {
7560 const int decimal = Py_UNICODE_TODECIMAL(ch);
7561 if (decimal >= 0)
7562 fixed = '0' + decimal;
7563 }
7564 if (fixed != 0) {
7565 if (fixed > maxchar)
7566 maxchar = fixed;
7567 PyUnicode_WRITE(kind, data, i, fixed);
7568 }
7569 else if (ch > maxchar)
7570 maxchar = ch;
7571 }
7572 else if (ch > maxchar)
7573 maxchar = ch;
7574 }
7575
7576 return maxchar;
7577}
7578
7579PyObject *
7580_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7581{
7582 if (!PyUnicode_Check(unicode)) {
7583 PyErr_BadInternalCall();
7584 return NULL;
7585 }
7586 if (PyUnicode_READY(unicode) == -1)
7587 return NULL;
7588 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7589 /* If the string is already ASCII, just return the same string */
7590 Py_INCREF(unicode);
7591 return unicode;
7592 }
7593 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7594}
7595
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007596PyObject *
7597PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7598 Py_ssize_t length)
7599{
7600 PyObject *result;
7601 Py_UNICODE *p; /* write pointer into result */
7602 Py_ssize_t i;
7603 /* Copy to a new string */
7604 result = (PyObject *)_PyUnicode_New(length);
7605 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7606 if (result == NULL)
7607 return result;
7608 p = PyUnicode_AS_UNICODE(result);
7609 /* Iterate over code points */
7610 for (i = 0; i < length; i++) {
7611 Py_UNICODE ch =s[i];
7612 if (ch > 127) {
7613 int decimal = Py_UNICODE_TODECIMAL(ch);
7614 if (decimal >= 0)
7615 p[i] = '0' + decimal;
7616 }
7617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007618 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7619 Py_DECREF(result);
7620 return NULL;
7621 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007622 return result;
7623}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007624/* --- Decimal Encoder ---------------------------------------------------- */
7625
Alexander Belopolsky40018472011-02-26 01:02:56 +00007626int
7627PyUnicode_EncodeDecimal(Py_UNICODE *s,
7628 Py_ssize_t length,
7629 char *output,
7630 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007631{
7632 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 PyObject *errorHandler = NULL;
7634 PyObject *exc = NULL;
7635 const char *encoding = "decimal";
7636 const char *reason = "invalid decimal Unicode string";
7637 /* the following variable is used for caching string comparisons
7638 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7639 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007640
7641 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 PyErr_BadArgument();
7643 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007644 }
7645
7646 p = s;
7647 end = s + length;
7648 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 register Py_UNICODE ch = *p;
7650 int decimal;
7651 PyObject *repunicode;
7652 Py_ssize_t repsize;
7653 Py_ssize_t newpos;
7654 Py_UNICODE *uni2;
7655 Py_UNICODE *collstart;
7656 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007657
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007659 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 ++p;
7661 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 decimal = Py_UNICODE_TODECIMAL(ch);
7664 if (decimal >= 0) {
7665 *output++ = '0' + decimal;
7666 ++p;
7667 continue;
7668 }
7669 if (0 < ch && ch < 256) {
7670 *output++ = (char)ch;
7671 ++p;
7672 continue;
7673 }
7674 /* All other characters are considered unencodable */
7675 collstart = p;
7676 collend = p+1;
7677 while (collend < end) {
7678 if ((0 < *collend && *collend < 256) ||
7679 !Py_UNICODE_ISSPACE(*collend) ||
7680 Py_UNICODE_TODECIMAL(*collend))
7681 break;
7682 }
7683 /* cache callback name lookup
7684 * (if not done yet, i.e. it's the first error) */
7685 if (known_errorHandler==-1) {
7686 if ((errors==NULL) || (!strcmp(errors, "strict")))
7687 known_errorHandler = 1;
7688 else if (!strcmp(errors, "replace"))
7689 known_errorHandler = 2;
7690 else if (!strcmp(errors, "ignore"))
7691 known_errorHandler = 3;
7692 else if (!strcmp(errors, "xmlcharrefreplace"))
7693 known_errorHandler = 4;
7694 else
7695 known_errorHandler = 0;
7696 }
7697 switch (known_errorHandler) {
7698 case 1: /* strict */
7699 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7700 goto onError;
7701 case 2: /* replace */
7702 for (p = collstart; p < collend; ++p)
7703 *output++ = '?';
7704 /* fall through */
7705 case 3: /* ignore */
7706 p = collend;
7707 break;
7708 case 4: /* xmlcharrefreplace */
7709 /* generate replacement (temporarily (mis)uses p) */
7710 for (p = collstart; p < collend; ++p)
7711 output += sprintf(output, "&#%d;", (int)*p);
7712 p = collend;
7713 break;
7714 default:
7715 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7716 encoding, reason, s, length, &exc,
7717 collstart-s, collend-s, &newpos);
7718 if (repunicode == NULL)
7719 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007720 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007721 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007722 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7723 Py_DECREF(repunicode);
7724 goto onError;
7725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 /* generate replacement */
7727 repsize = PyUnicode_GET_SIZE(repunicode);
7728 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7729 Py_UNICODE ch = *uni2;
7730 if (Py_UNICODE_ISSPACE(ch))
7731 *output++ = ' ';
7732 else {
7733 decimal = Py_UNICODE_TODECIMAL(ch);
7734 if (decimal >= 0)
7735 *output++ = '0' + decimal;
7736 else if (0 < ch && ch < 256)
7737 *output++ = (char)ch;
7738 else {
7739 Py_DECREF(repunicode);
7740 raise_encode_exception(&exc, encoding,
7741 s, length, collstart-s, collend-s, reason);
7742 goto onError;
7743 }
7744 }
7745 }
7746 p = s + newpos;
7747 Py_DECREF(repunicode);
7748 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007749 }
7750 /* 0-terminate the output string */
7751 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007752 Py_XDECREF(exc);
7753 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007754 return 0;
7755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 Py_XDECREF(exc);
7758 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007759 return -1;
7760}
7761
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762/* --- Helpers ------------------------------------------------------------ */
7763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764#include "stringlib/ucs1lib.h"
7765#include "stringlib/fastsearch.h"
7766#include "stringlib/partition.h"
7767#include "stringlib/split.h"
7768#include "stringlib/count.h"
7769#include "stringlib/find.h"
7770#include "stringlib/localeutil.h"
7771#include "stringlib/undef.h"
7772
7773#include "stringlib/ucs2lib.h"
7774#include "stringlib/fastsearch.h"
7775#include "stringlib/partition.h"
7776#include "stringlib/split.h"
7777#include "stringlib/count.h"
7778#include "stringlib/find.h"
7779#include "stringlib/localeutil.h"
7780#include "stringlib/undef.h"
7781
7782#include "stringlib/ucs4lib.h"
7783#include "stringlib/fastsearch.h"
7784#include "stringlib/partition.h"
7785#include "stringlib/split.h"
7786#include "stringlib/count.h"
7787#include "stringlib/find.h"
7788#include "stringlib/localeutil.h"
7789#include "stringlib/undef.h"
7790
7791static Py_ssize_t
7792any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7793 const Py_UCS1*, Py_ssize_t,
7794 Py_ssize_t, Py_ssize_t),
7795 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7796 const Py_UCS2*, Py_ssize_t,
7797 Py_ssize_t, Py_ssize_t),
7798 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7799 const Py_UCS4*, Py_ssize_t,
7800 Py_ssize_t, Py_ssize_t),
7801 PyObject* s1, PyObject* s2,
7802 Py_ssize_t start,
7803 Py_ssize_t end)
7804{
7805 int kind1, kind2, kind;
7806 void *buf1, *buf2;
7807 Py_ssize_t len1, len2, result;
7808
7809 kind1 = PyUnicode_KIND(s1);
7810 kind2 = PyUnicode_KIND(s2);
7811 kind = kind1 > kind2 ? kind1 : kind2;
7812 buf1 = PyUnicode_DATA(s1);
7813 buf2 = PyUnicode_DATA(s2);
7814 if (kind1 != kind)
7815 buf1 = _PyUnicode_AsKind(s1, kind);
7816 if (!buf1)
7817 return -2;
7818 if (kind2 != kind)
7819 buf2 = _PyUnicode_AsKind(s2, kind);
7820 if (!buf2) {
7821 if (kind1 != kind) PyMem_Free(buf1);
7822 return -2;
7823 }
7824 len1 = PyUnicode_GET_LENGTH(s1);
7825 len2 = PyUnicode_GET_LENGTH(s2);
7826
7827 switch(kind) {
7828 case PyUnicode_1BYTE_KIND:
7829 result = ucs1(buf1, len1, buf2, len2, start, end);
7830 break;
7831 case PyUnicode_2BYTE_KIND:
7832 result = ucs2(buf1, len1, buf2, len2, start, end);
7833 break;
7834 case PyUnicode_4BYTE_KIND:
7835 result = ucs4(buf1, len1, buf2, len2, start, end);
7836 break;
7837 default:
7838 assert(0); result = -2;
7839 }
7840
7841 if (kind1 != kind)
7842 PyMem_Free(buf1);
7843 if (kind2 != kind)
7844 PyMem_Free(buf2);
7845
7846 return result;
7847}
7848
7849Py_ssize_t
7850_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7851 Py_ssize_t n_buffer,
7852 void *digits, Py_ssize_t n_digits,
7853 Py_ssize_t min_width,
7854 const char *grouping,
7855 const char *thousands_sep)
7856{
7857 switch(kind) {
7858 case PyUnicode_1BYTE_KIND:
7859 return _PyUnicode_ucs1_InsertThousandsGrouping(
7860 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7861 min_width, grouping, thousands_sep);
7862 case PyUnicode_2BYTE_KIND:
7863 return _PyUnicode_ucs2_InsertThousandsGrouping(
7864 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7865 min_width, grouping, thousands_sep);
7866 case PyUnicode_4BYTE_KIND:
7867 return _PyUnicode_ucs4_InsertThousandsGrouping(
7868 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7869 min_width, grouping, thousands_sep);
7870 }
7871 assert(0);
7872 return -1;
7873}
7874
7875
Eric Smith8c663262007-08-25 02:26:07 +00007876#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007877#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007878
Thomas Wouters477c8d52006-05-27 19:21:47 +00007879#include "stringlib/count.h"
7880#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007881
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` may be any
   expression and is evaluated several times.  `end` is clamped to `len`;
   a negative `start` or `end` is taken relative to `len` and clamped to
   0.  `start` is not clamped against `len`.

   Every argument is parenthesized so that an expression argument (for
   instance a conditional expression passed as `len`) keeps its meaning.
   The expansion is a plain statement sequence, not a single statement:
   do not use it as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007896
Alexander Belopolsky40018472011-02-26 01:02:56 +00007897Py_ssize_t
7898PyUnicode_Count(PyObject *str,
7899 PyObject *substr,
7900 Py_ssize_t start,
7901 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007903 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007904 PyUnicodeObject* str_obj;
7905 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 int kind1, kind2, kind;
7907 void *buf1 = NULL, *buf2 = NULL;
7908 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007909
Thomas Wouters477c8d52006-05-27 19:21:47 +00007910 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007911 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007913 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 Py_DECREF(str_obj);
7916 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 }
Tim Petersced69f82003-09-16 20:30:58 +00007918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 kind1 = PyUnicode_KIND(str_obj);
7920 kind2 = PyUnicode_KIND(sub_obj);
7921 kind = kind1 > kind2 ? kind1 : kind2;
7922 buf1 = PyUnicode_DATA(str_obj);
7923 if (kind1 != kind)
7924 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7925 if (!buf1)
7926 goto onError;
7927 buf2 = PyUnicode_DATA(sub_obj);
7928 if (kind2 != kind)
7929 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7930 if (!buf2)
7931 goto onError;
7932 len1 = PyUnicode_GET_LENGTH(str_obj);
7933 len2 = PyUnicode_GET_LENGTH(sub_obj);
7934
7935 ADJUST_INDICES(start, end, len1);
7936 switch(kind) {
7937 case PyUnicode_1BYTE_KIND:
7938 result = ucs1lib_count(
7939 ((Py_UCS1*)buf1) + start, end - start,
7940 buf2, len2, PY_SSIZE_T_MAX
7941 );
7942 break;
7943 case PyUnicode_2BYTE_KIND:
7944 result = ucs2lib_count(
7945 ((Py_UCS2*)buf1) + start, end - start,
7946 buf2, len2, PY_SSIZE_T_MAX
7947 );
7948 break;
7949 case PyUnicode_4BYTE_KIND:
7950 result = ucs4lib_count(
7951 ((Py_UCS4*)buf1) + start, end - start,
7952 buf2, len2, PY_SSIZE_T_MAX
7953 );
7954 break;
7955 default:
7956 assert(0); result = 0;
7957 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007958
7959 Py_DECREF(sub_obj);
7960 Py_DECREF(str_obj);
7961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 if (kind1 != kind)
7963 PyMem_Free(buf1);
7964 if (kind2 != kind)
7965 PyMem_Free(buf2);
7966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 onError:
7969 Py_DECREF(sub_obj);
7970 Py_DECREF(str_obj);
7971 if (kind1 != kind && buf1)
7972 PyMem_Free(buf1);
7973 if (kind2 != kind && buf2)
7974 PyMem_Free(buf2);
7975 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976}
7977
Alexander Belopolsky40018472011-02-26 01:02:56 +00007978Py_ssize_t
7979PyUnicode_Find(PyObject *str,
7980 PyObject *sub,
7981 Py_ssize_t start,
7982 Py_ssize_t end,
7983 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007985 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007986
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007990 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007991 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 Py_DECREF(str);
7993 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 }
Tim Petersced69f82003-09-16 20:30:58 +00007995
Thomas Wouters477c8d52006-05-27 19:21:47 +00007996 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007997 result = any_find_slice(
7998 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
7999 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008000 );
8001 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008002 result = any_find_slice(
8003 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8004 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008005 );
8006
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008008 Py_DECREF(sub);
8009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 return result;
8011}
8012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013Py_ssize_t
8014PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8015 Py_ssize_t start, Py_ssize_t end,
8016 int direction)
8017{
8018 char *result;
8019 int kind;
8020 if (PyUnicode_READY(str) == -1)
8021 return -2;
8022 if (end > PyUnicode_GET_LENGTH(str))
8023 end = PyUnicode_GET_LENGTH(str);
8024 kind = PyUnicode_KIND(str);
8025 result = findchar(PyUnicode_1BYTE_DATA(str)
8026 + PyUnicode_KIND_SIZE(kind, start),
8027 kind,
8028 end-start, ch, direction);
8029 if (!result)
8030 return -1;
8031 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8032}
8033
Alexander Belopolsky40018472011-02-26 01:02:56 +00008034static int
8035tailmatch(PyUnicodeObject *self,
8036 PyUnicodeObject *substring,
8037 Py_ssize_t start,
8038 Py_ssize_t end,
8039 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 int kind_self;
8042 int kind_sub;
8043 void *data_self;
8044 void *data_sub;
8045 Py_ssize_t offset;
8046 Py_ssize_t i;
8047 Py_ssize_t end_sub;
8048
8049 if (PyUnicode_READY(self) == -1 ||
8050 PyUnicode_READY(substring) == -1)
8051 return 0;
8052
8053 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return 1;
8055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8057 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 kind_self = PyUnicode_KIND(self);
8062 data_self = PyUnicode_DATA(self);
8063 kind_sub = PyUnicode_KIND(substring);
8064 data_sub = PyUnicode_DATA(substring);
8065 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8066
8067 if (direction > 0)
8068 offset = end;
8069 else
8070 offset = start;
8071
8072 if (PyUnicode_READ(kind_self, data_self, offset) ==
8073 PyUnicode_READ(kind_sub, data_sub, 0) &&
8074 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8075 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8076 /* If both are of the same kind, memcmp is sufficient */
8077 if (kind_self == kind_sub) {
8078 return ! memcmp((char *)data_self +
8079 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8080 data_sub,
8081 PyUnicode_GET_LENGTH(substring) *
8082 PyUnicode_CHARACTER_SIZE(substring));
8083 }
8084 /* otherwise we have to compare each character by first accesing it */
8085 else {
8086 /* We do not need to compare 0 and len(substring)-1 because
8087 the if statement above ensured already that they are equal
8088 when we end up here. */
8089 // TODO: honor direction and do a forward or backwards search
8090 for (i = 1; i < end_sub; ++i) {
8091 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8092 PyUnicode_READ(kind_sub, data_sub, i))
8093 return 0;
8094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 }
8098
8099 return 0;
8100}
8101
Alexander Belopolsky40018472011-02-26 01:02:56 +00008102Py_ssize_t
8103PyUnicode_Tailmatch(PyObject *str,
8104 PyObject *substr,
8105 Py_ssize_t start,
8106 Py_ssize_t end,
8107 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008110
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 str = PyUnicode_FromObject(str);
8112 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 substr = PyUnicode_FromObject(substr);
8115 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 Py_DECREF(str);
8117 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118 }
Tim Petersced69f82003-09-16 20:30:58 +00008119
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 (PyUnicodeObject *)substr,
8122 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 Py_DECREF(str);
8124 Py_DECREF(substr);
8125 return result;
8126}
8127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128/* Apply fixfct filter to the Unicode object self and return a
8129 reference to the modified object */
8130
Alexander Belopolsky40018472011-02-26 01:02:56 +00008131static PyObject *
8132fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008133 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 PyObject *u;
8136 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 if (PyUnicode_READY(self) == -1)
8139 return NULL;
8140 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8141 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8142 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008146 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8147 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149 /* fix functions return the new maximum character in a string,
8150 if the kind of the resulting unicode object does not change,
8151 everything is fine. Otherwise we need to change the string kind
8152 and re-run the fix function. */
8153 maxchar_new = fixfct((PyUnicodeObject*)u);
8154 if (maxchar_new == 0)
8155 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8156 else if (maxchar_new <= 127)
8157 maxchar_new = 127;
8158 else if (maxchar_new <= 255)
8159 maxchar_new = 255;
8160 else if (maxchar_new <= 65535)
8161 maxchar_new = 65535;
8162 else
8163 maxchar_new = 1114111; /* 0x10ffff */
8164
8165 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 /* fixfct should return TRUE if it modified the buffer. If
8167 FALSE, return a reference to the original buffer instead
8168 (to save space, not time) */
8169 Py_INCREF(self);
8170 Py_DECREF(u);
8171 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 else if (maxchar_new == maxchar_old) {
8174 return u;
8175 }
8176 else {
8177 /* In case the maximum character changed, we need to
8178 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008179 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 if (v == NULL) {
8181 Py_DECREF(u);
8182 return NULL;
8183 }
8184 if (maxchar_new > maxchar_old) {
8185 /* If the maxchar increased so that the kind changed, not all
8186 characters are representable anymore and we need to fix the
8187 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008188 if (PyUnicode_CopyCharacters(v, 0,
8189 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008190 PyUnicode_GET_LENGTH(self)) < 0)
8191 {
8192 Py_DECREF(u);
8193 return NULL;
8194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 maxchar_old = fixfct((PyUnicodeObject*)v);
8196 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8197 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008198 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008199 if (PyUnicode_CopyCharacters(v, 0,
8200 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008201 PyUnicode_GET_LENGTH(self)) < 0)
8202 {
8203 Py_DECREF(u);
8204 return NULL;
8205 }
8206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207
8208 Py_DECREF(u);
8209 return v;
8210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211}
8212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 /* No need to call PyUnicode_READY(self) because this function is only
8217 called as a callback from fixup() which does it already. */
8218 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8219 const int kind = PyUnicode_KIND(self);
8220 void *data = PyUnicode_DATA(self);
8221 int touched = 0;
8222 Py_UCS4 maxchar = 0;
8223 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 for (i = 0; i < len; ++i) {
8226 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8227 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8228 if (up != ch) {
8229 if (up > maxchar)
8230 maxchar = up;
8231 PyUnicode_WRITE(kind, data, i, up);
8232 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 else if (ch > maxchar)
8235 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 }
8237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 if (touched)
8239 return maxchar;
8240 else
8241 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242}
8243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8248 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8249 const int kind = PyUnicode_KIND(self);
8250 void *data = PyUnicode_DATA(self);
8251 int touched = 0;
8252 Py_UCS4 maxchar = 0;
8253 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 for(i = 0; i < len; ++i) {
8256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8257 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8258 if (lo != ch) {
8259 if (lo > maxchar)
8260 maxchar = lo;
8261 PyUnicode_WRITE(kind, data, i, lo);
8262 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264 else if (ch > maxchar)
8265 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
8267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 if (touched)
8269 return maxchar;
8270 else
8271 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272}
8273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008275fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8278 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8279 const int kind = PyUnicode_KIND(self);
8280 void *data = PyUnicode_DATA(self);
8281 int touched = 0;
8282 Py_UCS4 maxchar = 0;
8283 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008285 for(i = 0; i < len; ++i) {
8286 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8287 Py_UCS4 nu = 0;
8288
8289 if (Py_UNICODE_ISUPPER(ch))
8290 nu = Py_UNICODE_TOLOWER(ch);
8291 else if (Py_UNICODE_ISLOWER(ch))
8292 nu = Py_UNICODE_TOUPPER(ch);
8293
8294 if (nu != 0) {
8295 if (nu > maxchar)
8296 maxchar = nu;
8297 PyUnicode_WRITE(kind, data, i, nu);
8298 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 else if (ch > maxchar)
8301 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 }
8303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 if (touched)
8305 return maxchar;
8306 else
8307 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308}
8309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008311fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8314 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8315 const int kind = PyUnicode_KIND(self);
8316 void *data = PyUnicode_DATA(self);
8317 int touched = 0;
8318 Py_UCS4 maxchar = 0;
8319 Py_ssize_t i = 0;
8320 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008321
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008322 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324
8325 ch = PyUnicode_READ(kind, data, i);
8326 if (!Py_UNICODE_ISUPPER(ch)) {
8327 maxchar = Py_UNICODE_TOUPPER(ch);
8328 PyUnicode_WRITE(kind, data, i, maxchar);
8329 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 ++i;
8332 for(; i < len; ++i) {
8333 ch = PyUnicode_READ(kind, data, i);
8334 if (!Py_UNICODE_ISLOWER(ch)) {
8335 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8336 if (lo > maxchar)
8337 maxchar = lo;
8338 PyUnicode_WRITE(kind, data, i, lo);
8339 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 else if (ch > maxchar)
8342 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344
8345 if (touched)
8346 return maxchar;
8347 else
8348 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349}
8350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8355 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8356 const int kind = PyUnicode_KIND(self);
8357 void *data = PyUnicode_DATA(self);
8358 Py_UCS4 maxchar = 0;
8359 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 int previous_is_cased;
8361
8362 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 if (len == 1) {
8364 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8365 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8366 if (ti != ch) {
8367 PyUnicode_WRITE(kind, data, i, ti);
8368 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 }
8370 else
8371 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 for(; i < len; ++i) {
8375 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8376 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008377
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 nu = Py_UNICODE_TOTITLE(ch);
8382
8383 if (nu > maxchar)
8384 maxchar = nu;
8385 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008386
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 if (Py_UNICODE_ISLOWER(ch) ||
8388 Py_UNICODE_ISUPPER(ch) ||
8389 Py_UNICODE_ISTITLE(ch))
8390 previous_is_cased = 1;
8391 else
8392 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395}
8396
Tim Peters8ce9f162004-08-27 01:49:32 +00008397PyObject *
8398PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008401 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008403 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008404 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8405 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008406 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 Py_ssize_t sz, i, res_offset;
8408 Py_UCS4 maxchar = 0;
8409 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410
Tim Peters05eba1f2004-08-27 21:32:02 +00008411 fseq = PySequence_Fast(seq, "");
8412 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008414 }
8415
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008416 /* NOTE: the following code can't call back into Python code,
8417 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008418 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008419
Tim Peters05eba1f2004-08-27 21:32:02 +00008420 seqlen = PySequence_Fast_GET_SIZE(fseq);
8421 /* If empty sequence, return u"". */
8422 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008425 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008426 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008427 /* If singleton sequence with an exact Unicode, return that. */
8428 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 item = items[0];
8430 if (PyUnicode_CheckExact(item)) {
8431 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 goto Done;
8434 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008435 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008436 else {
8437 /* Set up sep and seplen */
8438 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 /* fall back to a blank space separator */
8440 sep = PyUnicode_FromOrdinal(' ');
8441 if (!sep || PyUnicode_READY(sep) == -1)
8442 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008443 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008444 else {
8445 if (!PyUnicode_Check(separator)) {
8446 PyErr_Format(PyExc_TypeError,
8447 "separator: expected str instance,"
8448 " %.80s found",
8449 Py_TYPE(separator)->tp_name);
8450 goto onError;
8451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 if (PyUnicode_READY(separator) == -1)
8453 goto onError;
8454 sep = separator;
8455 seplen = PyUnicode_GET_LENGTH(separator);
8456 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8457 /* inc refcount to keep this code path symetric with the
8458 above case of a blank separator */
8459 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008460 }
8461 }
8462
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008463 /* There are at least two things to join, or else we have a subclass
8464 * of str in the sequence.
8465 * Do a pre-pass to figure out the total amount of space we'll
8466 * need (sz), and see whether all argument are strings.
8467 */
8468 sz = 0;
8469 for (i = 0; i < seqlen; i++) {
8470 const Py_ssize_t old_sz = sz;
8471 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 if (!PyUnicode_Check(item)) {
8473 PyErr_Format(PyExc_TypeError,
8474 "sequence item %zd: expected str instance,"
8475 " %.80s found",
8476 i, Py_TYPE(item)->tp_name);
8477 goto onError;
8478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 if (PyUnicode_READY(item) == -1)
8480 goto onError;
8481 sz += PyUnicode_GET_LENGTH(item);
8482 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8483 if (item_maxchar > maxchar)
8484 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008485 if (i != 0)
8486 sz += seplen;
8487 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8488 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008490 goto onError;
8491 }
8492 }
Tim Petersced69f82003-09-16 20:30:58 +00008493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008495 if (res == NULL)
8496 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008497
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008498 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008500 Py_ssize_t itemlen;
8501 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 /* Copy item, and maybe the separator. */
8504 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008505 if (PyUnicode_CopyCharacters(res, res_offset,
8506 sep, 0, seplen) < 0)
8507 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008510 if (PyUnicode_CopyCharacters(res, res_offset,
8511 item, 0, itemlen) < 0)
8512 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008516
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008518 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 Py_XDECREF(sep);
8520 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008523 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008525 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 return NULL;
8527}
8528
/* FILL(kind, data, value, start, length): store the code point `value` into
   `length` consecutive characters of the character buffer `data`, starting
   at character index `start`.  `kind` selects the element width of the
   buffer (1-, 2- or 4-byte); PyUnicode_WCHAR_KIND is not allowed.

   `value` and `length` are evaluated exactly once and every argument is
   parenthesized, so arbitrary expressions may be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 value_ = (value); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < length_; ++i_, ++to_) *to_ = (Py_UCS2)value_; \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < length_; ++i_, ++to_) *to_ = value_; \
            break; \
        } \
        } \
    } while (0)
8551
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552static PyUnicodeObject *
8553pad(PyUnicodeObject *self,
8554 Py_ssize_t left,
8555 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 PyObject *u;
8559 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008560 int kind;
8561 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
8563 if (left < 0)
8564 left = 0;
8565 if (right < 0)
8566 right = 0;
8567
Tim Peters7a29bd52001-09-12 03:03:31 +00008568 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 Py_INCREF(self);
8570 return self;
8571 }
8572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8574 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008575 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8576 return NULL;
8577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8579 if (fill > maxchar)
8580 maxchar = fill;
8581 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008582 if (!u)
8583 return NULL;
8584
8585 kind = PyUnicode_KIND(u);
8586 data = PyUnicode_DATA(u);
8587 if (left)
8588 FILL(kind, data, fill, 0, left);
8589 if (right)
8590 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008591 if (PyUnicode_CopyCharacters(u, left,
8592 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008593 _PyUnicode_LENGTH(self)) < 0)
8594 {
8595 Py_DECREF(u);
8596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 }
8598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602
Alexander Belopolsky40018472011-02-26 01:02:56 +00008603PyObject *
8604PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
8608 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 switch(PyUnicode_KIND(string)) {
8613 case PyUnicode_1BYTE_KIND:
8614 list = ucs1lib_splitlines(
8615 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8616 PyUnicode_GET_LENGTH(string), keepends);
8617 break;
8618 case PyUnicode_2BYTE_KIND:
8619 list = ucs2lib_splitlines(
8620 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8621 PyUnicode_GET_LENGTH(string), keepends);
8622 break;
8623 case PyUnicode_4BYTE_KIND:
8624 list = ucs4lib_splitlines(
8625 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8626 PyUnicode_GET_LENGTH(string), keepends);
8627 break;
8628 default:
8629 assert(0);
8630 list = 0;
8631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 Py_DECREF(string);
8633 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634}
8635
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636static PyObject *
8637split(PyUnicodeObject *self,
8638 PyUnicodeObject *substring,
8639 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 int kind1, kind2, kind;
8642 void *buf1, *buf2;
8643 Py_ssize_t len1, len2;
8644 PyObject* out;
8645
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008647 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 if (PyUnicode_READY(self) == -1)
8650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 if (substring == NULL)
8653 switch(PyUnicode_KIND(self)) {
8654 case PyUnicode_1BYTE_KIND:
8655 return ucs1lib_split_whitespace(
8656 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8657 PyUnicode_GET_LENGTH(self), maxcount
8658 );
8659 case PyUnicode_2BYTE_KIND:
8660 return ucs2lib_split_whitespace(
8661 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8662 PyUnicode_GET_LENGTH(self), maxcount
8663 );
8664 case PyUnicode_4BYTE_KIND:
8665 return ucs4lib_split_whitespace(
8666 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8667 PyUnicode_GET_LENGTH(self), maxcount
8668 );
8669 default:
8670 assert(0);
8671 return NULL;
8672 }
8673
8674 if (PyUnicode_READY(substring) == -1)
8675 return NULL;
8676
8677 kind1 = PyUnicode_KIND(self);
8678 kind2 = PyUnicode_KIND(substring);
8679 kind = kind1 > kind2 ? kind1 : kind2;
8680 buf1 = PyUnicode_DATA(self);
8681 buf2 = PyUnicode_DATA(substring);
8682 if (kind1 != kind)
8683 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8684 if (!buf1)
8685 return NULL;
8686 if (kind2 != kind)
8687 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8688 if (!buf2) {
8689 if (kind1 != kind) PyMem_Free(buf1);
8690 return NULL;
8691 }
8692 len1 = PyUnicode_GET_LENGTH(self);
8693 len2 = PyUnicode_GET_LENGTH(substring);
8694
8695 switch(kind) {
8696 case PyUnicode_1BYTE_KIND:
8697 out = ucs1lib_split(
8698 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8699 break;
8700 case PyUnicode_2BYTE_KIND:
8701 out = ucs2lib_split(
8702 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8703 break;
8704 case PyUnicode_4BYTE_KIND:
8705 out = ucs4lib_split(
8706 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8707 break;
8708 default:
8709 out = NULL;
8710 }
8711 if (kind1 != kind)
8712 PyMem_Free(buf1);
8713 if (kind2 != kind)
8714 PyMem_Free(buf2);
8715 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716}
8717
Alexander Belopolsky40018472011-02-26 01:02:56 +00008718static PyObject *
8719rsplit(PyUnicodeObject *self,
8720 PyUnicodeObject *substring,
8721 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 int kind1, kind2, kind;
8724 void *buf1, *buf2;
8725 Py_ssize_t len1, len2;
8726 PyObject* out;
8727
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008728 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008729 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 if (PyUnicode_READY(self) == -1)
8732 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 if (substring == NULL)
8735 switch(PyUnicode_KIND(self)) {
8736 case PyUnicode_1BYTE_KIND:
8737 return ucs1lib_rsplit_whitespace(
8738 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8739 PyUnicode_GET_LENGTH(self), maxcount
8740 );
8741 case PyUnicode_2BYTE_KIND:
8742 return ucs2lib_rsplit_whitespace(
8743 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8744 PyUnicode_GET_LENGTH(self), maxcount
8745 );
8746 case PyUnicode_4BYTE_KIND:
8747 return ucs4lib_rsplit_whitespace(
8748 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8749 PyUnicode_GET_LENGTH(self), maxcount
8750 );
8751 default:
8752 assert(0);
8753 return NULL;
8754 }
8755
8756 if (PyUnicode_READY(substring) == -1)
8757 return NULL;
8758
8759 kind1 = PyUnicode_KIND(self);
8760 kind2 = PyUnicode_KIND(substring);
8761 kind = kind1 > kind2 ? kind1 : kind2;
8762 buf1 = PyUnicode_DATA(self);
8763 buf2 = PyUnicode_DATA(substring);
8764 if (kind1 != kind)
8765 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8766 if (!buf1)
8767 return NULL;
8768 if (kind2 != kind)
8769 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8770 if (!buf2) {
8771 if (kind1 != kind) PyMem_Free(buf1);
8772 return NULL;
8773 }
8774 len1 = PyUnicode_GET_LENGTH(self);
8775 len2 = PyUnicode_GET_LENGTH(substring);
8776
8777 switch(kind) {
8778 case PyUnicode_1BYTE_KIND:
8779 out = ucs1lib_rsplit(
8780 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8781 break;
8782 case PyUnicode_2BYTE_KIND:
8783 out = ucs2lib_rsplit(
8784 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8785 break;
8786 case PyUnicode_4BYTE_KIND:
8787 out = ucs4lib_rsplit(
8788 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8789 break;
8790 default:
8791 out = NULL;
8792 }
8793 if (kind1 != kind)
8794 PyMem_Free(buf1);
8795 if (kind2 != kind)
8796 PyMem_Free(buf2);
8797 return out;
8798}
8799
8800static Py_ssize_t
8801anylib_find(int kind, void *buf1, Py_ssize_t len1,
8802 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8803{
8804 switch(kind) {
8805 case PyUnicode_1BYTE_KIND:
8806 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8807 case PyUnicode_2BYTE_KIND:
8808 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8809 case PyUnicode_4BYTE_KIND:
8810 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8811 }
8812 assert(0);
8813 return -1;
8814}
8815
8816static Py_ssize_t
8817anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8818 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8819{
8820 switch(kind) {
8821 case PyUnicode_1BYTE_KIND:
8822 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8823 case PyUnicode_2BYTE_KIND:
8824 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8825 case PyUnicode_4BYTE_KIND:
8826 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8827 }
8828 assert(0);
8829 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833replace(PyObject *self, PyObject *str1,
8834 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 PyObject *u;
8837 char *sbuf = PyUnicode_DATA(self);
8838 char *buf1 = PyUnicode_DATA(str1);
8839 char *buf2 = PyUnicode_DATA(str2);
8840 int srelease = 0, release1 = 0, release2 = 0;
8841 int skind = PyUnicode_KIND(self);
8842 int kind1 = PyUnicode_KIND(str1);
8843 int kind2 = PyUnicode_KIND(str2);
8844 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8845 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8846 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847
8848 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008851 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 if (skind < kind1)
8854 /* substring too wide to be present */
8855 goto nothing;
8856
8857 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008858 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008859 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008861 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008863 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 Py_UCS4 u1, u2, maxchar;
8865 int mayshrink, rkind;
8866 u1 = PyUnicode_READ_CHAR(str1, 0);
8867 if (!findchar(sbuf, PyUnicode_KIND(self),
8868 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008869 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 u2 = PyUnicode_READ_CHAR(str2, 0);
8871 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8872 /* Replacing u1 with u2 may cause a maxchar reduction in the
8873 result string. */
8874 mayshrink = maxchar > 127;
8875 if (u2 > maxchar) {
8876 maxchar = u2;
8877 mayshrink = 0;
8878 }
8879 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008880 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008882 if (PyUnicode_CopyCharacters(u, 0,
8883 (PyObject*)self, 0, slen) < 0)
8884 {
8885 Py_DECREF(u);
8886 return NULL;
8887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 rkind = PyUnicode_KIND(u);
8889 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8890 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008891 if (--maxcount < 0)
8892 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 if (mayshrink) {
8896 PyObject *tmp = u;
8897 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8898 PyUnicode_GET_LENGTH(tmp));
8899 Py_DECREF(tmp);
8900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 int rkind = skind;
8903 char *res;
8904 if (kind1 < rkind) {
8905 /* widen substring */
8906 buf1 = _PyUnicode_AsKind(str1, rkind);
8907 if (!buf1) goto error;
8908 release1 = 1;
8909 }
8910 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008911 if (i < 0)
8912 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 if (rkind > kind2) {
8914 /* widen replacement */
8915 buf2 = _PyUnicode_AsKind(str2, rkind);
8916 if (!buf2) goto error;
8917 release2 = 1;
8918 }
8919 else if (rkind < kind2) {
8920 /* widen self and buf1 */
8921 rkind = kind2;
8922 if (release1) PyMem_Free(buf1);
8923 sbuf = _PyUnicode_AsKind(self, rkind);
8924 if (!sbuf) goto error;
8925 srelease = 1;
8926 buf1 = _PyUnicode_AsKind(str1, rkind);
8927 if (!buf1) goto error;
8928 release1 = 1;
8929 }
8930 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8931 if (!res) {
8932 PyErr_NoMemory();
8933 goto error;
8934 }
8935 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008936 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8938 buf2,
8939 PyUnicode_KIND_SIZE(rkind, len2));
8940 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008941
8942 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8944 slen-i,
8945 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008946 if (i == -1)
8947 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8949 buf2,
8950 PyUnicode_KIND_SIZE(rkind, len2));
8951 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953
8954 u = PyUnicode_FromKindAndData(rkind, res, slen);
8955 PyMem_Free(res);
8956 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 Py_ssize_t n, i, j, ires;
8961 Py_ssize_t product, new_size;
8962 int rkind = skind;
8963 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (kind1 < rkind) {
8966 buf1 = _PyUnicode_AsKind(str1, rkind);
8967 if (!buf1) goto error;
8968 release1 = 1;
8969 }
8970 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008971 if (n == 0)
8972 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 if (kind2 < rkind) {
8974 buf2 = _PyUnicode_AsKind(str2, rkind);
8975 if (!buf2) goto error;
8976 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 else if (kind2 > rkind) {
8979 rkind = kind2;
8980 sbuf = _PyUnicode_AsKind(self, rkind);
8981 if (!sbuf) goto error;
8982 srelease = 1;
8983 if (release1) PyMem_Free(buf1);
8984 buf1 = _PyUnicode_AsKind(str1, rkind);
8985 if (!buf1) goto error;
8986 release1 = 1;
8987 }
8988 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8989 PyUnicode_GET_LENGTH(str1))); */
8990 product = n * (len2-len1);
8991 if ((product / (len2-len1)) != n) {
8992 PyErr_SetString(PyExc_OverflowError,
8993 "replace string is too long");
8994 goto error;
8995 }
8996 new_size = slen + product;
8997 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
8998 PyErr_SetString(PyExc_OverflowError,
8999 "replace string is too long");
9000 goto error;
9001 }
9002 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9003 if (!res)
9004 goto error;
9005 ires = i = 0;
9006 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009007 while (n-- > 0) {
9008 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 j = anylib_find(rkind,
9010 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9011 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009012 if (j == -1)
9013 break;
9014 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009015 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9017 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9018 PyUnicode_KIND_SIZE(rkind, j-i));
9019 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009020 }
9021 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 if (len2 > 0) {
9023 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9024 buf2,
9025 PyUnicode_KIND_SIZE(rkind, len2));
9026 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009031 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9033 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9034 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009035 } else {
9036 /* interleave */
9037 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9039 buf2,
9040 PyUnicode_KIND_SIZE(rkind, len2));
9041 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042 if (--n <= 0)
9043 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9045 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9046 PyUnicode_KIND_SIZE(rkind, 1));
9047 ires++;
9048 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9051 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9052 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 if (srelease)
9057 PyMem_FREE(sbuf);
9058 if (release1)
9059 PyMem_FREE(buf1);
9060 if (release2)
9061 PyMem_FREE(buf2);
9062 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009063
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 if (srelease)
9067 PyMem_FREE(sbuf);
9068 if (release1)
9069 PyMem_FREE(buf1);
9070 if (release2)
9071 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009072 if (PyUnicode_CheckExact(self)) {
9073 Py_INCREF(self);
9074 return (PyObject *) self;
9075 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009076 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 error:
9078 if (srelease && sbuf)
9079 PyMem_FREE(sbuf);
9080 if (release1 && buf1)
9081 PyMem_FREE(buf1);
9082 if (release2 && buf2)
9083 PyMem_FREE(buf2);
9084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085}
9086
9087/* --- Unicode Object Methods --------------------------------------------- */
9088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009089PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091\n\
9092Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009093characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094
9095static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009096unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return fixup(self, fixtitle);
9099}
9100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009101PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103\n\
9104Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009105have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
9107static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009108unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 return fixup(self, fixcapitalize);
9111}
9112
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace every list entry by its fixup(..., fixcapitalize)
   result, then join the list again.  Returns NULL if splitting or any
   per-word fixup fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    /* joined is still NULL here when a per-word fixup failed */
    Py_DECREF(words);
    return joined;
}
#endif
9150
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009151/* Argument converter. Coerces to a single unicode character */
9152
9153static int
9154convert_uc(PyObject *obj, void *addr)
9155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009157 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009158
Benjamin Peterson14339b62009-01-31 16:36:08 +00009159 uniobj = PyUnicode_FromObject(obj);
9160 if (uniobj == NULL) {
9161 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009163 return 0;
9164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009166 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009168 Py_DECREF(uniobj);
9169 return 0;
9170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 if (PyUnicode_READY(uniobj)) {
9172 Py_DECREF(uniobj);
9173 return 0;
9174 }
9175 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009176 Py_DECREF(uniobj);
9177 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009178}
9179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009180PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009183Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009184done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185
9186static PyObject *
9187unicode_center(PyUnicodeObject *self, PyObject *args)
9188{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009189 Py_ssize_t marg, left;
9190 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 Py_UCS4 fillchar = ' ';
9192
9193 if (PyUnicode_READY(self) == -1)
9194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195
Thomas Woutersde017742006-02-16 19:34:37 +00009196 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 return NULL;
9198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200 Py_INCREF(self);
9201 return (PyObject*) self;
9202 }
9203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 left = marg / 2 + (marg & width & 1);
9206
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009207 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208}
9209
Marc-André Lemburge5034372000-08-08 08:04:29 +00009210#if 0
9211
9212/* This code should go into some future Unicode collation support
9213 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009214 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009215
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009216/* speedy UTF-16 code point order comparison */
9217/* gleaned from: */
9218/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9219
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009220static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009221{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009222 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009223 0, 0, 0, 0, 0, 0, 0, 0,
9224 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009225 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009226};
9227
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228static int
9229unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009231 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009232
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 Py_UNICODE *s1 = str1->str;
9234 Py_UNICODE *s2 = str2->str;
9235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 len1 = str1->_base._base.length;
9237 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009238
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009240 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009241
9242 c1 = *s1++;
9243 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009244
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 if (c1 > (1<<11) * 26)
9246 c1 += utf16Fixup[c1>>11];
9247 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009248 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009249 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009250
9251 if (c1 != c2)
9252 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009253
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009254 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 }
9256
9257 return (len1 < len2) ? -1 : (len1 != len2);
9258}
9259
Marc-André Lemburge5034372000-08-08 08:04:29 +00009260#else
9261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262/* This function assumes that str1 and str2 are readied by the caller. */
9263
Marc-André Lemburge5034372000-08-08 08:04:29 +00009264static int
9265unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 int kind1, kind2;
9268 void *data1, *data2;
9269 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 kind1 = PyUnicode_KIND(str1);
9272 kind2 = PyUnicode_KIND(str2);
9273 data1 = PyUnicode_DATA(str1);
9274 data2 = PyUnicode_DATA(str2);
9275 len1 = PyUnicode_GET_LENGTH(str1);
9276 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 for (i = 0; i < len1 && i < len2; ++i) {
9279 Py_UCS4 c1, c2;
9280 c1 = PyUnicode_READ(kind1, data1, i);
9281 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009282
9283 if (c1 != c2)
9284 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009285 }
9286
9287 return (len1 < len2) ? -1 : (len1 != len2);
9288}
9289
9290#endif
9291
Alexander Belopolsky40018472011-02-26 01:02:56 +00009292int
9293PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9296 if (PyUnicode_READY(left) == -1 ||
9297 PyUnicode_READY(right) == -1)
9298 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009299 return unicode_compare((PyUnicodeObject *)left,
9300 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009302 PyErr_Format(PyExc_TypeError,
9303 "Can't compare %.100s and %.100s",
9304 left->ob_type->tp_name,
9305 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 return -1;
9307}
9308
Martin v. Löwis5b222132007-06-10 09:51:05 +00009309int
9310PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 Py_ssize_t i;
9313 int kind;
9314 void *data;
9315 Py_UCS4 chr;
9316
Martin v. Löwis5b222132007-06-10 09:51:05 +00009317 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (PyUnicode_READY(uni) == -1)
9319 return -1;
9320 kind = PyUnicode_KIND(uni);
9321 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009322 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9324 if (chr != str[i])
9325 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009326 /* This check keeps Python strings that end in '\0' from comparing equal
9327 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009330 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009332 return 0;
9333}
9334
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009335
Benjamin Peterson29060642009-01-31 22:14:21 +00009336#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009337 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009338
Alexander Belopolsky40018472011-02-26 01:02:56 +00009339PyObject *
9340PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009341{
9342 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009343
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009344 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9345 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 if (PyUnicode_READY(left) == -1 ||
9347 PyUnicode_READY(right) == -1)
9348 return NULL;
9349 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9350 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009351 if (op == Py_EQ) {
9352 Py_INCREF(Py_False);
9353 return Py_False;
9354 }
9355 if (op == Py_NE) {
9356 Py_INCREF(Py_True);
9357 return Py_True;
9358 }
9359 }
9360 if (left == right)
9361 result = 0;
9362 else
9363 result = unicode_compare((PyUnicodeObject *)left,
9364 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009365
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009366 /* Convert the return value to a Boolean */
9367 switch (op) {
9368 case Py_EQ:
9369 v = TEST_COND(result == 0);
9370 break;
9371 case Py_NE:
9372 v = TEST_COND(result != 0);
9373 break;
9374 case Py_LE:
9375 v = TEST_COND(result <= 0);
9376 break;
9377 case Py_GE:
9378 v = TEST_COND(result >= 0);
9379 break;
9380 case Py_LT:
9381 v = TEST_COND(result == -1);
9382 break;
9383 case Py_GT:
9384 v = TEST_COND(result == 1);
9385 break;
9386 default:
9387 PyErr_BadArgument();
9388 return NULL;
9389 }
9390 Py_INCREF(v);
9391 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009393
Brian Curtindfc80e32011-08-10 20:28:54 -05009394 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009395}
9396
Alexander Belopolsky40018472011-02-26 01:02:56 +00009397int
9398PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009399{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 int kind1, kind2, kind;
9402 void *buf1, *buf2;
9403 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009404 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009405
9406 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 sub = PyUnicode_FromObject(element);
9408 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 PyErr_Format(PyExc_TypeError,
9410 "'in <string>' requires string as left operand, not %s",
9411 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009412 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 if (PyUnicode_READY(sub) == -1)
9415 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009416
Thomas Wouters477c8d52006-05-27 19:21:47 +00009417 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009419 Py_DECREF(sub);
9420 return -1;
9421 }
9422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 kind1 = PyUnicode_KIND(str);
9424 kind2 = PyUnicode_KIND(sub);
9425 kind = kind1 > kind2 ? kind1 : kind2;
9426 buf1 = PyUnicode_DATA(str);
9427 buf2 = PyUnicode_DATA(sub);
9428 if (kind1 != kind)
9429 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9430 if (!buf1) {
9431 Py_DECREF(sub);
9432 return -1;
9433 }
9434 if (kind2 != kind)
9435 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9436 if (!buf2) {
9437 Py_DECREF(sub);
9438 if (kind1 != kind) PyMem_Free(buf1);
9439 return -1;
9440 }
9441 len1 = PyUnicode_GET_LENGTH(str);
9442 len2 = PyUnicode_GET_LENGTH(sub);
9443
9444 switch(kind) {
9445 case PyUnicode_1BYTE_KIND:
9446 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9450 break;
9451 case PyUnicode_4BYTE_KIND:
9452 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9453 break;
9454 default:
9455 result = -1;
9456 assert(0);
9457 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009458
9459 Py_DECREF(str);
9460 Py_DECREF(sub);
9461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (kind1 != kind)
9463 PyMem_Free(buf1);
9464 if (kind2 != kind)
9465 PyMem_Free(buf2);
9466
Guido van Rossum403d68b2000-03-13 15:55:09 +00009467 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009468}
9469
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470/* Concat to string or Unicode object giving a new Unicode object. */
9471
Alexander Belopolsky40018472011-02-26 01:02:56 +00009472PyObject *
9473PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 PyObject *u = NULL, *v = NULL, *w;
9476 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477
9478 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485
9486 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 }
9495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9497 goto onError;
9498
9499 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009500 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 w = PyUnicode_New(
9504 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9505 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009508 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9509 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009510 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009511 v, 0,
9512 PyUnicode_GET_LENGTH(v)) < 0)
9513 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 Py_DECREF(u);
9515 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 Py_XDECREF(u);
9520 Py_XDECREF(v);
9521 return NULL;
9522}
9523
Walter Dörwald1ab83302007-05-18 17:15:44 +00009524void
9525PyUnicode_Append(PyObject **pleft, PyObject *right)
9526{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009527 PyObject *new;
9528 if (*pleft == NULL)
9529 return;
9530 if (right == NULL || !PyUnicode_Check(*pleft)) {
9531 Py_DECREF(*pleft);
9532 *pleft = NULL;
9533 return;
9534 }
9535 new = PyUnicode_Concat(*pleft, right);
9536 Py_DECREF(*pleft);
9537 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009538}
9539
9540void
9541PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9542{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009543 PyUnicode_Append(pleft, right);
9544 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009545}
9546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009547PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009550Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009551string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009552interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553
9554static PyObject *
9555unicode_count(PyUnicodeObject *self, PyObject *args)
9556{
9557 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009558 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009559 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 int kind1, kind2, kind;
9562 void *buf1, *buf2;
9563 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564
Jesus Ceaac451502011-04-20 17:09:23 +02009565 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9566 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 kind1 = PyUnicode_KIND(self);
9570 kind2 = PyUnicode_KIND(substring);
9571 kind = kind1 > kind2 ? kind1 : kind2;
9572 buf1 = PyUnicode_DATA(self);
9573 buf2 = PyUnicode_DATA(substring);
9574 if (kind1 != kind)
9575 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9576 if (!buf1) {
9577 Py_DECREF(substring);
9578 return NULL;
9579 }
9580 if (kind2 != kind)
9581 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9582 if (!buf2) {
9583 Py_DECREF(substring);
9584 if (kind1 != kind) PyMem_Free(buf1);
9585 return NULL;
9586 }
9587 len1 = PyUnicode_GET_LENGTH(self);
9588 len2 = PyUnicode_GET_LENGTH(substring);
9589
9590 ADJUST_INDICES(start, end, len1);
9591 switch(kind) {
9592 case PyUnicode_1BYTE_KIND:
9593 iresult = ucs1lib_count(
9594 ((Py_UCS1*)buf1) + start, end - start,
9595 buf2, len2, PY_SSIZE_T_MAX
9596 );
9597 break;
9598 case PyUnicode_2BYTE_KIND:
9599 iresult = ucs2lib_count(
9600 ((Py_UCS2*)buf1) + start, end - start,
9601 buf2, len2, PY_SSIZE_T_MAX
9602 );
9603 break;
9604 case PyUnicode_4BYTE_KIND:
9605 iresult = ucs4lib_count(
9606 ((Py_UCS4*)buf1) + start, end - start,
9607 buf2, len2, PY_SSIZE_T_MAX
9608 );
9609 break;
9610 default:
9611 assert(0); iresult = 0;
9612 }
9613
9614 result = PyLong_FromSsize_t(iresult);
9615
9616 if (kind1 != kind)
9617 PyMem_Free(buf1);
9618 if (kind2 != kind)
9619 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620
9621 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009622
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 return result;
9624}
9625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009626PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009627 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009629Encode S using the codec registered for encoding. Default encoding\n\
9630is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009631handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009632a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9633'xmlcharrefreplace' as well as any other name registered with\n\
9634codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
9636static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009637unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009639 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 char *encoding = NULL;
9641 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009642
Benjamin Peterson308d6372009-09-18 21:42:35 +00009643 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9644 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009646 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009647}
9648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009649PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651\n\
9652Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009653If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654
9655static PyObject*
9656unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9657{
9658 Py_UNICODE *e;
9659 Py_UNICODE *p;
9660 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009661 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 PyUnicodeObject *u;
9664 int tabsize = 8;
9665
9666 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9670 return NULL;
9671
Thomas Wouters7e474022000-07-16 12:04:32 +00009672 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009673 i = 0; /* chars up to and including most recent \n or \r */
9674 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9676 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 if (tabsize > 0) {
9679 incr = tabsize - (j % tabsize); /* cannot overflow */
9680 if (j > PY_SSIZE_T_MAX - incr)
9681 goto overflow1;
9682 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009683 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 if (j > PY_SSIZE_T_MAX - 1)
9687 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 j++;
9689 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 if (i > PY_SSIZE_T_MAX - j)
9691 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009693 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 }
9695 }
9696
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009697 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009699
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700 /* Second pass: create output string and fill it */
9701 u = _PyUnicode_New(i + j);
9702 if (!u)
9703 return NULL;
9704
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009705 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 q = _PyUnicode_WSTR(u); /* next output char */
9707 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 if (tabsize > 0) {
9712 i = tabsize - (j % tabsize);
9713 j += i;
9714 while (i--) {
9715 if (q >= qe)
9716 goto overflow2;
9717 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009718 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009720 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009721 else {
9722 if (q >= qe)
9723 goto overflow2;
9724 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009725 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 if (*p == '\n' || *p == '\r')
9727 j = 0;
9728 }
9729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 if (PyUnicode_READY(u) == -1) {
9731 Py_DECREF(u);
9732 return NULL;
9733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009735
9736 overflow2:
9737 Py_DECREF(u);
9738 overflow1:
9739 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009743PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745\n\
9746Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009747such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748arguments start and end are interpreted as in slice notation.\n\
9749\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009750Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751
9752static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Jesus Ceaac451502011-04-20 17:09:23 +02009755 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009756 Py_ssize_t start;
9757 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009758 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759
Jesus Ceaac451502011-04-20 17:09:23 +02009760 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9761 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 if (PyUnicode_READY(self) == -1)
9765 return NULL;
9766 if (PyUnicode_READY(substring) == -1)
9767 return NULL;
9768
9769 result = any_find_slice(
9770 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9771 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009772 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
9774 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 if (result == -2)
9777 return NULL;
9778
Christian Heimes217cfd12007-12-02 14:31:20 +00009779 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780}
9781
9782static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009783unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 Py_UCS4 ch;
9786
9787 if (PyUnicode_READY(self) == -1)
9788 return NULL;
9789 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 PyErr_SetString(PyExc_IndexError, "string index out of range");
9791 return NULL;
9792 }
9793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9795 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796}
9797
Guido van Rossumc2504932007-09-18 19:42:40 +00009798/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009799 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009800static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009801unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802{
Guido van Rossumc2504932007-09-18 19:42:40 +00009803 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009804 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 if (_PyUnicode_HASH(self) != -1)
9807 return _PyUnicode_HASH(self);
9808 if (PyUnicode_READY(self) == -1)
9809 return -1;
9810 len = PyUnicode_GET_LENGTH(self);
9811
9812 /* The hash function as a macro, gets expanded three times below. */
9813#define HASH(P) \
9814 x = (Py_uhash_t)*P << 7; \
9815 while (--len >= 0) \
9816 x = (1000003*x) ^ (Py_uhash_t)*P++;
9817
9818 switch (PyUnicode_KIND(self)) {
9819 case PyUnicode_1BYTE_KIND: {
9820 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9821 HASH(c);
9822 break;
9823 }
9824 case PyUnicode_2BYTE_KIND: {
9825 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9826 HASH(s);
9827 break;
9828 }
9829 default: {
9830 Py_UCS4 *l;
9831 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9832 "Impossible switch case in unicode_hash");
9833 l = PyUnicode_4BYTE_DATA(self);
9834 HASH(l);
9835 break;
9836 }
9837 }
9838 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9839
Guido van Rossumc2504932007-09-18 19:42:40 +00009840 if (x == -1)
9841 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009843 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009847PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009850Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851
9852static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009855 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009856 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009857 Py_ssize_t start;
9858 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859
Jesus Ceaac451502011-04-20 17:09:23 +02009860 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9861 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (PyUnicode_READY(self) == -1)
9865 return NULL;
9866 if (PyUnicode_READY(substring) == -1)
9867 return NULL;
9868
9869 result = any_find_slice(
9870 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9871 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009872 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873
9874 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 if (result == -2)
9877 return NULL;
9878
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 if (result < 0) {
9880 PyErr_SetString(PyExc_ValueError, "substring not found");
9881 return NULL;
9882 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009883
Christian Heimes217cfd12007-12-02 14:31:20 +00009884 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885}
9886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009887PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009890Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009891at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892
9893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009894unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 Py_ssize_t i, length;
9897 int kind;
9898 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 int cased;
9900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (PyUnicode_READY(self) == -1)
9902 return NULL;
9903 length = PyUnicode_GET_LENGTH(self);
9904 kind = PyUnicode_KIND(self);
9905 data = PyUnicode_DATA(self);
9906
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (length == 1)
9909 return PyBool_FromLong(
9910 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009912 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009915
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 for (i = 0; i < length; i++) {
9918 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009919
Benjamin Peterson29060642009-01-31 22:14:21 +00009920 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9921 return PyBool_FromLong(0);
9922 else if (!cased && Py_UNICODE_ISLOWER(ch))
9923 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009925 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926}
9927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009928PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009931Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009932at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
9934static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009935unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 Py_ssize_t i, length;
9938 int kind;
9939 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 int cased;
9941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (PyUnicode_READY(self) == -1)
9943 return NULL;
9944 length = PyUnicode_GET_LENGTH(self);
9945 kind = PyUnicode_KIND(self);
9946 data = PyUnicode_DATA(self);
9947
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (length == 1)
9950 return PyBool_FromLong(
9951 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009953 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009956
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 for (i = 0; i < length; i++) {
9959 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009960
Benjamin Peterson29060642009-01-31 22:14:21 +00009961 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9962 return PyBool_FromLong(0);
9963 else if (!cased && Py_UNICODE_ISUPPER(ch))
9964 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009966 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967}
9968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009969PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009970 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009972Return True if S is a titlecased string and there is at least one\n\
9973character in S, i.e. upper- and titlecase characters may only\n\
9974follow uncased characters and lowercase characters only cased ones.\n\
9975Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976
9977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009978unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 Py_ssize_t i, length;
9981 int kind;
9982 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983 int cased, previous_is_cased;
9984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 if (PyUnicode_READY(self) == -1)
9986 return NULL;
9987 length = PyUnicode_GET_LENGTH(self);
9988 kind = PyUnicode_KIND(self);
9989 data = PyUnicode_DATA(self);
9990
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (length == 1) {
9993 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9994 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9995 (Py_UNICODE_ISUPPER(ch) != 0));
9996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009998 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010001
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002 cased = 0;
10003 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 for (i = 0; i < length; i++) {
10005 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010006
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10008 if (previous_is_cased)
10009 return PyBool_FromLong(0);
10010 previous_is_cased = 1;
10011 cased = 1;
10012 }
10013 else if (Py_UNICODE_ISLOWER(ch)) {
10014 if (!previous_is_cased)
10015 return PyBool_FromLong(0);
10016 previous_is_cased = 1;
10017 cased = 1;
10018 }
10019 else
10020 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010022 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023}
10024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010025PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010028Return True if all characters in S are whitespace\n\
10029and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
10031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010032unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 Py_ssize_t i, length;
10035 int kind;
10036 void *data;
10037
10038 if (PyUnicode_READY(self) == -1)
10039 return NULL;
10040 length = PyUnicode_GET_LENGTH(self);
10041 kind = PyUnicode_KIND(self);
10042 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 if (length == 1)
10046 return PyBool_FromLong(
10047 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010049 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 for (i = 0; i < length; i++) {
10054 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010055 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010058 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059}
10060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010061PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010062 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010063\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010064Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010065and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010066
10067static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010068unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 Py_ssize_t i, length;
10071 int kind;
10072 void *data;
10073
10074 if (PyUnicode_READY(self) == -1)
10075 return NULL;
10076 length = PyUnicode_GET_LENGTH(self);
10077 kind = PyUnicode_KIND(self);
10078 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010079
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010080 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (length == 1)
10082 return PyBool_FromLong(
10083 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010084
10085 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 for (i = 0; i < length; i++) {
10090 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010091 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010093 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094}
10095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010096PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010098\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010099Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010100and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010101
10102static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010103unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 int kind;
10106 void *data;
10107 Py_ssize_t len, i;
10108
10109 if (PyUnicode_READY(self) == -1)
10110 return NULL;
10111
10112 kind = PyUnicode_KIND(self);
10113 data = PyUnicode_DATA(self);
10114 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010115
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010116 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (len == 1) {
10118 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10119 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10120 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010121
10122 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010124 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 for (i = 0; i < len; i++) {
10127 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010128 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010129 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010130 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010131 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010132}
10133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010134PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010137Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010138False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
10140static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010141unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 Py_ssize_t i, length;
10144 int kind;
10145 void *data;
10146
10147 if (PyUnicode_READY(self) == -1)
10148 return NULL;
10149 length = PyUnicode_GET_LENGTH(self);
10150 kind = PyUnicode_KIND(self);
10151 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (length == 1)
10155 return PyBool_FromLong(
10156 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010158 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 for (i = 0; i < length; i++) {
10163 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010164 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010166 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167}
10168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010169PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010172Return True if all characters in S are digits\n\
10173and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
10175static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010176unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 Py_ssize_t i, length;
10179 int kind;
10180 void *data;
10181
10182 if (PyUnicode_READY(self) == -1)
10183 return NULL;
10184 length = PyUnicode_GET_LENGTH(self);
10185 kind = PyUnicode_KIND(self);
10186 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (length == 1) {
10190 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10191 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010194 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 for (i = 0; i < length; i++) {
10199 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010200 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010202 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203}
10204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010205PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010206 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010208Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010209False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
10211static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010212unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 Py_ssize_t i, length;
10215 int kind;
10216 void *data;
10217
10218 if (PyUnicode_READY(self) == -1)
10219 return NULL;
10220 length = PyUnicode_GET_LENGTH(self);
10221 kind = PyUnicode_KIND(self);
10222 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (length == 1)
10226 return PyBool_FromLong(
10227 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010229 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 for (i = 0; i < length; i++) {
10234 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010237 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238}
10239
Martin v. Löwis47383402007-08-15 07:32:56 +000010240int
10241PyUnicode_IsIdentifier(PyObject *self)
10242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 int kind;
10244 void *data;
10245 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010246 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (PyUnicode_READY(self) == -1) {
10249 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 }
10252
10253 /* Special case for empty strings */
10254 if (PyUnicode_GET_LENGTH(self) == 0)
10255 return 0;
10256 kind = PyUnicode_KIND(self);
10257 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010258
10259 /* PEP 3131 says that the first character must be in
10260 XID_Start and subsequent characters in XID_Continue,
10261 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010262 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010263 letters, digits, underscore). However, given the current
10264 definition of XID_Start and XID_Continue, it is sufficient
10265 to check just for these, except that _ must be allowed
10266 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010268 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010269 return 0;
10270
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010271 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010273 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010274 return 1;
10275}
10276
10277PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010279\n\
10280Return True if S is a valid identifier according\n\
10281to the language definition.");
10282
10283static PyObject*
10284unicode_isidentifier(PyObject *self)
10285{
10286 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10287}
10288
Georg Brandl559e5d72008-06-11 18:37:52 +000010289PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010291\n\
10292Return True if all characters in S are considered\n\
10293printable in repr() or S is empty, False otherwise.");
10294
10295static PyObject*
10296unicode_isprintable(PyObject *self)
10297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 Py_ssize_t i, length;
10299 int kind;
10300 void *data;
10301
10302 if (PyUnicode_READY(self) == -1)
10303 return NULL;
10304 length = PyUnicode_GET_LENGTH(self);
10305 kind = PyUnicode_KIND(self);
10306 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010307
10308 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (length == 1)
10310 return PyBool_FromLong(
10311 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 for (i = 0; i < length; i++) {
10314 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010315 Py_RETURN_FALSE;
10316 }
10317 }
10318 Py_RETURN_TRUE;
10319}
10320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010321PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010322 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323\n\
10324Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010325iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
10327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010328unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010330 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331}
10332
Martin v. Löwis18e16552006-02-15 17:27:45 +000010333static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334unicode_length(PyUnicodeObject *self)
10335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (PyUnicode_READY(self) == -1)
10337 return -1;
10338 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339}
10340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010341PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010344Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010345done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346
10347static PyObject *
10348unicode_ljust(PyUnicodeObject *self, PyObject *args)
10349{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010350 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 fillchar = ' ';
10352
10353 if (PyUnicode_READY(self) == -1)
10354 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010355
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010356 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 return NULL;
10358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360 Py_INCREF(self);
10361 return (PyObject*) self;
10362 }
10363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365}
10366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010367PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010368 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010370Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
10372static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010373unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 return fixup(self, fixlower);
10376}
10377
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
10386
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010387/* externally visible for str.strip(unicode) */
10388PyObject *
10389_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 void *data;
10392 int kind;
10393 Py_ssize_t i, j, len;
10394 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10397 return NULL;
10398
10399 kind = PyUnicode_KIND(self);
10400 data = PyUnicode_DATA(self);
10401 len = PyUnicode_GET_LENGTH(self);
10402 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10403 PyUnicode_DATA(sepobj),
10404 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010405
Benjamin Peterson14339b62009-01-31 16:36:08 +000010406 i = 0;
10407 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 while (i < len &&
10409 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010410 i++;
10411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010412 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010413
Benjamin Peterson14339b62009-01-31 16:36:08 +000010414 j = len;
10415 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 do {
10417 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 } while (j >= i &&
10419 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010421 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010422
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 Py_INCREF(self);
10425 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010426 }
10427 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010429}
10430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431/* Assumes an already ready self string. */
10432
10433static PyObject *
10434substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10435{
10436 const int kind = PyUnicode_KIND(self);
10437 void *data = PyUnicode_DATA(self);
10438 Py_UCS4 maxchar = 0;
10439 Py_ssize_t i;
10440 PyObject *unicode;
10441
10442 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10443 PyErr_BadInternalCall();
10444 return NULL;
10445 }
10446
10447 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10448 Py_INCREF(self);
10449 return (PyObject*)self;
10450 }
10451
10452 for (i = 0; i < len; ++i) {
10453 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10454 if (ch > maxchar)
10455 maxchar = ch;
10456 }
10457
10458 unicode = PyUnicode_New(len, maxchar);
10459 if (unicode == NULL)
10460 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010461 if (PyUnicode_CopyCharacters(unicode, 0,
10462 (PyObject*)self, start, len) < 0)
10463 {
10464 Py_DECREF(unicode);
10465 return NULL;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 return unicode;
10468}
10469
10470PyObject*
10471PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10472{
10473 unsigned char *data;
10474 int kind;
10475
10476 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10477 && PyUnicode_CheckExact(self))
10478 {
10479 Py_INCREF(self);
10480 return (PyObject *)self;
10481 }
10482
10483 if ((end - start) == 1)
10484 return unicode_getitem((PyUnicodeObject*)self, start);
10485
10486 if (PyUnicode_READY(self) == -1)
10487 return NULL;
10488 kind = PyUnicode_KIND(self);
10489 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010490 return PyUnicode_FromKindAndData(kind,
10491 data + PyUnicode_KIND_SIZE(kind, start),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 end-start);
10493}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
10495static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010496do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 int kind;
10499 void *data;
10500 Py_ssize_t len, i, j;
10501
10502 if (PyUnicode_READY(self) == -1)
10503 return NULL;
10504
10505 kind = PyUnicode_KIND(self);
10506 data = PyUnicode_DATA(self);
10507 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010508
Benjamin Peterson14339b62009-01-31 16:36:08 +000010509 i = 0;
10510 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010512 i++;
10513 }
10514 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010515
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 j = len;
10517 if (striptype != LEFTSTRIP) {
10518 do {
10519 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010521 j++;
10522 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010523
Benjamin Peterson14339b62009-01-31 16:36:08 +000010524 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10525 Py_INCREF(self);
10526 return (PyObject*)self;
10527 }
10528 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530}
10531
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010532
10533static PyObject *
10534do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10535{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010536 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010537
Benjamin Peterson14339b62009-01-31 16:36:08 +000010538 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10539 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010540
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541 if (sep != NULL && sep != Py_None) {
10542 if (PyUnicode_Check(sep))
10543 return _PyUnicode_XStrip(self, striptype, sep);
10544 else {
10545 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "%s arg must be None or str",
10547 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010548 return NULL;
10549 }
10550 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010551
Benjamin Peterson14339b62009-01-31 16:36:08 +000010552 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553}
10554
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010558\n\
10559Return a copy of the string S with leading and trailing\n\
10560whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010561If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010562
10563static PyObject *
10564unicode_strip(PyUnicodeObject *self, PyObject *args)
10565{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010566 if (PyTuple_GET_SIZE(args) == 0)
10567 return do_strip(self, BOTHSTRIP); /* Common case */
10568 else
10569 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010570}
10571
10572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010573PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010575\n\
10576Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010577If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010578
10579static PyObject *
10580unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10581{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582 if (PyTuple_GET_SIZE(args) == 0)
10583 return do_strip(self, LEFTSTRIP); /* Common case */
10584 else
10585 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010586}
10587
10588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010589PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010591\n\
10592Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010593If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010594
10595static PyObject *
10596unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10597{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010598 if (PyTuple_GET_SIZE(args) == 0)
10599 return do_strip(self, RIGHTSTRIP); /* Common case */
10600 else
10601 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010602}
10603
10604
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010606unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
10608 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 Py_ssize_t nchars, n;
10610 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Georg Brandl222de0f2009-04-12 12:01:50 +000010612 if (len < 1) {
10613 Py_INCREF(unicode_empty);
10614 return (PyObject *)unicode_empty;
10615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Tim Peters7a29bd52001-09-12 03:03:31 +000010617 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 /* no repeat, return original string */
10619 Py_INCREF(str);
10620 return (PyObject*) str;
10621 }
Tim Peters8f422462000-09-09 06:13:41 +000010622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (PyUnicode_READY(str) == -1)
10624 return NULL;
10625
Tim Peters8f422462000-09-09 06:13:41 +000010626 /* ensure # of chars needed doesn't overflow int and # of bytes
10627 * needed doesn't overflow size_t
10628 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 nchars = len * PyUnicode_GET_LENGTH(str);
10630 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010631 PyErr_SetString(PyExc_OverflowError,
10632 "repeated string is too long");
10633 return NULL;
10634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 char_size = PyUnicode_CHARACTER_SIZE(str);
10636 nbytes = (nchars + 1) * char_size;
10637 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010638 PyErr_SetString(PyExc_OverflowError,
10639 "repeated string is too long");
10640 return NULL;
10641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 if (!u)
10644 return NULL;
10645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (PyUnicode_GET_LENGTH(str) == 1) {
10647 const int kind = PyUnicode_KIND(str);
10648 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10649 void *to = PyUnicode_DATA(u);
10650 for (n = 0; n < len; ++n)
10651 PyUnicode_WRITE(kind, to, n, fill_char);
10652 }
10653 else {
10654 /* number of characters copied this far */
10655 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10656 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10657 char *to = (char *) PyUnicode_DATA(u);
10658 Py_MEMCPY(to, PyUnicode_DATA(str),
10659 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010660 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 n = (done <= nchars-done) ? done : nchars-done;
10662 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665 }
10666
10667 return (PyObject*) u;
10668}
10669
Alexander Belopolsky40018472011-02-26 01:02:56 +000010670PyObject *
10671PyUnicode_Replace(PyObject *obj,
10672 PyObject *subobj,
10673 PyObject *replobj,
10674 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675{
10676 PyObject *self;
10677 PyObject *str1;
10678 PyObject *str2;
10679 PyObject *result;
10680
10681 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 Py_DECREF(self);
10687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 }
10689 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 Py_DECREF(self);
10692 Py_DECREF(str1);
10693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696 Py_DECREF(self);
10697 Py_DECREF(str1);
10698 Py_DECREF(str2);
10699 return result;
10700}
10701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010702PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010703 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704\n\
10705Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010706old replaced by new. If the optional argument count is\n\
10707given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 PyObject *str1;
10713 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010714 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 PyObject *result;
10716
Martin v. Löwis18e16552006-02-15 17:27:45 +000010717 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010720 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 str1 = PyUnicode_FromObject(str1);
10722 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10723 return NULL;
10724 str2 = PyUnicode_FromObject(str2);
10725 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 Py_DECREF(str1);
10727 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730 result = replace(self, str1, str2, maxcount);
10731
10732 Py_DECREF(str1);
10733 Py_DECREF(str2);
10734 return result;
10735}
10736
Alexander Belopolsky40018472011-02-26 01:02:56 +000010737static PyObject *
10738unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010740 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 Py_ssize_t isize;
10742 Py_ssize_t osize, squote, dquote, i, o;
10743 Py_UCS4 max, quote;
10744 int ikind, okind;
10745 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010748 return NULL;
10749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 isize = PyUnicode_GET_LENGTH(unicode);
10751 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 /* Compute length of output, quote characters, and
10754 maximum character */
10755 osize = 2; /* quotes */
10756 max = 127;
10757 squote = dquote = 0;
10758 ikind = PyUnicode_KIND(unicode);
10759 for (i = 0; i < isize; i++) {
10760 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10761 switch (ch) {
10762 case '\'': squote++; osize++; break;
10763 case '"': dquote++; osize++; break;
10764 case '\\': case '\t': case '\r': case '\n':
10765 osize += 2; break;
10766 default:
10767 /* Fast-path ASCII */
10768 if (ch < ' ' || ch == 0x7f)
10769 osize += 4; /* \xHH */
10770 else if (ch < 0x7f)
10771 osize++;
10772 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10773 osize++;
10774 max = ch > max ? ch : max;
10775 }
10776 else if (ch < 0x100)
10777 osize += 4; /* \xHH */
10778 else if (ch < 0x10000)
10779 osize += 6; /* \uHHHH */
10780 else
10781 osize += 10; /* \uHHHHHHHH */
10782 }
10783 }
10784
10785 quote = '\'';
10786 if (squote) {
10787 if (dquote)
10788 /* Both squote and dquote present. Use squote,
10789 and escape them */
10790 osize += squote;
10791 else
10792 quote = '"';
10793 }
10794
10795 repr = PyUnicode_New(osize, max);
10796 if (repr == NULL)
10797 return NULL;
10798 okind = PyUnicode_KIND(repr);
10799 odata = PyUnicode_DATA(repr);
10800
10801 PyUnicode_WRITE(okind, odata, 0, quote);
10802 PyUnicode_WRITE(okind, odata, osize-1, quote);
10803
10804 for (i = 0, o = 1; i < isize; i++) {
10805 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010806
10807 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if ((ch == quote) || (ch == '\\')) {
10809 PyUnicode_WRITE(okind, odata, o++, '\\');
10810 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010811 continue;
10812 }
10813
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010815 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 PyUnicode_WRITE(okind, odata, o++, '\\');
10817 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010818 }
10819 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 PyUnicode_WRITE(okind, odata, o++, '\\');
10821 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010822 }
10823 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 PyUnicode_WRITE(okind, odata, o++, '\\');
10825 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010826 }
10827
10828 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010829 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 PyUnicode_WRITE(okind, odata, o++, '\\');
10831 PyUnicode_WRITE(okind, odata, o++, 'x');
10832 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10833 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010834 }
10835
Georg Brandl559e5d72008-06-11 18:37:52 +000010836 /* Copy ASCII characters as-is */
10837 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010839 }
10840
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010842 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010843 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010844 (categories Z* and C* except ASCII space)
10845 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010847 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (ch <= 0xff) {
10849 PyUnicode_WRITE(okind, odata, o++, '\\');
10850 PyUnicode_WRITE(okind, odata, o++, 'x');
10851 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10852 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010853 }
10854 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 else if (ch >= 0x10000) {
10856 PyUnicode_WRITE(okind, odata, o++, '\\');
10857 PyUnicode_WRITE(okind, odata, o++, 'U');
10858 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10859 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10862 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10863 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10864 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10865 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010866 }
10867 /* Map 16-bit characters to '\uxxxx' */
10868 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 PyUnicode_WRITE(okind, odata, o++, '\\');
10870 PyUnicode_WRITE(okind, odata, o++, 'u');
10871 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10872 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10873 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10874 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010875 }
10876 }
10877 /* Copy characters as-is */
10878 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010880 }
10881 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010884 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885}
10886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010887PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889\n\
10890Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010891such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892arguments start and end are interpreted as in slice notation.\n\
10893\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010894Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
10896static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898{
Jesus Ceaac451502011-04-20 17:09:23 +020010899 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010900 Py_ssize_t start;
10901 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
Jesus Ceaac451502011-04-20 17:09:23 +020010904 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10905 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (PyUnicode_READY(self) == -1)
10909 return NULL;
10910 if (PyUnicode_READY(substring) == -1)
10911 return NULL;
10912
10913 result = any_find_slice(
10914 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10915 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010916 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 if (result == -2)
10921 return NULL;
10922
Christian Heimes217cfd12007-12-02 14:31:20 +000010923 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924}
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010929Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930
10931static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933{
Jesus Ceaac451502011-04-20 17:09:23 +020010934 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010935 Py_ssize_t start;
10936 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938
Jesus Ceaac451502011-04-20 17:09:23 +020010939 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10940 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (PyUnicode_READY(self) == -1)
10944 return NULL;
10945 if (PyUnicode_READY(substring) == -1)
10946 return NULL;
10947
10948 result = any_find_slice(
10949 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10950 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010951 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
10953 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (result == -2)
10956 return NULL;
10957
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 if (result < 0) {
10959 PyErr_SetString(PyExc_ValueError, "substring not found");
10960 return NULL;
10961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962
Christian Heimes217cfd12007-12-02 14:31:20 +000010963 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964}
10965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010966PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010967 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010969Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010970done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
10972static PyObject *
10973unicode_rjust(PyUnicodeObject *self, PyObject *args)
10974{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010975 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 Py_UCS4 fillchar = ' ';
10977
10978 if (PyUnicode_READY(self) == -1)
10979 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010980
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010981 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982 return NULL;
10983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 Py_INCREF(self);
10986 return (PyObject*) self;
10987 }
10988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990}
10991
Alexander Belopolsky40018472011-02-26 01:02:56 +000010992PyObject *
10993PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994{
10995 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010996
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 s = PyUnicode_FromObject(s);
10998 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010999 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 if (sep != NULL) {
11001 sep = PyUnicode_FromObject(sep);
11002 if (sep == NULL) {
11003 Py_DECREF(s);
11004 return NULL;
11005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
11007
11008 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11009
11010 Py_DECREF(s);
11011 Py_XDECREF(sep);
11012 return result;
11013}
11014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011015PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017\n\
11018Return a list of the words in S, using sep as the\n\
11019delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011020splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011021whitespace string is a separator and empty strings are\n\
11022removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
11024static PyObject*
11025unicode_split(PyUnicodeObject *self, PyObject *args)
11026{
11027 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011028 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029
Martin v. Löwis18e16552006-02-15 17:27:45 +000011030 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 return NULL;
11032
11033 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039}
11040
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041PyObject *
11042PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11043{
11044 PyObject* str_obj;
11045 PyObject* sep_obj;
11046 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 int kind1, kind2, kind;
11048 void *buf1 = NULL, *buf2 = NULL;
11049 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011050
11051 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011054 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 Py_DECREF(str_obj);
11057 return NULL;
11058 }
11059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 kind1 = PyUnicode_KIND(str_in);
11061 kind2 = PyUnicode_KIND(sep_obj);
11062 kind = kind1 > kind2 ? kind1 : kind2;
11063 buf1 = PyUnicode_DATA(str_in);
11064 if (kind1 != kind)
11065 buf1 = _PyUnicode_AsKind(str_in, kind);
11066 if (!buf1)
11067 goto onError;
11068 buf2 = PyUnicode_DATA(sep_obj);
11069 if (kind2 != kind)
11070 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11071 if (!buf2)
11072 goto onError;
11073 len1 = PyUnicode_GET_LENGTH(str_obj);
11074 len2 = PyUnicode_GET_LENGTH(sep_obj);
11075
11076 switch(PyUnicode_KIND(str_in)) {
11077 case PyUnicode_1BYTE_KIND:
11078 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11079 break;
11080 case PyUnicode_2BYTE_KIND:
11081 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11082 break;
11083 case PyUnicode_4BYTE_KIND:
11084 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11085 break;
11086 default:
11087 assert(0);
11088 out = 0;
11089 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
11091 Py_DECREF(sep_obj);
11092 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (kind1 != kind)
11094 PyMem_Free(buf1);
11095 if (kind2 != kind)
11096 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097
11098 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 onError:
11100 Py_DECREF(sep_obj);
11101 Py_DECREF(str_obj);
11102 if (kind1 != kind && buf1)
11103 PyMem_Free(buf1);
11104 if (kind2 != kind && buf2)
11105 PyMem_Free(buf2);
11106 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011107}
11108
11109
11110PyObject *
11111PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11112{
11113 PyObject* str_obj;
11114 PyObject* sep_obj;
11115 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 int kind1, kind2, kind;
11117 void *buf1 = NULL, *buf2 = NULL;
11118 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119
11120 str_obj = PyUnicode_FromObject(str_in);
11121 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123 sep_obj = PyUnicode_FromObject(sep_in);
11124 if (!sep_obj) {
11125 Py_DECREF(str_obj);
11126 return NULL;
11127 }
11128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 kind1 = PyUnicode_KIND(str_in);
11130 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011131 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 buf1 = PyUnicode_DATA(str_in);
11133 if (kind1 != kind)
11134 buf1 = _PyUnicode_AsKind(str_in, kind);
11135 if (!buf1)
11136 goto onError;
11137 buf2 = PyUnicode_DATA(sep_obj);
11138 if (kind2 != kind)
11139 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11140 if (!buf2)
11141 goto onError;
11142 len1 = PyUnicode_GET_LENGTH(str_obj);
11143 len2 = PyUnicode_GET_LENGTH(sep_obj);
11144
11145 switch(PyUnicode_KIND(str_in)) {
11146 case PyUnicode_1BYTE_KIND:
11147 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11148 break;
11149 case PyUnicode_2BYTE_KIND:
11150 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11151 break;
11152 case PyUnicode_4BYTE_KIND:
11153 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11154 break;
11155 default:
11156 assert(0);
11157 out = 0;
11158 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011159
11160 Py_DECREF(sep_obj);
11161 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 if (kind1 != kind)
11163 PyMem_Free(buf1);
11164 if (kind2 != kind)
11165 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
11167 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 onError:
11169 Py_DECREF(sep_obj);
11170 Py_DECREF(str_obj);
11171 if (kind1 != kind && buf1)
11172 PyMem_Free(buf1);
11173 if (kind2 != kind && buf2)
11174 PyMem_Free(buf2);
11175 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011176}
11177
11178PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011181Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011183found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184
11185static PyObject*
11186unicode_partition(PyUnicodeObject *self, PyObject *separator)
11187{
11188 return PyUnicode_Partition((PyObject *)self, separator);
11189}
11190
11191PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011192 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011194Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011196separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011197
11198static PyObject*
11199unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11200{
11201 return PyUnicode_RPartition((PyObject *)self, separator);
11202}
11203
Alexander Belopolsky40018472011-02-26 01:02:56 +000011204PyObject *
11205PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011206{
11207 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011208
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011209 s = PyUnicode_FromObject(s);
11210 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011211 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (sep != NULL) {
11213 sep = PyUnicode_FromObject(sep);
11214 if (sep == NULL) {
11215 Py_DECREF(s);
11216 return NULL;
11217 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011218 }
11219
11220 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11221
11222 Py_DECREF(s);
11223 Py_XDECREF(sep);
11224 return result;
11225}
11226
11227PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011229\n\
11230Return a list of the words in S, using sep as the\n\
11231delimiter string, starting at the end of the string and\n\
11232working to the front. If maxsplit is given, at most maxsplit\n\
11233splits are done. If sep is not specified, any whitespace string\n\
11234is a separator.");
11235
11236static PyObject*
11237unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11238{
11239 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011240 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011241
Martin v. Löwis18e16552006-02-15 17:27:45 +000011242 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011243 return NULL;
11244
11245 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011247 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011249 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011251}
11252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255\n\
11256Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011257Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011261unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011263 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011264 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011266 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11267 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 return NULL;
11269
Guido van Rossum86662912000-04-11 15:38:46 +000011270 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271}
11272
11273static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011274PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Walter Dörwald346737f2007-05-31 10:44:43 +000011276 if (PyUnicode_CheckExact(self)) {
11277 Py_INCREF(self);
11278 return self;
11279 } else
11280 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011281 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282}
11283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011284PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286\n\
11287Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011288and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
11290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011291unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 return fixup(self, fixswapcase);
11294}
11295
Georg Brandlceee0772007-11-27 23:48:05 +000011296PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011298\n\
11299Return a translation table usable for str.translate().\n\
11300If there is only one argument, it must be a dictionary mapping Unicode\n\
11301ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011302Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011303If there are two arguments, they must be strings of equal length, and\n\
11304in the resulting dictionary, each character in x will be mapped to the\n\
11305character at the same position in y. If there is a third argument, it\n\
11306must be a string, whose characters will be mapped to None in the result.");
11307
11308static PyObject*
11309unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11310{
11311 PyObject *x, *y = NULL, *z = NULL;
11312 PyObject *new = NULL, *key, *value;
11313 Py_ssize_t i = 0;
11314 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011315
Georg Brandlceee0772007-11-27 23:48:05 +000011316 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11317 return NULL;
11318 new = PyDict_New();
11319 if (!new)
11320 return NULL;
11321 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 int x_kind, y_kind, z_kind;
11323 void *x_data, *y_data, *z_data;
11324
Georg Brandlceee0772007-11-27 23:48:05 +000011325 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011326 if (!PyUnicode_Check(x)) {
11327 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11328 "be a string if there is a second argument");
11329 goto err;
11330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011332 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11333 "arguments must have equal length");
11334 goto err;
11335 }
11336 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 x_kind = PyUnicode_KIND(x);
11338 y_kind = PyUnicode_KIND(y);
11339 x_data = PyUnicode_DATA(x);
11340 y_data = PyUnicode_DATA(y);
11341 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11342 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11343 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011344 if (!key || !value)
11345 goto err;
11346 res = PyDict_SetItem(new, key, value);
11347 Py_DECREF(key);
11348 Py_DECREF(value);
11349 if (res < 0)
11350 goto err;
11351 }
11352 /* create entries for deleting chars in z */
11353 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 z_kind = PyUnicode_KIND(z);
11355 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011356 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011358 if (!key)
11359 goto err;
11360 res = PyDict_SetItem(new, key, Py_None);
11361 Py_DECREF(key);
11362 if (res < 0)
11363 goto err;
11364 }
11365 }
11366 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 int kind;
11368 void *data;
11369
Georg Brandlceee0772007-11-27 23:48:05 +000011370 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011371 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011372 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11373 "to maketrans it must be a dict");
11374 goto err;
11375 }
11376 /* copy entries into the new dict, converting string keys to int keys */
11377 while (PyDict_Next(x, &i, &key, &value)) {
11378 if (PyUnicode_Check(key)) {
11379 /* convert string keys to integer keys */
11380 PyObject *newkey;
11381 if (PyUnicode_GET_SIZE(key) != 1) {
11382 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11383 "table must be of length 1");
11384 goto err;
11385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 kind = PyUnicode_KIND(key);
11387 data = PyUnicode_DATA(key);
11388 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011389 if (!newkey)
11390 goto err;
11391 res = PyDict_SetItem(new, newkey, value);
11392 Py_DECREF(newkey);
11393 if (res < 0)
11394 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011395 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011396 /* just keep integer keys */
11397 if (PyDict_SetItem(new, key, value) < 0)
11398 goto err;
11399 } else {
11400 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11401 "be strings or integers");
11402 goto err;
11403 }
11404 }
11405 }
11406 return new;
11407 err:
11408 Py_DECREF(new);
11409 return NULL;
11410}
11411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414\n\
11415Return a copy of the string S, where all characters have been mapped\n\
11416through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011417Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011418Unmapped characters are left untouched. Characters mapped to None\n\
11419are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
11421static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425}
11426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011427PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
11432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011433unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 return fixup(self, fixupper);
11436}
11437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011441Pad a numeric string S with zeros on the left, to fill a field\n\
11442of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject *
11445unicode_zfill(PyUnicodeObject *self, PyObject *args)
11446{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011447 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011449 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 int kind;
11451 void *data;
11452 Py_UCS4 chr;
11453
11454 if (PyUnicode_READY(self) == -1)
11455 return NULL;
11456
Martin v. Löwis18e16552006-02-15 17:27:45 +000011457 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 return NULL;
11459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011461 if (PyUnicode_CheckExact(self)) {
11462 Py_INCREF(self);
11463 return (PyObject*) self;
11464 }
11465 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011466 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 }
11468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
11471 u = pad(self, fill, 0, '0');
11472
Walter Dörwald068325e2002-04-15 13:36:47 +000011473 if (u == NULL)
11474 return NULL;
11475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 kind = PyUnicode_KIND(u);
11477 data = PyUnicode_DATA(u);
11478 chr = PyUnicode_READ(kind, data, fill);
11479
11480 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 PyUnicode_WRITE(kind, data, 0, chr);
11483 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 }
11485
11486 return (PyObject*) u;
11487}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
#if 0
/* Compiled out.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011500Return True if S starts with the specified prefix, False otherwise.\n\
11501With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011502With optional end, stop comparing S at that position.\n\
11503prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject *
11506unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011509 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011511 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011512 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011513 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Jesus Ceaac451502011-04-20 17:09:23 +020011515 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011517 if (PyTuple_Check(subobj)) {
11518 Py_ssize_t i;
11519 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11520 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011522 if (substring == NULL)
11523 return NULL;
11524 result = tailmatch(self, substring, start, end, -1);
11525 Py_DECREF(substring);
11526 if (result) {
11527 Py_RETURN_TRUE;
11528 }
11529 }
11530 /* nothing matched */
11531 Py_RETURN_FALSE;
11532 }
11533 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011534 if (substring == NULL) {
11535 if (PyErr_ExceptionMatches(PyExc_TypeError))
11536 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11537 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011539 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011540 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543}
11544
11545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011549Return True if S ends with the specified suffix, False otherwise.\n\
11550With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011551With optional end, stop comparing S at that position.\n\
11552suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553
11554static PyObject *
11555unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011558 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011560 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011561 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011562 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
Jesus Ceaac451502011-04-20 17:09:23 +020011564 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011566 if (PyTuple_Check(subobj)) {
11567 Py_ssize_t i;
11568 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11569 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011571 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011573 result = tailmatch(self, substring, start, end, +1);
11574 Py_DECREF(substring);
11575 if (result) {
11576 Py_RETURN_TRUE;
11577 }
11578 }
11579 Py_RETURN_FALSE;
11580 }
11581 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011582 if (substring == NULL) {
11583 if (PyErr_ExceptionMatches(PyExc_TypeError))
11584 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11585 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011587 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011588 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011590 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011594
11595PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011597\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011598Return a formatted version of S, using substitutions from args and kwargs.\n\
11599The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011600
Eric Smith27bbca62010-11-04 17:06:58 +000011601PyDoc_STRVAR(format_map__doc__,
11602 "S.format_map(mapping) -> str\n\
11603\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011604Return a formatted version of S, using substitutions from mapping.\n\
11605The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011606
Eric Smith4a7d76d2008-05-30 18:10:19 +000011607static PyObject *
11608unicode__format__(PyObject* self, PyObject* args)
11609{
11610 PyObject *format_spec;
11611
11612 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11613 return NULL;
11614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11616 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011617}
11618
Eric Smith8c663262007-08-25 02:26:07 +000011619PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011621\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011622Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011623
11624static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011625unicode__sizeof__(PyUnicodeObject *v)
11626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 Py_ssize_t size;
11628
11629 /* If it's a compact object, account for base structure +
11630 character data. */
11631 if (PyUnicode_IS_COMPACT_ASCII(v))
11632 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11633 else if (PyUnicode_IS_COMPACT(v))
11634 size = sizeof(PyCompactUnicodeObject) +
11635 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11636 else {
11637 /* If it is a two-block object, account for base object, and
11638 for character block if present. */
11639 size = sizeof(PyUnicodeObject);
11640 if (v->data.any)
11641 size += (PyUnicode_GET_LENGTH(v) + 1) *
11642 PyUnicode_CHARACTER_SIZE(v);
11643 }
11644 /* If the wstr pointer is present, account for it unless it is shared
11645 with the data pointer. Since PyUnicode_DATA will crash if the object
11646 is not ready, check whether it's either not ready (in which case the
11647 data is entirely in wstr) or if the data is not shared. */
11648 if (_PyUnicode_WSTR(v) &&
11649 (!PyUnicode_IS_READY(v) ||
11650 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11651 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
11652 if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
11653 size += _PyUnicode_UTF8_LENGTH(v) + 1;
11654
11655 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011656}
11657
11658PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011660
11661static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011662unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011663{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011664 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (!copy)
11666 return NULL;
11667 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011668}
11669
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670static PyMethodDef unicode_methods[] = {
11671
11672 /* Order is according to common usage: often used methods should
11673 appear first, since lookup is done sequentially. */
11674
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000011675 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011676 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
11677 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011678 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011679 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
11680 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
11681 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
11682 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
11683 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
11684 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
11685 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011686 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011687 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
11688 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
11689 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011691 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
11692 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
11693 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000011695 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011696 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011698 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
11699 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
11700 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
11701 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
11702 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
11703 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
11704 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
11705 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
11706 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
11707 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
11708 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
11709 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
11710 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
11711 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000011712 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000011713 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011714 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000011715 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000011716 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000011717 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000011718 {"maketrans", (PyCFunction) unicode_maketrans,
11719 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011720 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000011721#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011722 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723#endif
11724
11725#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011726 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011727 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728#endif
11729
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 {NULL, NULL}
11732};
11733
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011734static PyObject *
11735unicode_mod(PyObject *v, PyObject *w)
11736{
Brian Curtindfc80e32011-08-10 20:28:54 -050011737 if (!PyUnicode_Check(v))
11738 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011740}
11741
11742static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 0, /*nb_add*/
11744 0, /*nb_subtract*/
11745 0, /*nb_multiply*/
11746 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011747};
11748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011750 (lenfunc) unicode_length, /* sq_length */
11751 PyUnicode_Concat, /* sq_concat */
11752 (ssizeargfunc) unicode_repeat, /* sq_repeat */
11753 (ssizeargfunc) unicode_getitem, /* sq_item */
11754 0, /* sq_slice */
11755 0, /* sq_ass_item */
11756 0, /* sq_ass_slice */
11757 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758};
11759
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011760static PyObject*
11761unicode_subscript(PyUnicodeObject* self, PyObject* item)
11762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011766 if (PyIndex_Check(item)) {
11767 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011768 if (i == -1 && PyErr_Occurred())
11769 return NULL;
11770 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011772 return unicode_getitem(self, i);
11773 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011776 Py_UNICODE* result_buf;
11777 PyObject* result;
11778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011781 return NULL;
11782 }
11783
11784 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return PyUnicode_New(0, 0);
11786 } else if (start == 0 && step == 1 &&
11787 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011788 PyUnicode_CheckExact(self)) {
11789 Py_INCREF(self);
11790 return (PyObject *)self;
11791 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011793 } else {
11794 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011795 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11796 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011797
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 if (result_buf == NULL)
11799 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011800
11801 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11802 result_buf[i] = source_buf[cur];
11803 }
Tim Petersced69f82003-09-16 20:30:58 +000011804
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011805 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011806 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011807 return result;
11808 }
11809 } else {
11810 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11811 return NULL;
11812 }
11813}
11814
11815static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 (lenfunc)unicode_length, /* mp_length */
11817 (binaryfunc)unicode_subscript, /* mp_subscript */
11818 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011819};
11820
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822/* Helpers for PyUnicode_Format() */
11823
11824static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011825getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011827 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 (*p_argidx)++;
11830 if (arglen < 0)
11831 return args;
11832 else
11833 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 }
11835 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 return NULL;
11838}
11839
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011840/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842static PyObject *
11843formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011845 char *p;
11846 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011848
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 x = PyFloat_AsDouble(v);
11850 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011851 return NULL;
11852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011855
Eric Smith0923d1d2009-04-16 20:16:10 +000011856 p = PyOS_double_to_string(x, type, prec,
11857 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011858 if (p == NULL)
11859 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011861 PyMem_Free(p);
11862 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863}
11864
Tim Peters38fd5b62000-09-21 05:43:11 +000011865static PyObject*
11866formatlong(PyObject *val, int flags, int prec, int type)
11867{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011868 char *buf;
11869 int len;
11870 PyObject *str; /* temporary string object. */
11871 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011872
Benjamin Peterson14339b62009-01-31 16:36:08 +000011873 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11874 if (!str)
11875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011877 Py_DECREF(str);
11878 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011879}
11880
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011883 size_t buflen,
11884 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011886 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011887 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (PyUnicode_GET_LENGTH(v) == 1) {
11889 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 buf[1] = '\0';
11891 return 1;
11892 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 goto onError;
11894 }
11895 else {
11896 /* Integer input truncated to a character */
11897 long x;
11898 x = PyLong_AsLong(v);
11899 if (x == -1 && PyErr_Occurred())
11900 goto onError;
11901
11902 if (x < 0 || x > 0x10ffff) {
11903 PyErr_SetString(PyExc_OverflowError,
11904 "%c arg not in range(0x110000)");
11905 return -1;
11906 }
11907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011909 buf[1] = '\0';
11910 return 1;
11911 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011912
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011914 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011916 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011923
Alexander Belopolsky40018472011-02-26 01:02:56 +000011924PyObject *
11925PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 void *fmt;
11928 int fmtkind;
11929 PyObject *result;
11930 Py_UCS4 *res, *res0;
11931 Py_UCS4 max;
11932 int kind;
11933 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011937
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 PyErr_BadInternalCall();
11940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11943 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 fmt = PyUnicode_DATA(uformat);
11946 fmtkind = PyUnicode_KIND(uformat);
11947 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11948 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949
11950 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11952 if (res0 == NULL) {
11953 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
11957 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 arglen = PyTuple_Size(args);
11959 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 }
11961 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 arglen = -1;
11963 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011965 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011966 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
11969 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 if (--rescnt < 0) {
11972 rescnt = fmtcnt + 100;
11973 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11975 if (res0 == NULL){
11976 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 }
11979 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 }
11984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 /* Got a format specifier */
11986 int flags = 0;
11987 Py_ssize_t width = -1;
11988 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 Py_UCS4 c = '\0';
11990 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 int isnumok;
11992 PyObject *v = NULL;
11993 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 void *pbuf;
11995 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 Py_ssize_t len, len1;
11998 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 fmtpos++;
12001 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12002 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 Py_ssize_t keylen;
12004 PyObject *key;
12005 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012006
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 if (dict == NULL) {
12008 PyErr_SetString(PyExc_TypeError,
12009 "format requires a mapping");
12010 goto onError;
12011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 /* Skip over balanced parentheses */
12016 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 if (fmtcnt < 0 || pcount > 0) {
12025 PyErr_SetString(PyExc_ValueError,
12026 "incomplete format key");
12027 goto onError;
12028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 if (key == NULL)
12031 goto onError;
12032 if (args_owned) {
12033 Py_DECREF(args);
12034 args_owned = 0;
12035 }
12036 args = PyObject_GetItem(dict, key);
12037 Py_DECREF(key);
12038 if (args == NULL) {
12039 goto onError;
12040 }
12041 args_owned = 1;
12042 arglen = -1;
12043 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 case '-': flags |= F_LJUST; continue;
12048 case '+': flags |= F_SIGN; continue;
12049 case ' ': flags |= F_BLANK; continue;
12050 case '#': flags |= F_ALT; continue;
12051 case '0': flags |= F_ZERO; continue;
12052 }
12053 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012054 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 if (c == '*') {
12056 v = getnextarg(args, arglen, &argidx);
12057 if (v == NULL)
12058 goto onError;
12059 if (!PyLong_Check(v)) {
12060 PyErr_SetString(PyExc_TypeError,
12061 "* wants int");
12062 goto onError;
12063 }
12064 width = PyLong_AsLong(v);
12065 if (width == -1 && PyErr_Occurred())
12066 goto onError;
12067 if (width < 0) {
12068 flags |= F_LJUST;
12069 width = -width;
12070 }
12071 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 }
12074 else if (c >= '0' && c <= '9') {
12075 width = c - '0';
12076 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 if (c < '0' || c > '9')
12079 break;
12080 if ((width*10) / 10 != width) {
12081 PyErr_SetString(PyExc_ValueError,
12082 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012083 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 }
12085 width = width*10 + (c - '0');
12086 }
12087 }
12088 if (c == '.') {
12089 prec = 0;
12090 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 if (c == '*') {
12093 v = getnextarg(args, arglen, &argidx);
12094 if (v == NULL)
12095 goto onError;
12096 if (!PyLong_Check(v)) {
12097 PyErr_SetString(PyExc_TypeError,
12098 "* wants int");
12099 goto onError;
12100 }
12101 prec = PyLong_AsLong(v);
12102 if (prec == -1 && PyErr_Occurred())
12103 goto onError;
12104 if (prec < 0)
12105 prec = 0;
12106 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012108 }
12109 else if (c >= '0' && c <= '9') {
12110 prec = c - '0';
12111 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 if (c < '0' || c > '9')
12114 break;
12115 if ((prec*10) / 10 != prec) {
12116 PyErr_SetString(PyExc_ValueError,
12117 "prec too big");
12118 goto onError;
12119 }
12120 prec = prec*10 + (c - '0');
12121 }
12122 }
12123 } /* prec */
12124 if (fmtcnt >= 0) {
12125 if (c == 'h' || c == 'l' || c == 'L') {
12126 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 }
12129 }
12130 if (fmtcnt < 0) {
12131 PyErr_SetString(PyExc_ValueError,
12132 "incomplete format");
12133 goto onError;
12134 }
12135 if (c != '%') {
12136 v = getnextarg(args, arglen, &argidx);
12137 if (v == NULL)
12138 goto onError;
12139 }
12140 sign = 0;
12141 fill = ' ';
12142 switch (c) {
12143
12144 case '%':
12145 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 len = 1;
12150 break;
12151
12152 case 's':
12153 case 'r':
12154 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012155 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 temp = v;
12157 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012158 }
12159 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 if (c == 's')
12161 temp = PyObject_Str(v);
12162 else if (c == 'r')
12163 temp = PyObject_Repr(v);
12164 else
12165 temp = PyObject_ASCII(v);
12166 if (temp == NULL)
12167 goto onError;
12168 if (PyUnicode_Check(temp))
12169 /* nothing to do */;
12170 else {
12171 Py_DECREF(temp);
12172 PyErr_SetString(PyExc_TypeError,
12173 "%s argument has non-string str()");
12174 goto onError;
12175 }
12176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (PyUnicode_READY(temp) == -1) {
12178 Py_CLEAR(temp);
12179 goto onError;
12180 }
12181 pbuf = PyUnicode_DATA(temp);
12182 kind = PyUnicode_KIND(temp);
12183 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 if (prec >= 0 && len > prec)
12185 len = prec;
12186 break;
12187
12188 case 'i':
12189 case 'd':
12190 case 'u':
12191 case 'o':
12192 case 'x':
12193 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 isnumok = 0;
12195 if (PyNumber_Check(v)) {
12196 PyObject *iobj=NULL;
12197
12198 if (PyLong_Check(v)) {
12199 iobj = v;
12200 Py_INCREF(iobj);
12201 }
12202 else {
12203 iobj = PyNumber_Long(v);
12204 }
12205 if (iobj!=NULL) {
12206 if (PyLong_Check(iobj)) {
12207 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012208 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 Py_DECREF(iobj);
12210 if (!temp)
12211 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 if (PyUnicode_READY(temp) == -1) {
12213 Py_CLEAR(temp);
12214 goto onError;
12215 }
12216 pbuf = PyUnicode_DATA(temp);
12217 kind = PyUnicode_KIND(temp);
12218 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 sign = 1;
12220 }
12221 else {
12222 Py_DECREF(iobj);
12223 }
12224 }
12225 }
12226 if (!isnumok) {
12227 PyErr_Format(PyExc_TypeError,
12228 "%%%c format: a number is required, "
12229 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12230 goto onError;
12231 }
12232 if (flags & F_ZERO)
12233 fill = '0';
12234 break;
12235
12236 case 'e':
12237 case 'E':
12238 case 'f':
12239 case 'F':
12240 case 'g':
12241 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012242 temp = formatfloat(v, flags, prec, c);
12243 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (PyUnicode_READY(temp) == -1) {
12246 Py_CLEAR(temp);
12247 goto onError;
12248 }
12249 pbuf = PyUnicode_DATA(temp);
12250 kind = PyUnicode_KIND(temp);
12251 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 sign = 1;
12253 if (flags & F_ZERO)
12254 fill = '0';
12255 break;
12256
12257 case 'c':
12258 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012260 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 if (len < 0)
12262 goto onError;
12263 break;
12264
12265 default:
12266 PyErr_Format(PyExc_ValueError,
12267 "unsupported format character '%c' (0x%x) "
12268 "at index %zd",
12269 (31<=c && c<=126) ? (char)c : '?',
12270 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 goto onError;
12273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 /* pbuf is initialized here. */
12275 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12278 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12279 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 len--;
12281 }
12282 else if (flags & F_SIGN)
12283 sign = '+';
12284 else if (flags & F_BLANK)
12285 sign = ' ';
12286 else
12287 sign = 0;
12288 }
12289 if (width < len)
12290 width = len;
12291 if (rescnt - (sign != 0) < width) {
12292 reslen -= rescnt;
12293 rescnt = width + fmtcnt + 100;
12294 reslen += rescnt;
12295 if (reslen < 0) {
12296 Py_XDECREF(temp);
12297 PyErr_NoMemory();
12298 goto onError;
12299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12301 if (res0 == 0) {
12302 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 Py_XDECREF(temp);
12304 goto onError;
12305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 }
12308 if (sign) {
12309 if (fill != ' ')
12310 *res++ = sign;
12311 rescnt--;
12312 if (width > len)
12313 width--;
12314 }
12315 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12317 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12320 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 }
12322 rescnt -= 2;
12323 width -= 2;
12324 if (width < 0)
12325 width = 0;
12326 len -= 2;
12327 }
12328 if (width > len && !(flags & F_LJUST)) {
12329 do {
12330 --rescnt;
12331 *res++ = fill;
12332 } while (--width > len);
12333 }
12334 if (fill == ' ') {
12335 if (sign)
12336 *res++ = sign;
12337 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12339 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12340 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12341 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012342 }
12343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 /* Copy all characters, preserving len */
12345 len1 = len;
12346 while (len1--) {
12347 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12348 rescnt--;
12349 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 while (--width >= len) {
12351 --rescnt;
12352 *res++ = ' ';
12353 }
12354 if (dict && (argidx < arglen) && c != '%') {
12355 PyErr_SetString(PyExc_TypeError,
12356 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012357 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 goto onError;
12359 }
12360 Py_XDECREF(temp);
12361 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 } /* until end */
12363 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 PyErr_SetString(PyExc_TypeError,
12365 "not all arguments converted during string formatting");
12366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 }
12368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369
12370 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12371 if (*res > max)
12372 max = *res;
12373 result = PyUnicode_New(reslen - rescnt, max);
12374 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 kind = PyUnicode_KIND(result);
12377 for (res = res0; res < res0+reslen-rescnt; res++)
12378 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12379 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 }
12383 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 return (PyObject *)result;
12385
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 Py_DECREF(uformat);
12389 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
12392 return NULL;
12393}
12394
Jeremy Hylton938ace62002-07-17 16:30:39 +000012395static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012396unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12397
Tim Peters6d6c1a32001-08-02 04:15:00 +000012398static PyObject *
12399unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12400{
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 static char *kwlist[] = {"object", "encoding", "errors", 0};
12403 char *encoding = NULL;
12404 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012405
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 if (type != &PyUnicode_Type)
12407 return unicode_subtype_new(type, args, kwds);
12408 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012410 return NULL;
12411 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 if (encoding == NULL && errors == NULL)
12414 return PyObject_Str(x);
12415 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012417}
12418
Guido van Rossume023fe02001-08-30 03:12:59 +000012419static PyObject *
12420unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12421{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 PyUnicodeObject *tmp, *pnew;
12423 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012425
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12427 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12428 if (tmp == NULL)
12429 return NULL;
12430 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12432 // it seems kind of strange that tp_alloc gets passed the size
12433 // of the unicode string because there will follow another
12434 // malloc.
12435 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12436 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012437 if (pnew == NULL) {
12438 Py_DECREF(tmp);
12439 return NULL;
12440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12442 if (_PyUnicode_WSTR(pnew) == NULL) {
12443 err = PyErr_NoMemory();
12444 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12447 _PyUnicode_WSTR_LENGTH(pnew) = n;
12448 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12449 _PyUnicode_STATE(pnew).interned = 0;
12450 _PyUnicode_STATE(pnew).kind = 0;
12451 _PyUnicode_STATE(pnew).compact = 0;
12452 _PyUnicode_STATE(pnew).ready = 0;
12453 _PyUnicode_STATE(pnew).ascii = 0;
12454 pnew->data.any = NULL;
12455 _PyUnicode_LENGTH(pnew) = 0;
12456 pnew->_base.utf8 = NULL;
12457 pnew->_base.utf8_length = 0;
12458
12459 if (PyUnicode_READY(pnew) == -1) {
12460 PyObject_FREE(_PyUnicode_WSTR(pnew));
12461 goto onError;
12462 }
12463
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 Py_DECREF(tmp);
12465 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466
12467 onError:
12468 _Py_ForgetReference((PyObject *)pnew);
12469 PyObject_Del(pnew);
12470 Py_DECREF(tmp);
12471 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012472}
12473
/* Docstring of the str type; referenced as tp_doc in PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012474PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012476\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012477Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012478encoding defaults to the current default string encoding.\n\
12479errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012480
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012481static PyObject *unicode_iter(PyObject *seq);
12482
/* Type object for str.  Instances are created through unicode_new (tp_new),
   iterated through unicode_iter (tp_iter) and released with PyObject_Del
   (tp_free); the type may be subclassed (Py_TPFLAGS_BASETYPE). */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012484 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012485 "str", /* tp_name */
12486 sizeof(PyUnicodeObject), /* tp_basicsize */
12487 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 (destructor)unicode_dealloc, /* tp_dealloc */
12490 0, /* tp_print */
12491 0, /* tp_getattr */
12492 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012493 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012494 unicode_repr, /* tp_repr */
12495 &unicode_as_number, /* tp_as_number */
12496 &unicode_as_sequence, /* tp_as_sequence */
12497 &unicode_as_mapping, /* tp_as_mapping */
12498 (hashfunc) unicode_hash, /* tp_hash*/
12499 0, /* tp_call*/
12500 (reprfunc) unicode_str, /* tp_str */
12501 PyObject_GenericGetAttr, /* tp_getattro */
12502 0, /* tp_setattro */
12503 0, /* tp_as_buffer */
12504 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012506 unicode_doc, /* tp_doc */
12507 0, /* tp_traverse */
12508 0, /* tp_clear */
12509 PyUnicode_RichCompare, /* tp_richcompare */
12510 0, /* tp_weaklistoffset */
12511 unicode_iter, /* tp_iter */
12512 0, /* tp_iternext */
12513 unicode_methods, /* tp_methods */
12514 0, /* tp_members */
12515 0, /* tp_getset */
12516 &PyBaseObject_Type, /* tp_base */
12517 0, /* tp_dict */
12518 0, /* tp_descr_get */
12519 0, /* tp_descr_set */
12520 0, /* tp_dictoffset */
12521 0, /* tp_init */
12522 0, /* tp_alloc */
12523 unicode_new, /* tp_new */
12524 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525};
12526
12527/* Initialize the Unicode implementation */
12528
Thomas Wouters78890102000-07-22 19:25:51 +000012529void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012531 int i;
12532
Thomas Wouters477c8d52006-05-27 19:21:47 +000012533 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012535 0x000A, /* LINE FEED */
12536 0x000D, /* CARRIAGE RETURN */
12537 0x001C, /* FILE SEPARATOR */
12538 0x001D, /* GROUP SEPARATOR */
12539 0x001E, /* RECORD SEPARATOR */
12540 0x0085, /* NEXT LINE */
12541 0x2028, /* LINE SEPARATOR */
12542 0x2029, /* PARAGRAPH SEPARATOR */
12543 };
12544
Fred Drakee4315f52000-05-09 19:53:39 +000012545 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012547 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012549
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012550 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012552 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012554
12555 /* initialize the linebreak bloom filter */
12556 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012558 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012559
12560 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
12562
12563/* Finalize the Unicode implementation */
12564
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12570
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571void
Thomas Wouters78890102000-07-22 19:25:51 +000012572_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012574 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012576 Py_XDECREF(unicode_empty);
12577 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012578
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012579 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 if (unicode_latin1[i]) {
12581 Py_DECREF(unicode_latin1[i]);
12582 unicode_latin1[i] = NULL;
12583 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012584 }
Christian Heimesa156e092008-02-16 07:38:31 +000012585 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012587
Walter Dörwald16807132007-05-25 13:52:07 +000012588void
12589PyUnicode_InternInPlace(PyObject **p)
12590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12592 PyObject *t;
12593 if (s == NULL || !PyUnicode_Check(s))
12594 Py_FatalError(
12595 "PyUnicode_InternInPlace: unicode strings only please!");
12596 /* If it's a subclass, we don't really know what putting
12597 it in the interned dict might do. */
12598 if (!PyUnicode_CheckExact(s))
12599 return;
12600 if (PyUnicode_CHECK_INTERNED(s))
12601 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 if (PyUnicode_READY(s) == -1) {
12603 assert(0 && "ready fail in intern...");
12604 return;
12605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606 if (interned == NULL) {
12607 interned = PyDict_New();
12608 if (interned == NULL) {
12609 PyErr_Clear(); /* Don't leave an exception */
12610 return;
12611 }
12612 }
12613 /* It might be that the GetItem call fails even
12614 though the key is present in the dictionary,
12615 namely when this happens during a stack overflow. */
12616 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012618 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012619
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 if (t) {
12621 Py_INCREF(t);
12622 Py_DECREF(*p);
12623 *p = t;
12624 return;
12625 }
Walter Dörwald16807132007-05-25 13:52:07 +000012626
Benjamin Peterson14339b62009-01-31 16:36:08 +000012627 PyThreadState_GET()->recursion_critical = 1;
12628 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12629 PyErr_Clear();
12630 PyThreadState_GET()->recursion_critical = 0;
12631 return;
12632 }
12633 PyThreadState_GET()->recursion_critical = 0;
12634 /* The two references in interned are not counted by refcnt.
12635 The deallocator will take care of this */
12636 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012638}
12639
12640void
12641PyUnicode_InternImmortal(PyObject **p)
12642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12644
Benjamin Peterson14339b62009-01-31 16:36:08 +000012645 PyUnicode_InternInPlace(p);
12646 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012648 Py_INCREF(*p);
12649 }
Walter Dörwald16807132007-05-25 13:52:07 +000012650}
12651
12652PyObject *
12653PyUnicode_InternFromString(const char *cp)
12654{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012655 PyObject *s = PyUnicode_FromString(cp);
12656 if (s == NULL)
12657 return NULL;
12658 PyUnicode_InternInPlace(&s);
12659 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012660}
12661
Alexander Belopolsky40018472011-02-26 01:02:56 +000012662void
12663_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012664{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012665 PyObject *keys;
12666 PyUnicodeObject *s;
12667 Py_ssize_t i, n;
12668 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012669
Benjamin Peterson14339b62009-01-31 16:36:08 +000012670 if (interned == NULL || !PyDict_Check(interned))
12671 return;
12672 keys = PyDict_Keys(interned);
12673 if (keys == NULL || !PyList_Check(keys)) {
12674 PyErr_Clear();
12675 return;
12676 }
Walter Dörwald16807132007-05-25 13:52:07 +000012677
Benjamin Peterson14339b62009-01-31 16:36:08 +000012678 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12679 detector, interned unicode strings are not forcibly deallocated;
12680 rather, we give them their stolen references back, and then clear
12681 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012682
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 n = PyList_GET_SIZE(keys);
12684 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012686 for (i = 0; i < n; i++) {
12687 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (PyUnicode_READY(s) == -1)
12689 fprintf(stderr, "could not ready string\n");
12690 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 case SSTATE_NOT_INTERNED:
12692 /* XXX Shouldn't happen */
12693 break;
12694 case SSTATE_INTERNED_IMMORTAL:
12695 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 break;
12698 case SSTATE_INTERNED_MORTAL:
12699 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012701 break;
12702 default:
12703 Py_FatalError("Inconsistent interned string state.");
12704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 }
12707 fprintf(stderr, "total size of all interned strings: "
12708 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12709 "mortal/immortal\n", mortal_size, immortal_size);
12710 Py_DECREF(keys);
12711 PyDict_Clear(interned);
12712 Py_DECREF(interned);
12713 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012714}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012715
12716
12717/********************* Unicode Iterator **************************/
12718
/* State of a str iterator: the string being walked plus the index of the
   next character that unicodeiter_next() will read. */
12719typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 PyObject_HEAD
12721 Py_ssize_t it_index;
12722 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012723} unicodeiterobject;
12724
12725static void
12726unicodeiter_dealloc(unicodeiterobject *it)
12727{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 _PyObject_GC_UNTRACK(it);
12729 Py_XDECREF(it->it_seq);
12730 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012731}
12732
12733static int
12734unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12735{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 Py_VISIT(it->it_seq);
12737 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012738}
12739
12740static PyObject *
12741unicodeiter_next(unicodeiterobject *it)
12742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 PyUnicodeObject *seq;
12744 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012745
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 assert(it != NULL);
12747 seq = it->it_seq;
12748 if (seq == NULL)
12749 return NULL;
12750 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12753 int kind = PyUnicode_KIND(seq);
12754 void *data = PyUnicode_DATA(seq);
12755 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12756 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 if (item != NULL)
12758 ++it->it_index;
12759 return item;
12760 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012761
Benjamin Peterson14339b62009-01-31 16:36:08 +000012762 Py_DECREF(seq);
12763 it->it_seq = NULL;
12764 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012765}
12766
12767static PyObject *
12768unicodeiter_len(unicodeiterobject *it)
12769{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 Py_ssize_t len = 0;
12771 if (it->it_seq)
12772 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12773 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012774}
12775
/* Docstring and method table of the str iterator; the only method is
   __length_hint__, implemented by unicodeiter_len(). */
12776PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12777
12778static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012782};
12783
/* Type object of the iterator returned by unicode_iter().  It is a
   GC-enabled type (Py_TPFLAGS_HAVE_GC) whose tp_iter returns the iterator
   itself and whose tp_iternext is unicodeiter_next(). */
12784PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012785 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12786 "str_iterator", /* tp_name */
12787 sizeof(unicodeiterobject), /* tp_basicsize */
12788 0, /* tp_itemsize */
12789 /* methods */
12790 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12791 0, /* tp_print */
12792 0, /* tp_getattr */
12793 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012794 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012795 0, /* tp_repr */
12796 0, /* tp_as_number */
12797 0, /* tp_as_sequence */
12798 0, /* tp_as_mapping */
12799 0, /* tp_hash */
12800 0, /* tp_call */
12801 0, /* tp_str */
12802 PyObject_GenericGetAttr, /* tp_getattro */
12803 0, /* tp_setattro */
12804 0, /* tp_as_buffer */
12805 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12806 0, /* tp_doc */
12807 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12808 0, /* tp_clear */
12809 0, /* tp_richcompare */
12810 0, /* tp_weaklistoffset */
12811 PyObject_SelfIter, /* tp_iter */
12812 (iternextfunc)unicodeiter_next, /* tp_iternext */
12813 unicodeiter_methods, /* tp_methods */
12814 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012815};
12816
12817static PyObject *
12818unicode_iter(PyObject *seq)
12819{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012821
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 if (!PyUnicode_Check(seq)) {
12823 PyErr_BadInternalCall();
12824 return NULL;
12825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 if (PyUnicode_READY(seq) == -1)
12827 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12829 if (it == NULL)
12830 return NULL;
12831 it->it_index = 0;
12832 Py_INCREF(seq);
12833 it->it_seq = (PyUnicodeObject *)seq;
12834 _PyObject_GC_TRACK(it);
12835 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012836}
12837
/* uniops.h is included twice as a template: first with UNIOP/UNIOP_t bound
   to the Py_UNICODE_ prefix and the Py_UNICODE type, then to the Py_UCS4_
   prefix and the Py_UCS4 type.  Presumably each inclusion emits one family
   of helpers -- uniops.h is not visible here, confirm against that header. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838#define UNIOP(x) Py_UNICODE_##x
12839#define UNIOP_t Py_UNICODE
12840#include "uniops.h"
12841#undef UNIOP
12842#undef UNIOP_t
12843#define UNIOP(x) Py_UCS4_##x
12844#define UNIOP_t Py_UCS4
12845#include "uniops.h"
12846#undef UNIOP
12847#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012848
Victor Stinner71133ff2010-09-01 23:43:53 +000012849Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012850PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012851{
12852 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12853 Py_UNICODE *copy;
12854 Py_ssize_t size;
12855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 if (!PyUnicode_Check(unicode)) {
12857 PyErr_BadArgument();
12858 return NULL;
12859 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012860 /* Ensure we won't overflow the size. */
12861 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12862 PyErr_NoMemory();
12863 return NULL;
12864 }
12865 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12866 size *= sizeof(Py_UNICODE);
12867 copy = PyMem_Malloc(size);
12868 if (copy == NULL) {
12869 PyErr_NoMemory();
12870 return NULL;
12871 }
12872 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12873 return copy;
12874}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012875
Georg Brandl66c221e2010-10-14 07:04:07 +000012876/* A _string module, to export formatter_parser and formatter_field_name_split
12877 to the string.Formatter class implemented in Python. */
12878
/* Method table of the _string module: two single-argument (METH_O)
   functions, terminated by a NULL sentinel entry. */
12879static PyMethodDef _string_methods[] = {
12880 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12881 METH_O, PyDoc_STR("split the argument as a field name")},
12882 {"formatter_parser", (PyCFunction) formatter_parser,
12883 METH_O, PyDoc_STR("parse the argument as a format string")},
12884 {NULL, NULL}
12885};
12886
/* Module definition passed to PyModule_Create() by PyInit__string(). */
12887static struct PyModuleDef _string_module = {
12888 PyModuleDef_HEAD_INIT,
12889 "_string", /* m_name */
12890 PyDoc_STR("string helper module"), /* m_doc */
12891 0, /* m_size */
12892 _string_methods, /* m_methods */
12893 NULL, /* remaining slots are unused */
12894 NULL,
12895 NULL,
12896 NULL
12897};
12898
12899PyMODINIT_FUNC
12900PyInit__string(void)
12901{
12902 return PyModule_Create(&_string_module);
12903}
12904
12905
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012906#ifdef __cplusplus
12907}
12908#endif