blob: 4b6f651673ee08c4ecc7c970d19bb582aa401914 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once: in particular "end" is
   captured in a local before the loop starts instead of being
   re-evaluated on each iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        const from_type *end_ = (end);                  \
        to_type *to_ = (to_type *)(to);                 \
        for (; iter_ < end_; ++iter_, ++to_) {          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)
107
/* UTF-8 buffer of a string: for compact ASCII objects this is the memory
   that directly follows the PyASCIIObject header, for every other object
   it is the utf8 member of PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((char*)((PyASCIIObject*)(op) + 1)) :              \
     ((PyCompactUnicodeObject*)(op))->utf8)

/* Length of the UTF-8 buffer: the plain length member for compact ASCII
   objects, the utf8_length member otherwise. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->utf8_length)

/* Direct member accessors.  They expand to lvalues and perform no type
   checking on op. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked read accessors: assert that op is a unicode object first. */
#define _PyUnicode_KIND(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     ((PyASCIIObject *)(op))->length)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200131
Walter Dörwald16807132007-05-25 13:52:07 +0000132/* This dictionary holds all interned unicode strings. Note that references
133 to strings in this dictionary are *not* counted in the string's ob_refcnt.
134 When the interned string reaches a refcnt of 0 the string deallocation
135 function will delete the reference from this dictionary.
136
137 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000138 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000139*/
140static PyObject *interned;
141
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000142/* The empty Unicode object is shared to improve performance. */
143static PyUnicodeObject *unicode_empty;
144
145/* Single character Unicode strings in the Latin-1 range are being
146 shared as well. */
147static PyUnicodeObject *unicode_latin1[256];
148
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code point, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
179
Alexander Belopolsky40018472011-02-26 01:02:56 +0000180static PyObject *
181unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000182 PyObject **errorHandler,const char *encoding, const char *reason,
183 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
184 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
185
Alexander Belopolsky40018472011-02-26 01:02:56 +0000186static void
187raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300188 const char *encoding,
189 const Py_UNICODE *unicode, Py_ssize_t size,
190 Py_ssize_t startpos, Py_ssize_t endpos,
191 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000192
/* Same for linebreaks: a 128-entry table indexed by ASCII code point,
   1 for line break characters and 0 otherwise.  The table is read-only,
   hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
220
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300221/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
222 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000223Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000224PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000225{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000226#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000227 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000228#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000229 /* This is actually an illegal character, so it should
230 not be passed to unichr. */
231 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000232#endif
233}
234
Thomas Wouters477c8d52006-05-27 19:21:47 +0000235/* --- Bloom Filters ----------------------------------------------------- */
236
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
240
241/* the linebreak mask is set up by Unicode_Init below */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243#if LONG_BIT >= 128
244#define BLOOM_WIDTH 128
245#elif LONG_BIT >= 64
246#define BLOOM_WIDTH 64
247#elif LONG_BIT >= 32
248#define BLOOM_WIDTH 32
249#else
250#error "LONG_BIT is smaller than 32"
251#endif
252
Thomas Wouters477c8d52006-05-27 19:21:47 +0000253#define BLOOM_MASK unsigned long
254
255static BLOOM_MASK bloom_linebreak;
256
Antoine Pitrouf068f942010-01-13 14:19:12 +0000257#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
258#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000259
Benjamin Peterson29060642009-01-31 22:14:21 +0000260#define BLOOM_LINEBREAK(ch) \
261 ((ch) < 128U ? ascii_linebreak[(ch)] : \
262 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200265make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000266{
267 /* calculate simple bloom-style bitmask for a given unicode string */
268
Antoine Pitrouf068f942010-01-13 14:19:12 +0000269 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000270 Py_ssize_t i;
271
272 mask = 0;
273 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200274 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
276 return mask;
277}
278
/* Non-zero if chr occurs in str: the bloom mask is tested first, and only
   on a hit is the string actually searched.  chr and str are evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283/* --- Unicode Object ----------------------------------------------------- */
284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200285static PyObject *
286substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len);
287
288static PyObject *
289fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
290
291Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
292 Py_ssize_t size, Py_UCS4 ch,
293 int direction)
294{
295 /* like wcschr, but doesn't stop at NULL characters */
296 Py_ssize_t i;
297 if (direction == 1) {
298 for(i = 0; i < size; i++)
299 if (PyUnicode_READ(kind, s, i) == ch)
300 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
301 }
302 else {
303 for(i = size-1; i >= 0; i--)
304 if (PyUnicode_READ(kind, s, i) == ch)
305 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
306 }
307 return NULL;
308}
309
Alexander Belopolsky40018472011-02-26 01:02:56 +0000310static int
311unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200312 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313{
314 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200316 /* Resizing is only supported for old unicode objects. */
317 assert(!PyUnicode_IS_COMPACT(unicode));
318 assert(_PyUnicode_WSTR(unicode) != NULL);
319
320 /* ... and only if they have not been readied yet, because
321 callees usually rely on the wstr representation when resizing. */
322 assert(unicode->data.any == NULL);
323
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000324 /* Shortcut if there's nothing much to do. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200325 if (_PyUnicode_WSTR_LENGTH(unicode) == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000326 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 /* Resizing shared object (unicode_empty or single character
329 objects) in-place is not allowed. Use PyUnicode_Resize()
330 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000331
Benjamin Peterson14339b62009-01-31 16:36:08 +0000332 if (unicode == unicode_empty ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200333 (_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
334 _PyUnicode_WSTR(unicode)[0] < 256U &&
335 unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000337 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return -1;
339 }
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341 /* We allocate one more byte to make sure the string is Ux0000 terminated.
342 The overallocation is also used by fastsearch, which assumes that it's
343 safe to look at str[length] (without making any assumptions about what
344 it contains). */
345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200346 oldstr = _PyUnicode_WSTR(unicode);
347 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
348 sizeof(Py_UNICODE) * (length + 1));
349 if (!_PyUnicode_WSTR(unicode)) {
350 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 PyErr_NoMemory();
352 return -1;
353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 _PyUnicode_WSTR(unicode)[length] = 0;
355 _PyUnicode_WSTR_LENGTH(unicode) = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 reset:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200358 if (unicode->data.any != NULL) {
359 PyObject_FREE(unicode->data.any);
360 if (unicode->_base.utf8 && unicode->_base.utf8 != unicode->data.any) {
361 PyObject_FREE(unicode->_base.utf8);
362 }
363 unicode->_base.utf8 = NULL;
364 unicode->_base.utf8_length = 0;
365 unicode->data.any = NULL;
366 _PyUnicode_LENGTH(unicode) = 0;
367 _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
368 _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000369 }
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200370 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000371
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 return 0;
373}
374
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384#ifdef Py_DEBUG
385int unicode_old_new_calls = 0;
386#endif
387
Alexander Belopolsky40018472011-02-26 01:02:56 +0000388static PyUnicodeObject *
389_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
391 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
Thomas Wouters477c8d52006-05-27 19:21:47 +0000394 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 if (length == 0 && unicode_empty != NULL) {
396 Py_INCREF(unicode_empty);
397 return unicode_empty;
398 }
399
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000400 /* Ensure we won't overflow the size. */
401 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
402 return (PyUnicodeObject *)PyErr_NoMemory();
403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200404 if (length < 0) {
405 PyErr_SetString(PyExc_SystemError,
406 "Negative size passed to _PyUnicode_New");
407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000408 }
409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410#ifdef Py_DEBUG
411 ++unicode_old_new_calls;
412#endif
413
414 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
415 if (unicode == NULL)
416 return NULL;
417 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
418 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
419 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 PyErr_NoMemory();
421 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200423
Jeremy Hyltond8082792003-09-16 19:41:39 +0000424 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000425 * the caller fails before initializing str -- unicode_resize()
426 * reads str[0], and the Keep-Alive optimization can keep memory
427 * allocated for str alive across a call to unicode_dealloc(unicode).
428 * We don't want unicode_resize to read uninitialized memory in
429 * that case.
430 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200431 _PyUnicode_WSTR(unicode)[0] = 0;
432 _PyUnicode_WSTR(unicode)[length] = 0;
433 _PyUnicode_WSTR_LENGTH(unicode) = length;
434 _PyUnicode_HASH(unicode) = -1;
435 _PyUnicode_STATE(unicode).interned = 0;
436 _PyUnicode_STATE(unicode).kind = 0;
437 _PyUnicode_STATE(unicode).compact = 0;
438 _PyUnicode_STATE(unicode).ready = 0;
439 _PyUnicode_STATE(unicode).ascii = 0;
440 unicode->data.any = NULL;
441 _PyUnicode_LENGTH(unicode) = 0;
442 unicode->_base.utf8 = NULL;
443 unicode->_base.utf8_length = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000445
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000447 /* XXX UNREF/NEWREF interface should be more symmetrical */
448 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000449 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000450 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452}
453
#ifdef Py_DEBUG
/* Number of objects allocated through PyUnicode_New() (debug builds). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the _PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = _PyUnicode_UTF8(unicode);
    return utf8;
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *compact = _PyUnicode_COMPACT_DATA(unicode);
    return compact;
}

/* Print the layout details of a unicode object to stdout, then return
   the result of PyUnicode_DATA(). */
void *
_PyUnicode_data(void *unicode)
{
    void *result;

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    result = PyUnicode_DATA(unicode);
    return result;
}
#endif
475
476PyObject *
477PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
478{
479 PyObject *obj;
480 PyCompactUnicodeObject *unicode;
481 void *data;
482 int kind_state;
483 int is_sharing = 0, is_ascii = 0;
484 Py_ssize_t char_size;
485 Py_ssize_t struct_size;
486
487 /* Optimization for empty strings */
488 if (size == 0 && unicode_empty != NULL) {
489 Py_INCREF(unicode_empty);
490 return (PyObject *)unicode_empty;
491 }
492
493#ifdef Py_DEBUG
494 ++unicode_new_new_calls;
495#endif
496
497 struct_size = sizeof(PyCompactUnicodeObject);
498 if (maxchar < 128) {
499 kind_state = PyUnicode_1BYTE_KIND;
500 char_size = 1;
501 is_ascii = 1;
502 struct_size = sizeof(PyASCIIObject);
503 }
504 else if (maxchar < 256) {
505 kind_state = PyUnicode_1BYTE_KIND;
506 char_size = 1;
507 }
508 else if (maxchar < 65536) {
509 kind_state = PyUnicode_2BYTE_KIND;
510 char_size = 2;
511 if (sizeof(wchar_t) == 2)
512 is_sharing = 1;
513 }
514 else {
515 kind_state = PyUnicode_4BYTE_KIND;
516 char_size = 4;
517 if (sizeof(wchar_t) == 4)
518 is_sharing = 1;
519 }
520
521 /* Ensure we won't overflow the size. */
522 if (size < 0) {
523 PyErr_SetString(PyExc_SystemError,
524 "Negative size passed to PyUnicode_New");
525 return NULL;
526 }
527 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
528 return PyErr_NoMemory();
529
530 /* Duplicated allocation code from _PyObject_New() instead of a call to
531 * PyObject_New() so we are able to allocate space for the object and
532 * it's data buffer.
533 */
534 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
535 if (obj == NULL)
536 return PyErr_NoMemory();
537 obj = PyObject_INIT(obj, &PyUnicode_Type);
538 if (obj == NULL)
539 return NULL;
540
541 unicode = (PyCompactUnicodeObject *)obj;
542 if (is_ascii)
543 data = ((PyASCIIObject*)obj) + 1;
544 else
545 data = unicode + 1;
546 _PyUnicode_LENGTH(unicode) = size;
547 _PyUnicode_HASH(unicode) = -1;
548 _PyUnicode_STATE(unicode).interned = 0;
549 _PyUnicode_STATE(unicode).kind = kind_state;
550 _PyUnicode_STATE(unicode).compact = 1;
551 _PyUnicode_STATE(unicode).ready = 1;
552 _PyUnicode_STATE(unicode).ascii = is_ascii;
553 if (is_ascii) {
554 ((char*)data)[size] = 0;
555 _PyUnicode_WSTR(unicode) = NULL;
556 }
557 else if (kind_state == PyUnicode_1BYTE_KIND) {
558 ((char*)data)[size] = 0;
559 _PyUnicode_WSTR(unicode) = NULL;
560 _PyUnicode_WSTR_LENGTH(unicode) = 0;
561 unicode->utf8_length = 0;
562 unicode->utf8 = NULL;
563 }
564 else {
565 unicode->utf8 = NULL;
566 if (kind_state == PyUnicode_2BYTE_KIND)
567 ((Py_UCS2*)data)[size] = 0;
568 else /* kind_state == PyUnicode_4BYTE_KIND */
569 ((Py_UCS4*)data)[size] = 0;
570 if (is_sharing) {
571 _PyUnicode_WSTR_LENGTH(unicode) = size;
572 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
573 }
574 else {
575 _PyUnicode_WSTR_LENGTH(unicode) = 0;
576 _PyUnicode_WSTR(unicode) = NULL;
577 }
578 }
579 return obj;
580}
581
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate directly followed by a low surrogate is decoded into a
   single code point; every other code unit (lone surrogates included) is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   The characters in [begin, end) are written to the 4-byte data buffer of
   unicode.  This function assumes that unicode can hold one more code
   point than wstr characters for a terminating null character.

   Always returns 0. */
static int
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode && PyUnicode_Check(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Valid surrogate pair: combine the two 10-bit halves. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

    return 0;
}
#endif
620
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200621Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
623 PyObject *from, Py_ssize_t from_start,
624 Py_ssize_t how_many)
625{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200626 unsigned int from_kind, to_kind;
627 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628
Victor Stinnerb1536152011-09-30 02:26:10 +0200629 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
630 PyErr_BadInternalCall();
631 return -1;
632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200633
634 if (PyUnicode_READY(from))
635 return -1;
636 if (PyUnicode_READY(to))
637 return -1;
638
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200639 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200640 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
641 PyErr_Format(PyExc_ValueError,
642 "Cannot write %zi characters at %zi "
643 "in a string of %zi characters",
644 how_many, to_start, PyUnicode_GET_LENGTH(to));
645 return -1;
646 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200647 if (how_many == 0)
648 return 0;
649
650 if (Py_REFCNT(to) != 1) {
651 PyErr_SetString(PyExc_ValueError,
652 "Cannot modify a string having more than 1 reference");
653 return -1;
654 }
Victor Stinnerc17f5402011-09-29 00:16:58 +0200655 _PyUnicode_DIRTY(to);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200658 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200660 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661
662 if (from_kind == to_kind) {
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200663 /* fast path */
Victor Stinnera0702ab2011-09-29 14:14:38 +0200664 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200665 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200666 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200667 + PyUnicode_KIND_SIZE(from_kind, from_start),
668 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200669 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200670 else if (from_kind == PyUnicode_1BYTE_KIND
671 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200672 {
673 _PyUnicode_CONVERT_BYTES(
674 Py_UCS1, Py_UCS2,
675 PyUnicode_1BYTE_DATA(from) + from_start,
676 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
677 PyUnicode_2BYTE_DATA(to) + to_start
678 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200679 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200680 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200681 && to_kind == PyUnicode_4BYTE_KIND)
682 {
683 _PyUnicode_CONVERT_BYTES(
684 Py_UCS1, Py_UCS4,
685 PyUnicode_1BYTE_DATA(from) + from_start,
686 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
687 PyUnicode_4BYTE_DATA(to) + to_start
688 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200689 }
690 else if (from_kind == PyUnicode_2BYTE_KIND
691 && to_kind == PyUnicode_4BYTE_KIND)
692 {
693 _PyUnicode_CONVERT_BYTES(
694 Py_UCS2, Py_UCS4,
695 PyUnicode_2BYTE_DATA(from) + from_start,
696 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
697 PyUnicode_4BYTE_DATA(to) + to_start
698 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200699 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200700 else {
701 int invalid_kinds;
702 if (from_kind > to_kind) {
703 /* slow path to check for character overflow */
704 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
705 Py_UCS4 ch, maxchar;
706 Py_ssize_t i;
707
708 maxchar = 0;
709 invalid_kinds = 0;
710 for (i=0; i < how_many; i++) {
711 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
712 if (ch > maxchar) {
713 maxchar = ch;
714 if (maxchar > to_maxchar) {
715 invalid_kinds = 1;
716 break;
717 }
718 }
719 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
720 }
721 }
722 else
723 invalid_kinds = 1;
724 if (invalid_kinds) {
725 PyErr_Format(PyExc_ValueError,
726 "Cannot copy UCS%u characters "
727 "into a string of UCS%u characters",
728 1 << (from_kind - 1),
729 1 << (to_kind -1));
730 return -1;
731 }
732 }
733 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734}
735
Victor Stinner17222162011-09-28 22:15:37 +0200736/* Find the maximum code point and count the number of surrogate pairs so a
737 correct string length can be computed before converting a string to UCS4.
738 This function counts single surrogates as a character and not as a pair.
739
740 Return 0 on success, or -1 on error. */
741static int
742find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
743 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744{
745 const wchar_t *iter;
746
747 if (num_surrogates == NULL || maxchar == NULL) {
748 PyErr_SetString(PyExc_SystemError,
749 "unexpected NULL arguments to "
750 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
751 return -1;
752 }
753
754 *num_surrogates = 0;
755 *maxchar = 0;
756
757 for (iter = begin; iter < end; ) {
758 if (*iter > *maxchar)
759 *maxchar = *iter;
760#if SIZEOF_WCHAR_T == 2
761 if (*iter >= 0xD800 && *iter <= 0xDBFF
762 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
763 {
764 Py_UCS4 surrogate_val;
765 surrogate_val = (((iter[0] & 0x3FF)<<10)
766 | (iter[1] & 0x3FF)) + 0x10000;
767 ++(*num_surrogates);
768 if (surrogate_val > *maxchar)
769 *maxchar = surrogate_val;
770 iter += 2;
771 }
772 else
773 iter++;
774#else
775 iter++;
776#endif
777 }
778 return 0;
779}
780
#ifdef Py_DEBUG
/* Debug-build counter: incremented each time _PyUnicode_Ready() runs. */
int unicode_ready_calls = 0;
#endif
784
785int
Victor Stinnerd8f65102011-09-29 19:43:17 +0200786_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787{
Victor Stinnerd8f65102011-09-29 19:43:17 +0200788 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789 wchar_t *end;
790 Py_UCS4 maxchar = 0;
791 Py_ssize_t num_surrogates;
792#if SIZEOF_WCHAR_T == 2
793 Py_ssize_t length_wo_surrogates;
794#endif
795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200796 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +0200797 strings were created using _PyObject_New() and where no canonical
798 representation (the str field) has been set yet aka strings
799 which are not yet ready. */
800 assert(PyUnicode_Check(obj));
801 assert(!PyUnicode_IS_READY(obj));
802 assert(!PyUnicode_IS_COMPACT(obj));
803 assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200804 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200805 assert(unicode->data.any == NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806 assert(unicode->_base.utf8 == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +0200807 /* Actually, it should neither be interned nor be anything else: */
808 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
810#ifdef Py_DEBUG
811 ++unicode_ready_calls;
812#endif
813
814 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +0200815 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +0200816 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818
819 if (maxchar < 256) {
820 unicode->data.any = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
821 if (!unicode->data.any) {
822 PyErr_NoMemory();
823 return -1;
824 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200825 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826 _PyUnicode_WSTR(unicode), end,
827 PyUnicode_1BYTE_DATA(unicode));
828 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
829 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
830 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
831 if (maxchar < 128) {
832 unicode->_base.utf8 = unicode->data.any;
833 unicode->_base.utf8_length = _PyUnicode_WSTR_LENGTH(unicode);
834 }
835 else {
836 unicode->_base.utf8 = NULL;
837 unicode->_base.utf8_length = 0;
838 }
839 PyObject_FREE(_PyUnicode_WSTR(unicode));
840 _PyUnicode_WSTR(unicode) = NULL;
841 _PyUnicode_WSTR_LENGTH(unicode) = 0;
842 }
843 /* In this case we might have to convert down from 4-byte native
844 wchar_t to 2-byte unicode. */
845 else if (maxchar < 65536) {
846 assert(num_surrogates == 0 &&
847 "FindMaxCharAndNumSurrogatePairs() messed up");
848
Victor Stinner506f5922011-09-28 22:34:18 +0200849#if SIZEOF_WCHAR_T == 2
850 /* We can share representations and are done. */
851 unicode->data.any = _PyUnicode_WSTR(unicode);
852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
855 unicode->_base.utf8 = NULL;
856 unicode->_base.utf8_length = 0;
857#else
858 /* sizeof(wchar_t) == 4 */
859 unicode->data.any = PyObject_MALLOC(
860 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
861 if (!unicode->data.any) {
862 PyErr_NoMemory();
863 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 }
Victor Stinner506f5922011-09-28 22:34:18 +0200865 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
866 _PyUnicode_WSTR(unicode), end,
867 PyUnicode_2BYTE_DATA(unicode));
868 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
869 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
870 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
871 unicode->_base.utf8 = NULL;
872 unicode->_base.utf8_length = 0;
873 PyObject_FREE(_PyUnicode_WSTR(unicode));
874 _PyUnicode_WSTR(unicode) = NULL;
875 _PyUnicode_WSTR_LENGTH(unicode) = 0;
876#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200877 }
878 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
879 else {
880#if SIZEOF_WCHAR_T == 2
881 /* in case the native representation is 2-bytes, we need to allocate a
882 new normalized 4-byte version. */
883 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
884 unicode->data.any = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
885 if (!unicode->data.any) {
886 PyErr_NoMemory();
887 return -1;
888 }
889 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
890 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
891 unicode->_base.utf8 = NULL;
892 unicode->_base.utf8_length = 0;
893 if (unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end,
894 unicode) < 0) {
895 assert(0 && "ConvertWideCharToUCS4 failed");
896 return -1;
897 }
898 PyObject_FREE(_PyUnicode_WSTR(unicode));
899 _PyUnicode_WSTR(unicode) = NULL;
900 _PyUnicode_WSTR_LENGTH(unicode) = 0;
901#else
902 assert(num_surrogates == 0);
903
904 unicode->data.any = _PyUnicode_WSTR(unicode);
905 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
906 unicode->_base.utf8 = NULL;
907 unicode->_base.utf8_length = 0;
908 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
909#endif
910 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
911 }
912 _PyUnicode_STATE(unicode).ready = 1;
913 return 0;
914}
915
Alexander Belopolsky40018472011-02-26 01:02:56 +0000916static void
917unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000918{
Walter Dörwald16807132007-05-25 13:52:07 +0000919 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 case SSTATE_NOT_INTERNED:
921 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000922
Benjamin Peterson29060642009-01-31 22:14:21 +0000923 case SSTATE_INTERNED_MORTAL:
924 /* revive dead object temporarily for DelItem */
925 Py_REFCNT(unicode) = 3;
926 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
927 Py_FatalError(
928 "deletion of interned string failed");
929 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000930
Benjamin Peterson29060642009-01-31 22:14:21 +0000931 case SSTATE_INTERNED_IMMORTAL:
932 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000933
Benjamin Peterson29060642009-01-31 22:14:21 +0000934 default:
935 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000936 }
937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200938 if (_PyUnicode_WSTR(unicode) &&
939 (!PyUnicode_IS_READY(unicode) ||
940 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
941 PyObject_DEL(_PyUnicode_WSTR(unicode));
942 if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != PyUnicode_DATA(unicode))
943 PyObject_DEL(unicode->_base.utf8);
944
945 if (PyUnicode_IS_COMPACT(unicode)) {
946 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 }
948 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200949 if (unicode->data.any)
950 PyObject_DEL(unicode->data.any);
Benjamin Peterson29060642009-01-31 22:14:21 +0000951 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952 }
953}
954
Alexander Belopolsky40018472011-02-26 01:02:56 +0000955static int
956_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000957{
958 register PyUnicodeObject *v;
959
960 /* Argument checks */
961 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000962 PyErr_BadInternalCall();
963 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000964 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000965 v = *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 ||
967 PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000968 PyErr_BadInternalCall();
969 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000970 }
971
972 /* Resizing unicode_empty and single character objects is not
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 possible since these are being shared.
974 The same goes for new-representation unicode objects or objects which
975 have already been readied.
976 For these, we simply return a fresh copy with the same Unicode content.
977 */
978 if ((_PyUnicode_WSTR_LENGTH(v) != length &&
979 (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) ||
980 PyUnicode_IS_COMPACT(v) || v->data.any) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000981 PyUnicodeObject *w = _PyUnicode_New(length);
982 if (w == NULL)
983 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v),
985 length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v));
Benjamin Peterson29060642009-01-31 22:14:21 +0000986 Py_DECREF(*unicode);
987 *unicode = w;
988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000989 }
990
991 /* Note that we don't have to modify *unicode for unshared Unicode
992 objects, since we can modify them in-place. */
993 return unicode_resize(v, length);
994}
995
Alexander Belopolsky40018472011-02-26 01:02:56 +0000996int
997PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000998{
999 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
1000}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002static PyObject*
1003get_latin1_char(unsigned char ch)
1004{
1005 PyUnicodeObject *unicode = unicode_latin1[ch];
1006 if (!unicode) {
1007 unicode = (PyUnicodeObject *)PyUnicode_New(1, ch);
1008 if (!unicode)
1009 return NULL;
1010 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1011 unicode_latin1[ch] = unicode;
1012 }
1013 Py_INCREF(unicode);
1014 return (PyObject *)unicode;
1015}
1016
Alexander Belopolsky40018472011-02-26 01:02:56 +00001017PyObject *
1018PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019{
1020 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 Py_UCS4 maxchar = 0;
1022 Py_ssize_t num_surrogates;
1023
1024 if (u == NULL)
1025 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001027 /* If the Unicode data is known at construction time, we can apply
1028 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 /* Optimization for empty strings */
1031 if (size == 0 && unicode_empty != NULL) {
1032 Py_INCREF(unicode_empty);
1033 return (PyObject *)unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001034 }
Tim Petersced69f82003-09-16 20:30:58 +00001035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 /* Single character Unicode objects in the Latin-1 range are
1037 shared when using this constructor */
1038 if (size == 1 && *u < 256)
1039 return get_latin1_char((unsigned char)*u);
1040
1041 /* If not empty and not single character, copy the Unicode data
1042 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001043 if (find_maxchar_surrogates(u, u + size,
1044 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 return NULL;
1046
1047 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1048 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 if (!unicode)
1050 return NULL;
1051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 switch (PyUnicode_KIND(unicode)) {
1053 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001054 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1056 break;
1057 case PyUnicode_2BYTE_KIND:
1058#if Py_UNICODE_SIZE == 2
1059 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1060#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001061 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1063#endif
1064 break;
1065 case PyUnicode_4BYTE_KIND:
1066#if SIZEOF_WCHAR_T == 2
1067 /* This is the only case which has to process surrogates, thus
1068 a simple copy loop is not enough and we need a function. */
1069 if (unicode_convert_wchar_to_ucs4(u, u + size, unicode) < 0) {
1070 Py_DECREF(unicode);
1071 return NULL;
1072 }
1073#else
1074 assert(num_surrogates == 0);
1075 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1076#endif
1077 break;
1078 default:
1079 assert(0 && "Impossible state");
1080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081
1082 return (PyObject *)unicode;
1083}
1084
Alexander Belopolsky40018472011-02-26 01:02:56 +00001085PyObject *
1086PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001087{
1088 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001089
Benjamin Peterson14339b62009-01-31 16:36:08 +00001090 if (size < 0) {
1091 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001093 return NULL;
1094 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001095
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001096 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001097 some optimizations which share commonly used objects.
1098 Also, this means the input must be UTF-8, so fall back to the
1099 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001100 if (u != NULL) {
1101
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 /* Optimization for empty strings */
1103 if (size == 0 && unicode_empty != NULL) {
1104 Py_INCREF(unicode_empty);
1105 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001106 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001107
1108 /* Single characters are shared when using this constructor.
1109 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 if (size == 1 && Py_CHARMASK(*u) < 128)
1111 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001112
1113 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001114 }
1115
Walter Dörwald55507312007-05-18 13:12:10 +00001116 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001117 if (!unicode)
1118 return NULL;
1119
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001120 return (PyObject *)unicode;
1121}
1122
Alexander Belopolsky40018472011-02-26 01:02:56 +00001123PyObject *
1124PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001125{
1126 size_t size = strlen(u);
1127 if (size > PY_SSIZE_T_MAX) {
1128 PyErr_SetString(PyExc_OverflowError, "input too long");
1129 return NULL;
1130 }
1131
1132 return PyUnicode_FromStringAndSize(u, size);
1133}
1134
Victor Stinnere57b1c02011-09-28 22:20:48 +02001135static PyObject*
1136_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 PyObject *res;
1139 unsigned char max = 127;
1140 Py_ssize_t i;
1141 for (i = 0; i < size; i++) {
1142 if (u[i] & 0x80) {
1143 max = 255;
1144 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001145 }
1146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 res = PyUnicode_New(size, max);
1148 if (!res)
1149 return NULL;
1150 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1151 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001152}
1153
Victor Stinnere57b1c02011-09-28 22:20:48 +02001154static PyObject*
1155_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
1157 PyObject *res;
1158 Py_UCS2 max = 0;
1159 Py_ssize_t i;
1160 for (i = 0; i < size; i++)
1161 if (u[i] > max)
1162 max = u[i];
1163 res = PyUnicode_New(size, max);
1164 if (!res)
1165 return NULL;
1166 if (max >= 256)
1167 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1168 else
1169 for (i = 0; i < size; i++)
1170 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1171 return res;
1172}
1173
Victor Stinnere57b1c02011-09-28 22:20:48 +02001174static PyObject*
1175_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176{
1177 PyObject *res;
1178 Py_UCS4 max = 0;
1179 Py_ssize_t i;
1180 for (i = 0; i < size; i++)
1181 if (u[i] > max)
1182 max = u[i];
1183 res = PyUnicode_New(size, max);
1184 if (!res)
1185 return NULL;
1186 if (max >= 0x10000)
1187 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1188 else {
1189 int kind = PyUnicode_KIND(res);
1190 void *data = PyUnicode_DATA(res);
1191 for (i = 0; i < size; i++)
1192 PyUnicode_WRITE(kind, data, i, u[i]);
1193 }
1194 return res;
1195}
1196
1197PyObject*
1198PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1199{
1200 switch(kind) {
1201 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001202 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001204 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001206 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 }
1208 assert(0);
1209 return NULL;
1210}
1211
Victor Stinner034f6cf2011-09-30 02:26:44 +02001212PyObject*
1213PyUnicode_Copy(PyObject *unicode)
1214{
1215 if (!PyUnicode_Check(unicode)) {
1216 PyErr_BadInternalCall();
1217 return NULL;
1218 }
1219 if (PyUnicode_READY(unicode))
1220 return NULL;
1221 return PyUnicode_FromKindAndData(PyUnicode_KIND(unicode),
1222 PyUnicode_DATA(unicode),
1223 PyUnicode_GET_LENGTH(unicode));
1224}
1225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226
1227/* Widen Unicode objects to larger buffers.
1228 Return NULL if the string is too wide already. */
1229
1230void*
1231_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1232{
1233 Py_ssize_t i;
1234 Py_ssize_t len = PyUnicode_GET_LENGTH(s);
1235 void *d = PyUnicode_DATA(s);
1236 unsigned int skind = PyUnicode_KIND(s);
1237 if (PyUnicode_KIND(s) >= kind) {
1238 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1239 return NULL;
1240 }
1241 switch(kind) {
1242 case PyUnicode_2BYTE_KIND: {
1243 Py_UCS2 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS2));
1244 if (!result) {
1245 PyErr_NoMemory();
1246 return 0;
1247 }
1248 for (i = 0; i < len; i++)
1249 result[i] = ((Py_UCS1*)d)[i];
1250 return result;
1251 }
1252 case PyUnicode_4BYTE_KIND: {
1253 Py_UCS4 *result = PyMem_Malloc(PyUnicode_GET_LENGTH(s) * sizeof(Py_UCS4));
1254 if (!result) {
1255 PyErr_NoMemory();
1256 return 0;
1257 }
1258 for (i = 0; i < len; i++)
1259 result[i] = PyUnicode_READ(skind, d, i);
1260 return result;
1261 }
1262 }
1263 Py_FatalError("invalid kind");
1264 return NULL;
1265}
1266
1267static Py_UCS4*
1268as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1269 int copy_null)
1270{
1271 int kind;
1272 void *data;
1273 Py_ssize_t len, targetlen;
1274 if (PyUnicode_READY(string) == -1)
1275 return NULL;
1276 kind = PyUnicode_KIND(string);
1277 data = PyUnicode_DATA(string);
1278 len = PyUnicode_GET_LENGTH(string);
1279 targetlen = len;
1280 if (copy_null)
1281 targetlen++;
1282 if (!target) {
1283 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1284 PyErr_NoMemory();
1285 return NULL;
1286 }
1287 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1288 if (!target) {
1289 PyErr_NoMemory();
1290 return NULL;
1291 }
1292 }
1293 else {
1294 if (targetsize < targetlen) {
1295 PyErr_Format(PyExc_SystemError,
1296 "string is longer than the buffer");
1297 if (copy_null && 0 < targetsize)
1298 target[0] = 0;
1299 return NULL;
1300 }
1301 }
1302 if (kind != PyUnicode_4BYTE_KIND) {
1303 Py_ssize_t i;
1304 for (i = 0; i < len; i++)
1305 target[i] = PyUnicode_READ(kind, data, i);
1306 }
1307 else
1308 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1309 if (copy_null)
1310 target[len] = 0;
1311 return target;
1312}
1313
1314Py_UCS4*
1315PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1316 int copy_null)
1317{
1318 if (target == NULL || targetsize < 1) {
1319 PyErr_BadInternalCall();
1320 return NULL;
1321 }
1322 return as_ucs4(string, target, targetsize, copy_null);
1323}
1324
1325Py_UCS4*
1326PyUnicode_AsUCS4Copy(PyObject *string)
1327{
1328 return as_ucs4(string, NULL, 0, 1);
1329}
1330
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` items.
   A size of -1 means the length is measured with wcslen().  A NULL buffer
   is accepted only together with size 0, which yields an empty string;
   any other NULL call is reported via PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001351
Walter Dörwald346737f2007-05-31 10:44:43 +00001352static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001353makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1354 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001355{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001356 *fmt++ = '%';
1357 if (width) {
1358 if (zeropad)
1359 *fmt++ = '0';
1360 fmt += sprintf(fmt, "%d", width);
1361 }
1362 if (precision)
1363 fmt += sprintf(fmt, ".%d", precision);
1364 if (longflag)
1365 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001366 else if (longlongflag) {
1367 /* longlongflag should only ever be nonzero on machines with
1368 HAVE_LONG_LONG defined */
1369#ifdef HAVE_LONG_LONG
1370 char *f = PY_FORMAT_LONG_LONG;
1371 while (*f)
1372 *fmt++ = *f++;
1373#else
1374 /* we shouldn't ever get here */
1375 assert(0);
1376 *fmt++ = 'l';
1377#endif
1378 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001379 else if (size_tflag) {
1380 char *f = PY_FORMAT_SIZE_T;
1381 while (*f)
1382 *fmt++ = *f++;
1383 }
1384 *fmt++ = c;
1385 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001386}
1387
/* Helper for PyUnicode_FromFormatV().

   `f` points at the '%' that starts a format specification.  Parse the
   optional width, the optional ".precision" and the optional length
   modifier (l, ll or z, accepted only in front of d, u or i), storing
   each result through its out pointer when that pointer is not NULL.

   Return a pointer to the first character that was not consumed. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (f[0] == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (f[0] == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1452
/* Maximum number of characters required for the output of %ld.
   21 characters allows for 64-bit integers (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1460
Walter Dörwaldd2034312007-05-18 16:29:38 +00001461PyObject *
1462PyUnicode_FromFormatV(const char *format, va_list vargs)
1463{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001464 va_list count;
1465 Py_ssize_t callcount = 0;
1466 PyObject **callresults = NULL;
1467 PyObject **callresult = NULL;
1468 Py_ssize_t n = 0;
1469 int width = 0;
1470 int precision = 0;
1471 int zeropad;
1472 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001474 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001475 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1477 Py_UCS4 argmaxchar;
1478 Py_ssize_t numbersize = 0;
1479 char *numberresults = NULL;
1480 char *numberresult = NULL;
1481 Py_ssize_t i;
1482 int kind;
1483 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001484
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001485 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001486 /* step 1: count the number of %S/%R/%A/%s format specifications
1487 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1488 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 * result in an array)
1490 * also esimate a upper bound for all the number formats in the string,
1491 * numbers will be formated in step 3 and be keept in a '\0'-separated
1492 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001493 for (f = format; *f; f++) {
1494 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001495 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1497 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1498 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1499 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001502#ifdef HAVE_LONG_LONG
1503 if (longlongflag) {
1504 if (width < MAX_LONG_LONG_CHARS)
1505 width = MAX_LONG_LONG_CHARS;
1506 }
1507 else
1508#endif
1509 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1510 including sign. Decimal takes the most space. This
1511 isn't enough for octal. If a width is specified we
1512 need more (which we allocate later). */
1513 if (width < MAX_LONG_CHARS)
1514 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515
1516 /* account for the size + '\0' to separate numbers
1517 inside of the numberresults buffer */
1518 numbersize += (width + 1);
1519 }
1520 }
1521 else if ((unsigned char)*f > 127) {
1522 PyErr_Format(PyExc_ValueError,
1523 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1524 "string, got a non-ASCII byte: 0x%02x",
1525 (unsigned char)*f);
1526 return NULL;
1527 }
1528 }
1529 /* step 2: allocate memory for the results of
1530 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1531 if (callcount) {
1532 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1533 if (!callresults) {
1534 PyErr_NoMemory();
1535 return NULL;
1536 }
1537 callresult = callresults;
1538 }
1539 /* step 2.5: allocate memory for the results of formating numbers */
1540 if (numbersize) {
1541 numberresults = PyObject_Malloc(numbersize);
1542 if (!numberresults) {
1543 PyErr_NoMemory();
1544 goto fail;
1545 }
1546 numberresult = numberresults;
1547 }
1548
1549 /* step 3: format numbers and figure out how large a buffer we need */
1550 for (f = format; *f; f++) {
1551 if (*f == '%') {
1552 const char* p;
1553 int longflag;
1554 int longlongflag;
1555 int size_tflag;
1556 int numprinted;
1557
1558 p = f;
1559 zeropad = (f[1] == '0');
1560 f = parse_format_flags(f, &width, &precision,
1561 &longflag, &longlongflag, &size_tflag);
1562 switch (*f) {
1563 case 'c':
1564 {
1565 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001566 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 n++;
1568 break;
1569 }
1570 case '%':
1571 n++;
1572 break;
1573 case 'i':
1574 case 'd':
1575 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1576 width, precision, *f);
1577 if (longflag)
1578 numprinted = sprintf(numberresult, fmt,
1579 va_arg(count, long));
1580#ifdef HAVE_LONG_LONG
1581 else if (longlongflag)
1582 numprinted = sprintf(numberresult, fmt,
1583 va_arg(count, PY_LONG_LONG));
1584#endif
1585 else if (size_tflag)
1586 numprinted = sprintf(numberresult, fmt,
1587 va_arg(count, Py_ssize_t));
1588 else
1589 numprinted = sprintf(numberresult, fmt,
1590 va_arg(count, int));
1591 n += numprinted;
1592 /* advance by +1 to skip over the '\0' */
1593 numberresult += (numprinted + 1);
1594 assert(*(numberresult - 1) == '\0');
1595 assert(*(numberresult - 2) != '\0');
1596 assert(numprinted >= 0);
1597 assert(numberresult <= numberresults + numbersize);
1598 break;
1599 case 'u':
1600 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1601 width, precision, 'u');
1602 if (longflag)
1603 numprinted = sprintf(numberresult, fmt,
1604 va_arg(count, unsigned long));
1605#ifdef HAVE_LONG_LONG
1606 else if (longlongflag)
1607 numprinted = sprintf(numberresult, fmt,
1608 va_arg(count, unsigned PY_LONG_LONG));
1609#endif
1610 else if (size_tflag)
1611 numprinted = sprintf(numberresult, fmt,
1612 va_arg(count, size_t));
1613 else
1614 numprinted = sprintf(numberresult, fmt,
1615 va_arg(count, unsigned int));
1616 n += numprinted;
1617 numberresult += (numprinted + 1);
1618 assert(*(numberresult - 1) == '\0');
1619 assert(*(numberresult - 2) != '\0');
1620 assert(numprinted >= 0);
1621 assert(numberresult <= numberresults + numbersize);
1622 break;
1623 case 'x':
1624 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1625 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1626 n += numprinted;
1627 numberresult += (numprinted + 1);
1628 assert(*(numberresult - 1) == '\0');
1629 assert(*(numberresult - 2) != '\0');
1630 assert(numprinted >= 0);
1631 assert(numberresult <= numberresults + numbersize);
1632 break;
1633 case 'p':
1634 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1635 /* %p is ill-defined: ensure leading 0x. */
1636 if (numberresult[1] == 'X')
1637 numberresult[1] = 'x';
1638 else if (numberresult[1] != 'x') {
1639 memmove(numberresult + 2, numberresult,
1640 strlen(numberresult) + 1);
1641 numberresult[0] = '0';
1642 numberresult[1] = 'x';
1643 numprinted += 2;
1644 }
1645 n += numprinted;
1646 numberresult += (numprinted + 1);
1647 assert(*(numberresult - 1) == '\0');
1648 assert(*(numberresult - 2) != '\0');
1649 assert(numprinted >= 0);
1650 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001651 break;
1652 case 's':
1653 {
1654 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001655 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001656 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1657 if (!str)
1658 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 /* since PyUnicode_DecodeUTF8 returns already flexible
1660 unicode objects, there is no need to call ready on them */
1661 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001662 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001664 /* Remember the str and switch to the next slot */
1665 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001666 break;
1667 }
1668 case 'U':
1669 {
1670 PyObject *obj = va_arg(count, PyObject *);
1671 assert(obj && PyUnicode_Check(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (PyUnicode_READY(obj) == -1)
1673 goto fail;
1674 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001675 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001677 break;
1678 }
1679 case 'V':
1680 {
1681 PyObject *obj = va_arg(count, PyObject *);
1682 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001683 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 assert(obj || str);
1685 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001686 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 if (PyUnicode_READY(obj) == -1)
1688 goto fail;
1689 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001690 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001692 *callresult++ = NULL;
1693 }
1694 else {
1695 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1696 if (!str_obj)
1697 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001699 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001701 *callresult++ = str_obj;
1702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001703 break;
1704 }
1705 case 'S':
1706 {
1707 PyObject *obj = va_arg(count, PyObject *);
1708 PyObject *str;
1709 assert(obj);
1710 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001712 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001714 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001716 /* Remember the str and switch to the next slot */
1717 *callresult++ = str;
1718 break;
1719 }
1720 case 'R':
1721 {
1722 PyObject *obj = va_arg(count, PyObject *);
1723 PyObject *repr;
1724 assert(obj);
1725 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001727 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001729 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001731 /* Remember the repr and switch to the next slot */
1732 *callresult++ = repr;
1733 break;
1734 }
1735 case 'A':
1736 {
1737 PyObject *obj = va_arg(count, PyObject *);
1738 PyObject *ascii;
1739 assert(obj);
1740 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00001742 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001744 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001746 /* Remember the repr and switch to the next slot */
1747 *callresult++ = ascii;
1748 break;
1749 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 default:
1751 /* if we stumble upon an unknown
1752 formatting code, copy the rest of
1753 the format string to the output
1754 string. (we cannot just skip the
1755 code, since there's no way to know
1756 what's in the argument list) */
1757 n += strlen(p);
1758 goto expand;
1759 }
1760 } else
1761 n++;
1762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001763 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00001766 we don't have to resize the string.
1767 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 if (!string)
1770 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 kind = PyUnicode_KIND(string);
1772 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001773 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001778 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00001779
1780 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
1782 /* allow i == length here: the last argument could be an empty
1783 string, which leaves i pointing at the end; exact equality is
1784 asserted after the loop */
1785 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001786
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 switch (*f) {
1788 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001789 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 const int ordinal = va_arg(vargs, int);
1791 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001793 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001794 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001796 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001797 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 case 'p':
1799 /* unused, since we already have the result */
1800 if (*f == 'p')
1801 (void) va_arg(vargs, void *);
1802 else
1803 (void) va_arg(vargs, int);
1804 /* extract the result from numberresults and append. */
1805 for (; *numberresult; ++i, ++numberresult)
1806 PyUnicode_WRITE(kind, data, i, *numberresult);
1807 /* skip over the separating '\0' */
1808 assert(*numberresult == '\0');
1809 numberresult++;
1810 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001811 break;
1812 case 's':
1813 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001814 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001816 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 size = PyUnicode_GET_LENGTH(*callresult);
1818 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001819 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1820 *callresult, 0,
1821 size) < 0)
1822 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001824 /* We're done with the unicode()/repr() => forget it */
1825 Py_DECREF(*callresult);
1826 /* switch to next unicode()/repr() result */
1827 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001828 break;
1829 }
1830 case 'U':
1831 {
1832 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 Py_ssize_t size;
1834 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
1835 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001836 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1837 obj, 0,
1838 size) < 0)
1839 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001841 break;
1842 }
1843 case 'V':
1844 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001847 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001848 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 size = PyUnicode_GET_LENGTH(obj);
1850 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001851 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1852 obj, 0,
1853 size) < 0)
1854 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001856 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 size = PyUnicode_GET_LENGTH(*callresult);
1858 assert(PyUnicode_KIND(*callresult) <=
1859 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001860 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1861 *callresult,
1862 0, size) < 0)
1863 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00001865 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001867 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 break;
1869 }
1870 case 'S':
1871 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001872 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 /* unused, since we already have the result */
1875 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02001877 if (PyUnicode_CopyCharacters((PyObject*)string, i,
1878 *callresult, 0,
1879 PyUnicode_GET_LENGTH(*callresult)) < 0)
1880 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001882 /* We're done with the unicode()/repr() => forget it */
1883 Py_DECREF(*callresult);
1884 /* switch to next unicode()/repr() result */
1885 ++callresult;
1886 break;
1887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001888 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001890 break;
1891 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 for (; *p; ++p, ++i)
1893 PyUnicode_WRITE(kind, data, i, *p);
1894 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00001895 goto end;
1896 }
Victor Stinner1205f272010-09-11 00:54:47 +00001897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 else {
1899 assert(i < PyUnicode_GET_LENGTH(string));
1900 PyUnicode_WRITE(kind, data, i++, *f);
1901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00001904
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001906 if (callresults)
1907 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 if (numberresults)
1909 PyObject_Free(numberresults);
1910 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001912 if (callresults) {
1913 PyObject **callresult2 = callresults;
1914 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001915 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001916 ++callresult2;
1917 }
1918 PyObject_Free(callresults);
1919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 if (numberresults)
1921 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001922 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001923}
1924
Walter Dörwaldd2034312007-05-18 16:29:38 +00001925PyObject *
1926PyUnicode_FromFormat(const char *format, ...)
1927{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 PyObject* ret;
1929 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001930
1931#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001932 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001933#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001934 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001935#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001936 ret = PyUnicode_FromFormatV(format, vargs);
1937 va_end(vargs);
1938 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001939}
1940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941#ifdef HAVE_WCHAR_H
1942
Victor Stinner5593d8a2010-10-02 11:11:27 +00001943/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1944 convert a Unicode object to a wide character string.
1945
Victor Stinnerd88d9832011-09-06 02:00:05 +02001946 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001947 character) required to convert the unicode object. Ignore size argument.
1948
Victor Stinnerd88d9832011-09-06 02:00:05 +02001949 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001950 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001951 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001952static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001953unicode_aswidechar(PyUnicodeObject *unicode,
1954 wchar_t *w,
1955 Py_ssize_t size)
1956{
Victor Stinner5593d8a2010-10-02 11:11:27 +00001957 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 const wchar_t *wstr;
1959
1960 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
1961 if (wstr == NULL)
1962 return -1;
1963
Victor Stinner5593d8a2010-10-02 11:11:27 +00001964 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00001965 if (size > res)
1966 size = res + 1;
1967 else
1968 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00001970 return res;
1971 }
1972 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001974}
1975
1976Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001977PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001978 wchar_t *w,
1979 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980{
1981 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001982 PyErr_BadInternalCall();
1983 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001985 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
Victor Stinner137c34c2010-09-29 10:25:54 +00001988wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001989PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001990 Py_ssize_t *size)
1991{
1992 wchar_t* buffer;
1993 Py_ssize_t buflen;
1994
1995 if (unicode == NULL) {
1996 PyErr_BadInternalCall();
1997 return NULL;
1998 }
1999
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002000 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 if (buflen == -1)
2002 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002003 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002004 PyErr_NoMemory();
2005 return NULL;
2006 }
2007
Victor Stinner137c34c2010-09-29 10:25:54 +00002008 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2009 if (buffer == NULL) {
2010 PyErr_NoMemory();
2011 return NULL;
2012 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002013 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 if (buflen == -1)
2015 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002016 if (size != NULL)
2017 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002018 return buffer;
2019}
2020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Alexander Belopolsky40018472011-02-26 01:02:56 +00002023PyObject *
2024PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002027 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002028 PyErr_SetString(PyExc_ValueError,
2029 "chr() arg not in range(0x110000)");
2030 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002031 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (ordinal < 256)
2034 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 v = PyUnicode_New(1, ordinal);
2037 if (v == NULL)
2038 return NULL;
2039 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2040 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002041}
2042
Alexander Belopolsky40018472011-02-26 01:02:56 +00002043PyObject *
2044PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002046 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002048 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 Py_INCREF(obj);
2050 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002051 }
2052 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 /* For a Unicode subtype that's not a Unicode object,
2054 return a true Unicode object with the same data. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 if (PyUnicode_READY(obj) == -1)
2056 return NULL;
2057 return substring((PyUnicodeObject *)obj, 0, PyUnicode_GET_LENGTH(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002058 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002059 PyErr_Format(PyExc_TypeError,
2060 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002061 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002062 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002063}
2064
Alexander Belopolsky40018472011-02-26 01:02:56 +00002065PyObject *
2066PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002067 const char *encoding,
2068 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002069{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002070 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002071 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002072
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002074 PyErr_BadInternalCall();
2075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002077
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002078 /* Decoding bytes objects is the most common case and should be fast */
2079 if (PyBytes_Check(obj)) {
2080 if (PyBytes_GET_SIZE(obj) == 0) {
2081 Py_INCREF(unicode_empty);
2082 v = (PyObject *) unicode_empty;
2083 }
2084 else {
2085 v = PyUnicode_Decode(
2086 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2087 encoding, errors);
2088 }
2089 return v;
2090 }
2091
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002092 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002093 PyErr_SetString(PyExc_TypeError,
2094 "decoding str is not supported");
2095 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002096 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002097
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002098 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2099 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2100 PyErr_Format(PyExc_TypeError,
2101 "coercing to str: need bytes, bytearray "
2102 "or buffer-like object, %.80s found",
2103 Py_TYPE(obj)->tp_name);
2104 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002105 }
Tim Petersced69f82003-09-16 20:30:58 +00002106
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002107 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002109 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 }
Tim Petersced69f82003-09-16 20:30:58 +00002111 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002112 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002113
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002114 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002115 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116}
2117
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognized.

   lower_len is the capacity of `lower`, terminator included.

   Returns 1 on success, with `lower` null-terminated.  Returns 0 when the
   name needs more than lower_len-1 characters; in that case `lower` has
   been partially written and is not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the buffer, reserved for the terminator. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2150
Alexander Belopolsky40018472011-02-26 01:02:56 +00002151PyObject *
2152PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002153 Py_ssize_t size,
2154 const char *encoding,
2155 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002156{
2157 PyObject *buffer = NULL, *unicode;
2158 Py_buffer info;
2159 char lower[11]; /* Enough for any encoding shortcut */
2160
2161 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002162 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002163
2164 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002165 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002166 if ((strcmp(lower, "utf-8") == 0) ||
2167 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002168 return PyUnicode_DecodeUTF8(s, size, errors);
2169 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002170 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002171 (strcmp(lower, "iso-8859-1") == 0))
2172 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002173#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002174 else if (strcmp(lower, "mbcs") == 0)
2175 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002176#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002177 else if (strcmp(lower, "ascii") == 0)
2178 return PyUnicode_DecodeASCII(s, size, errors);
2179 else if (strcmp(lower, "utf-16") == 0)
2180 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2181 else if (strcmp(lower, "utf-32") == 0)
2182 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002184
2185 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002186 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002187 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002188 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002189 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002190 if (buffer == NULL)
2191 goto onError;
2192 unicode = PyCodec_Decode(buffer, encoding, errors);
2193 if (unicode == NULL)
2194 goto onError;
2195 if (!PyUnicode_Check(unicode)) {
2196 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002197 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002198 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 Py_DECREF(unicode);
2200 goto onError;
2201 }
2202 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 if (PyUnicode_READY(unicode)) {
2204 Py_DECREF(unicode);
2205 return NULL;
2206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002208
Benjamin Peterson29060642009-01-31 22:14:21 +00002209 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210 Py_XDECREF(buffer);
2211 return NULL;
2212}
2213
Alexander Belopolsky40018472011-02-26 01:02:56 +00002214PyObject *
2215PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002216 const char *encoding,
2217 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002218{
2219 PyObject *v;
2220
2221 if (!PyUnicode_Check(unicode)) {
2222 PyErr_BadArgument();
2223 goto onError;
2224 }
2225
2226 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002227 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002228
2229 /* Decode via the codec registry */
2230 v = PyCodec_Decode(unicode, encoding, errors);
2231 if (v == NULL)
2232 goto onError;
2233 return v;
2234
Benjamin Peterson29060642009-01-31 22:14:21 +00002235 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002236 return NULL;
2237}
2238
Alexander Belopolsky40018472011-02-26 01:02:56 +00002239PyObject *
2240PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002241 const char *encoding,
2242 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002243{
2244 PyObject *v;
2245
2246 if (!PyUnicode_Check(unicode)) {
2247 PyErr_BadArgument();
2248 goto onError;
2249 }
2250
2251 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002252 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002253
2254 /* Decode via the codec registry */
2255 v = PyCodec_Decode(unicode, encoding, errors);
2256 if (v == NULL)
2257 goto onError;
2258 if (!PyUnicode_Check(v)) {
2259 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002260 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002261 Py_TYPE(v)->tp_name);
2262 Py_DECREF(v);
2263 goto onError;
2264 }
2265 return v;
2266
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002268 return NULL;
2269}
2270
Alexander Belopolsky40018472011-02-26 01:02:56 +00002271PyObject *
2272PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002273 Py_ssize_t size,
2274 const char *encoding,
2275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276{
2277 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002278
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279 unicode = PyUnicode_FromUnicode(s, size);
2280 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2283 Py_DECREF(unicode);
2284 return v;
2285}
2286
Alexander Belopolsky40018472011-02-26 01:02:56 +00002287PyObject *
2288PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002289 const char *encoding,
2290 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002291{
2292 PyObject *v;
2293
2294 if (!PyUnicode_Check(unicode)) {
2295 PyErr_BadArgument();
2296 goto onError;
2297 }
2298
2299 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002300 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002301
2302 /* Encode via the codec registry */
2303 v = PyCodec_Encode(unicode, encoding, errors);
2304 if (v == NULL)
2305 goto onError;
2306 return v;
2307
Benjamin Peterson29060642009-01-31 22:14:21 +00002308 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002309 return NULL;
2310}
2311
Victor Stinnerad158722010-10-27 00:25:46 +00002312PyObject *
2313PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002314{
Victor Stinner99b95382011-07-04 14:23:54 +02002315#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002316 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2317 PyUnicode_GET_SIZE(unicode),
2318 NULL);
2319#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002321#else
Victor Stinner793b5312011-04-27 00:24:21 +02002322 PyInterpreterState *interp = PyThreadState_GET()->interp;
2323 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2324 cannot use it to encode and decode filenames before it is loaded. Load
2325 the Python codec requires to encode at least its own filename. Use the C
2326 version of the locale codec until the codec registry is initialized and
2327 the Python codec is loaded.
2328
2329 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2330 cannot only rely on it: check also interp->fscodec_initialized for
2331 subinterpreters. */
2332 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002333 return PyUnicode_AsEncodedString(unicode,
2334 Py_FileSystemDefaultEncoding,
2335 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002336 }
2337 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002338 /* locale encoding with surrogateescape */
2339 wchar_t *wchar;
2340 char *bytes;
2341 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002342 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002343
2344 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2345 if (wchar == NULL)
2346 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002347 bytes = _Py_wchar2char(wchar, &error_pos);
2348 if (bytes == NULL) {
2349 if (error_pos != (size_t)-1) {
2350 char *errmsg = strerror(errno);
2351 PyObject *exc = NULL;
2352 if (errmsg == NULL)
2353 errmsg = "Py_wchar2char() failed";
2354 raise_encode_exception(&exc,
2355 "filesystemencoding",
2356 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2357 error_pos, error_pos+1,
2358 errmsg);
2359 Py_XDECREF(exc);
2360 }
2361 else
2362 PyErr_NoMemory();
2363 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002364 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002365 }
2366 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002367
2368 bytes_obj = PyBytes_FromString(bytes);
2369 PyMem_Free(bytes);
2370 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002371 }
Victor Stinnerad158722010-10-27 00:25:46 +00002372#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002373}
2374
Alexander Belopolsky40018472011-02-26 01:02:56 +00002375PyObject *
2376PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002377 const char *encoding,
2378 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379{
2380 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002381 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002382
Guido van Rossumd57fd912000-03-10 22:53:23 +00002383 if (!PyUnicode_Check(unicode)) {
2384 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 }
Fred Drakee4315f52000-05-09 19:53:39 +00002387
Victor Stinner2f283c22011-03-02 01:21:46 +00002388 if (encoding == NULL) {
2389 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002391 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002393 }
Fred Drakee4315f52000-05-09 19:53:39 +00002394
2395 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002396 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002397 if ((strcmp(lower, "utf-8") == 0) ||
2398 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002399 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002400 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002401 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002402 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002404 }
Victor Stinner37296e82010-06-10 13:36:23 +00002405 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002406 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002407 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002409#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002410 else if (strcmp(lower, "mbcs") == 0)
2411 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2412 PyUnicode_GET_SIZE(unicode),
2413 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002414#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002415 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418
2419 /* Encode via the codec registry */
2420 v = PyCodec_Encode(unicode, encoding, errors);
2421 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002422 return NULL;
2423
2424 /* The normal path */
2425 if (PyBytes_Check(v))
2426 return v;
2427
2428 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002429 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002430 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002431 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002432
2433 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2434 "encoder %s returned bytearray instead of bytes",
2435 encoding);
2436 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002437 Py_DECREF(v);
2438 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002439 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002440
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002441 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2442 Py_DECREF(v);
2443 return b;
2444 }
2445
2446 PyErr_Format(PyExc_TypeError,
2447 "encoder did not return a bytes object (type=%.400s)",
2448 Py_TYPE(v)->tp_name);
2449 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002450 return NULL;
2451}
2452
Alexander Belopolsky40018472011-02-26 01:02:56 +00002453PyObject *
2454PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002455 const char *encoding,
2456 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002457{
2458 PyObject *v;
2459
2460 if (!PyUnicode_Check(unicode)) {
2461 PyErr_BadArgument();
2462 goto onError;
2463 }
2464
2465 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002466 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002467
2468 /* Encode via the codec registry */
2469 v = PyCodec_Encode(unicode, encoding, errors);
2470 if (v == NULL)
2471 goto onError;
2472 if (!PyUnicode_Check(v)) {
2473 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002474 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002475 Py_TYPE(v)->tp_name);
2476 Py_DECREF(v);
2477 goto onError;
2478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002480
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 return NULL;
2483}
2484
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002485PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002486PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002487 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002488 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2489}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002490
Christian Heimes5894ba72007-11-04 11:43:14 +00002491PyObject*
2492PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2493{
Victor Stinner99b95382011-07-04 14:23:54 +02002494#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002495 return PyUnicode_DecodeMBCS(s, size, NULL);
2496#elif defined(__APPLE__)
2497 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2498#else
Victor Stinner793b5312011-04-27 00:24:21 +02002499 PyInterpreterState *interp = PyThreadState_GET()->interp;
2500 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2501 cannot use it to encode and decode filenames before it is loaded. Load
2502 the Python codec requires to encode at least its own filename. Use the C
2503 version of the locale codec until the codec registry is initialized and
2504 the Python codec is loaded.
2505
2506 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2507 cannot only rely on it: check also interp->fscodec_initialized for
2508 subinterpreters. */
2509 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002510 return PyUnicode_Decode(s, size,
2511 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002512 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002513 }
2514 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002515 /* locale encoding with surrogateescape */
2516 wchar_t *wchar;
2517 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002518 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002519
2520 if (s[size] != '\0' || size != strlen(s)) {
2521 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2522 return NULL;
2523 }
2524
Victor Stinner168e1172010-10-16 23:16:16 +00002525 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002526 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002527 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002528
Victor Stinner168e1172010-10-16 23:16:16 +00002529 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002530 PyMem_Free(wchar);
2531 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002532 }
Victor Stinnerad158722010-10-27 00:25:46 +00002533#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002534}
2535
Martin v. Löwis011e8422009-05-05 04:43:17 +00002536
2537int
2538PyUnicode_FSConverter(PyObject* arg, void* addr)
2539{
2540 PyObject *output = NULL;
2541 Py_ssize_t size;
2542 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002543 if (arg == NULL) {
2544 Py_DECREF(*(PyObject**)addr);
2545 return 1;
2546 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002547 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002548 output = arg;
2549 Py_INCREF(output);
2550 }
2551 else {
2552 arg = PyUnicode_FromObject(arg);
2553 if (!arg)
2554 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002555 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002556 Py_DECREF(arg);
2557 if (!output)
2558 return 0;
2559 if (!PyBytes_Check(output)) {
2560 Py_DECREF(output);
2561 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2562 return 0;
2563 }
2564 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002565 size = PyBytes_GET_SIZE(output);
2566 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002567 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002568 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002569 Py_DECREF(output);
2570 return 0;
2571 }
2572 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002573 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002574}
2575
2576
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002577int
2578PyUnicode_FSDecoder(PyObject* arg, void* addr)
2579{
2580 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002581 if (arg == NULL) {
2582 Py_DECREF(*(PyObject**)addr);
2583 return 1;
2584 }
2585 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 if (PyUnicode_READY(arg))
2587 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002588 output = arg;
2589 Py_INCREF(output);
2590 }
2591 else {
2592 arg = PyBytes_FromObject(arg);
2593 if (!arg)
2594 return 0;
2595 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2596 PyBytes_GET_SIZE(arg));
2597 Py_DECREF(arg);
2598 if (!output)
2599 return 0;
2600 if (!PyUnicode_Check(output)) {
2601 Py_DECREF(output);
2602 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2603 return 0;
2604 }
2605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2607 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002608 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2609 Py_DECREF(output);
2610 return 0;
2611 }
2612 *(PyObject**)addr = output;
2613 return Py_CLEANUP_SUPPORTED;
2614}
2615
2616
Martin v. Löwis5b222132007-06-10 09:51:05 +00002617char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002619{
Christian Heimesf3863112007-11-22 07:46:41 +00002620 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2622
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002623 if (!PyUnicode_Check(unicode)) {
2624 PyErr_BadArgument();
2625 return NULL;
2626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002628 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629
2630 if (_PyUnicode_UTF8(unicode) == NULL) {
2631 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2632 if (bytes == NULL)
2633 return NULL;
2634 u->_base.utf8 = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2635 if (u->_base.utf8 == NULL) {
2636 Py_DECREF(bytes);
2637 return NULL;
2638 }
2639 u->_base.utf8_length = PyBytes_GET_SIZE(bytes);
2640 Py_MEMCPY(u->_base.utf8, PyBytes_AS_STRING(bytes), u->_base.utf8_length + 1);
2641 Py_DECREF(bytes);
2642 }
2643
2644 if (psize)
2645 *psize = _PyUnicode_UTF8_LENGTH(unicode);
2646 return _PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002647}
2648
2649char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2653}
2654
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each time
   it is called on an object whose wchar_t representation does not exist yet
   and therefore has to be built. */
int unicode_as_unicode_calls = 0;
#endif
2658
2659
/* Return the wchar_t (Py_UNICODE) representation of `unicode`, building it
   and storing it in the object's wstr slot if it does not exist yet.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH() of the object.

   The buffer is built from the object's 1-, 2- or 4-byte character data.
   Combinations where the character width equals sizeof(wchar_t) are treated
   as impossible here (Py_FatalError): the messages state that wstr and the
   character data should already share memory in that case.

   Returns NULL with an exception set on a bad argument or when the buffer
   cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each of them
               needs one extra wchar_t for its surrogate pair. */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* Room for every character, the extra surrogates and the
               terminating 0. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large code points. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check on the size computed above. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character plus the
               terminating 0. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
2776
Alexander Belopolsky40018472011-02-26 01:02:56 +00002777Py_UNICODE *
2778PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781}
2782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783
Alexander Belopolsky40018472011-02-26 01:02:56 +00002784Py_ssize_t
2785PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786{
2787 if (!PyUnicode_Check(unicode)) {
2788 PyErr_BadArgument();
2789 goto onError;
2790 }
2791 return PyUnicode_GET_SIZE(unicode);
2792
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 return -1;
2795}
2796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797Py_ssize_t
2798PyUnicode_GetLength(PyObject *unicode)
2799{
2800 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2801 PyErr_BadArgument();
2802 return -1;
2803 }
2804
2805 return PyUnicode_GET_LENGTH(unicode);
2806}
2807
2808Py_UCS4
2809PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
2810{
2811 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) != -1) {
2812 return PyErr_BadArgument();
2813 return (Py_UCS4)-1;
2814 }
2815 return PyUnicode_READ_CHAR(unicode, index);
2816}
2817
2818int
2819PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
2820{
2821 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
2822 return PyErr_BadArgument();
2823 return -1;
2824 }
2825
2826 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2827 index, ch);
2828 return 0;
2829}
2830
/* Return the name of the default encoding, which is always "utf-8".  The
   string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2836
Victor Stinner554f3f02010-06-16 23:33:54 +00002837/* create or adjust a UnicodeDecodeError */
2838static void
2839make_decode_exception(PyObject **exceptionObject,
2840 const char *encoding,
2841 const char *input, Py_ssize_t length,
2842 Py_ssize_t startpos, Py_ssize_t endpos,
2843 const char *reason)
2844{
2845 if (*exceptionObject == NULL) {
2846 *exceptionObject = PyUnicodeDecodeError_Create(
2847 encoding, input, length, startpos, endpos, reason);
2848 }
2849 else {
2850 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2851 goto onError;
2852 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2853 goto onError;
2854 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2855 goto onError;
2856 }
2857 return;
2858
2859onError:
2860 Py_DECREF(*exceptionObject);
2861 *exceptionObject = NULL;
2862}
2863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002864/* error handling callback helper:
2865 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002866 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002867 and adjust various state variables.
2868 return 0 on success, -1 on error
2869*/
2870
Alexander Belopolsky40018472011-02-26 01:02:56 +00002871static int
2872unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002873 const char *encoding, const char *reason,
2874 const char **input, const char **inend, Py_ssize_t *startinpos,
2875 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2876 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002877{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002878 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879
2880 PyObject *restuple = NULL;
2881 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002882 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002883 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002884 Py_ssize_t requiredsize;
2885 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002887 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002888 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002889 int res = -1;
2890
2891 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 *errorHandler = PyCodec_LookupError(errors);
2893 if (*errorHandler == NULL)
2894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 }
2896
Victor Stinner554f3f02010-06-16 23:33:54 +00002897 make_decode_exception(exceptionObject,
2898 encoding,
2899 *input, *inend - *input,
2900 *startinpos, *endinpos,
2901 reason);
2902 if (*exceptionObject == NULL)
2903 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002904
2905 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2906 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002908 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002909 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002911 }
2912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002914
2915 /* Copy back the bytes variables, which might have been modified by the
2916 callback */
2917 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2918 if (!inputobj)
2919 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002920 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002922 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002923 *input = PyBytes_AS_STRING(inputobj);
2924 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002925 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002926 /* we can DECREF safely, as the exception has another reference,
2927 so the object won't go away. */
2928 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002931 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002932 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2934 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002935 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936
2937 /* need more space? (at least enough for what we
2938 have+the replacement+the rest of the string (starting
2939 at the new input position), so we won't have to check space
2940 when there are no errors in the rest of the string) */
2941 repptr = PyUnicode_AS_UNICODE(repunicode);
2942 repsize = PyUnicode_GET_SIZE(repunicode);
2943 requiredsize = *outpos + repsize + insize-newpos;
2944 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 if (requiredsize<2*outsize)
2946 requiredsize = 2*outsize;
2947 if (_PyUnicode_Resize(output, requiredsize) < 0)
2948 goto onError;
2949 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 }
2951 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002952 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 Py_UNICODE_COPY(*outptr, repptr, repsize);
2954 *outptr += repsize;
2955 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 /* we made it! */
2958 res = 0;
2959
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002961 Py_XDECREF(restuple);
2962 return res;
2963}
2964
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002965/* --- UTF-7 Codec -------------------------------------------------------- */
2966
Antoine Pitrou244651a2009-05-04 18:56:13 +00002967/* See RFC2152 for details. We encode conservatively and decode liberally. */
2968
/* Three simple macros defining base-64.  Each of them (and DECODE_DIRECT
   below) may evaluate its argument more than once, so the argument must not
   have side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; anything else
   (in particular '/') yields 63, so c is not validated here. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  Higher bits of
   n are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself. We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
2998 ((c) <= 127 && (c) != '+')
2999
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3033
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character; only 1..127 can ever be encoded directly.
 * directO  - non-zero if "Set O" characters (category 1) go out as-is.
 * directWS - non-zero if whitespace (category 2) goes out as-is.
 *
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) are treated as a single operand.  Note that
 * c is evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003045
Alexander Belopolsky40018472011-02-26 01:02:56 +00003046PyObject *
3047PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003048 Py_ssize_t size,
3049 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003050{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003051 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3052}
3053
Antoine Pitrou244651a2009-05-04 18:56:13 +00003054/* The decoder. The only state we preserve is our read position,
3055 * i.e. how many characters we have consumed. So if we end in the
3056 * middle of a shift sequence we have to back off the read position
3057 * and the output to the beginning of the sequence, otherwise we lose
3058 * all the shift state (seen bits, number of bits seen, high
3059 * surrogate). */
3060
Alexander Belopolsky40018472011-02-26 01:02:56 +00003061PyObject *
3062PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003063 Py_ssize_t size,
3064 const char *errors,
3065 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003067 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003068 Py_ssize_t startinpos;
3069 Py_ssize_t endinpos;
3070 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003071 const char *e;
3072 PyUnicodeObject *unicode;
3073 Py_UNICODE *p;
3074 const char *errmsg = "";
3075 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003076 Py_UNICODE *shiftOutStart;
3077 unsigned int base64bits = 0;
3078 unsigned long base64buffer = 0;
3079 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 PyObject *errorHandler = NULL;
3081 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003082
3083 unicode = _PyUnicode_New(size);
3084 if (!unicode)
3085 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003086 if (size == 0) {
3087 if (consumed)
3088 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003089 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003093 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003094 e = s + size;
3095
3096 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003098 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003099 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003100
Antoine Pitrou244651a2009-05-04 18:56:13 +00003101 if (inShift) { /* in a base-64 section */
3102 if (IS_BASE64(ch)) { /* consume a base-64 character */
3103 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3104 base64bits += 6;
3105 s++;
3106 if (base64bits >= 16) {
3107 /* we have enough bits for a UTF-16 value */
3108 Py_UNICODE outCh = (Py_UNICODE)
3109 (base64buffer >> (base64bits-16));
3110 base64bits -= 16;
3111 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3112 if (surrogate) {
3113 /* expecting a second surrogate */
3114 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3115#ifdef Py_UNICODE_WIDE
3116 *p++ = (((surrogate & 0x3FF)<<10)
3117 | (outCh & 0x3FF)) + 0x10000;
3118#else
3119 *p++ = surrogate;
3120 *p++ = outCh;
3121#endif
3122 surrogate = 0;
3123 }
3124 else {
3125 surrogate = 0;
3126 errmsg = "second surrogate missing";
3127 goto utf7Error;
3128 }
3129 }
3130 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3131 /* first surrogate */
3132 surrogate = outCh;
3133 }
3134 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3135 errmsg = "unexpected second surrogate";
3136 goto utf7Error;
3137 }
3138 else {
3139 *p++ = outCh;
3140 }
3141 }
3142 }
3143 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003144 inShift = 0;
3145 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003146 if (surrogate) {
3147 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003148 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003149 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003150 if (base64bits > 0) { /* left-over bits */
3151 if (base64bits >= 6) {
3152 /* We've seen at least one base-64 character */
3153 errmsg = "partial character in shift sequence";
3154 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003155 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003156 else {
3157 /* Some bits remain; they should be zero */
3158 if (base64buffer != 0) {
3159 errmsg = "non-zero padding bits in shift sequence";
3160 goto utf7Error;
3161 }
3162 }
3163 }
3164 if (ch != '-') {
3165 /* '-' is absorbed; other terminating
3166 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003167 *p++ = ch;
3168 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003169 }
3170 }
3171 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003173 s++; /* consume '+' */
3174 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003175 s++;
3176 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003177 }
3178 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003179 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003180 shiftOutStart = p;
3181 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003182 }
3183 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003184 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003185 *p++ = ch;
3186 s++;
3187 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003188 else {
3189 startinpos = s-starts;
3190 s++;
3191 errmsg = "unexpected special character";
3192 goto utf7Error;
3193 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003194 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003195utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003196 outpos = p-PyUnicode_AS_UNICODE(unicode);
3197 endinpos = s-starts;
3198 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 errors, &errorHandler,
3200 "utf7", errmsg,
3201 &starts, &e, &startinpos, &endinpos, &exc, &s,
3202 &unicode, &outpos, &p))
3203 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003204 }
3205
Antoine Pitrou244651a2009-05-04 18:56:13 +00003206 /* end of string */
3207
3208 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3209 /* if we're in an inconsistent state, that's an error */
3210 if (surrogate ||
3211 (base64bits >= 6) ||
3212 (base64bits > 0 && base64buffer != 0)) {
3213 outpos = p-PyUnicode_AS_UNICODE(unicode);
3214 endinpos = size;
3215 if (unicode_decode_call_errorhandler(
3216 errors, &errorHandler,
3217 "utf7", "unterminated shift sequence",
3218 &starts, &e, &startinpos, &endinpos, &exc, &s,
3219 &unicode, &outpos, &p))
3220 goto onError;
3221 if (s < e)
3222 goto restart;
3223 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003224 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003225
3226 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003227 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003228 if (inShift) {
3229 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003230 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003231 }
3232 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003233 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003234 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003235 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003236
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003237 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003238 goto onError;
3239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003242 if (PyUnicode_READY(unicode) == -1) {
3243 Py_DECREF(unicode);
3244 return NULL;
3245 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003246 return (PyObject *)unicode;
3247
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 Py_XDECREF(errorHandler);
3250 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003251 Py_DECREF(unicode);
3252 return NULL;
3253}
3254
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
3257PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 Py_ssize_t size,
3259 int base64SetO,
3260 int base64WhiteSpace,
3261 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003262{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003263 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003264 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003265 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003266 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003267 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003268 unsigned int base64bits = 0;
3269 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003270 char * out;
3271 char * start;
3272
3273 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003275
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003276 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003277 return PyErr_NoMemory();
3278
Antoine Pitrou244651a2009-05-04 18:56:13 +00003279 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003280 if (v == NULL)
3281 return NULL;
3282
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003284 for (;i < size; ++i) {
3285 Py_UNICODE ch = s[i];
3286
Antoine Pitrou244651a2009-05-04 18:56:13 +00003287 if (inShift) {
3288 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3289 /* shifting out */
3290 if (base64bits) { /* output remaining bits */
3291 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3292 base64buffer = 0;
3293 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003294 }
3295 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003296 /* Characters not in the BASE64 set implicitly unshift the sequence
3297 so no '-' is required, except if the character is itself a '-' */
3298 if (IS_BASE64(ch) || ch == '-') {
3299 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003300 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003301 *out++ = (char) ch;
3302 }
3303 else {
3304 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003305 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003306 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003307 else { /* not in a shift sequence */
3308 if (ch == '+') {
3309 *out++ = '+';
3310 *out++ = '-';
3311 }
3312 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3313 *out++ = (char) ch;
3314 }
3315 else {
3316 *out++ = '+';
3317 inShift = 1;
3318 goto encode_char;
3319 }
3320 }
3321 continue;
3322encode_char:
3323#ifdef Py_UNICODE_WIDE
3324 if (ch >= 0x10000) {
3325 /* code first surrogate */
3326 base64bits += 16;
3327 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3328 while (base64bits >= 6) {
3329 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3330 base64bits -= 6;
3331 }
3332 /* prepare second surrogate */
3333 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3334 }
3335#endif
3336 base64bits += 16;
3337 base64buffer = (base64buffer << 16) | ch;
3338 while (base64bits >= 6) {
3339 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3340 base64bits -= 6;
3341 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003342 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003343 if (base64bits)
3344 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3345 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003346 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003347 if (_PyBytes_Resize(&v, out - start) < 0)
3348 return NULL;
3349 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003350}
3351
Antoine Pitrou244651a2009-05-04 18:56:13 +00003352#undef IS_BASE64
3353#undef FROM_BASE64
3354#undef TO_BASE64
3355#undef DECODE_DIRECT
3356#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003357
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358/* --- UTF-8 Codec -------------------------------------------------------- */
3359
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, hence it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3381
Alexander Belopolsky40018472011-02-26 01:02:56 +00003382PyObject *
3383PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003384 Py_ssize_t size,
3385 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386{
Walter Dörwald69652032004-09-07 20:24:22 +00003387 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3388}
3389
Antoine Pitrouab868312009-01-10 15:40:25 +00003390/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3391#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3392
3393/* Mask to quickly check whether a C 'long' contains a
3394 non-ASCII, UTF8-encoded char. */
3395#if (SIZEOF_LONG == 8)
3396# define ASCII_CHAR_MASK 0x8080808080808080L
3397#elif (SIZEOF_LONG == 4)
3398# define ASCII_CHAR_MASK 0x80808080L
3399#else
3400# error C 'long' size should be either 4 or 8!
3401#endif
3402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403/* Scans a UTF-8 string and returns the maximum character to be expected,
3404 the size of the decoded unicode string and if any major errors were
3405 encountered.
3406
3407 This function does check basic UTF-8 sanity, it does however NOT CHECK
3408 if the string contains surrogates, and if all continuation bytes are
3409 within the correct ranges, these checks are performed in
3410 PyUnicode_DecodeUTF8Stateful.
3411
3412 If it sets has_errors to 1, it means the value of unicode_size and max_char
3413 will be bogus and you should not rely on useful information in them.
3414 */
3415static Py_UCS4
3416utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3417 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3418 int *has_errors)
3419{
3420 Py_ssize_t n;
3421 Py_ssize_t char_count = 0;
3422 Py_UCS4 max_char = 127, new_max;
3423 Py_UCS4 upper_bound;
3424 const unsigned char *p = (const unsigned char *)s;
3425 const unsigned char *end = p + string_size;
3426 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3427 int err = 0;
3428
3429 for (; p < end && !err; ++p, ++char_count) {
3430 /* Only check value if it's not a ASCII char... */
3431 if (*p < 0x80) {
3432 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3433 an explanation. */
3434 if (!((size_t) p & LONG_PTR_MASK)) {
3435 /* Help register allocation */
3436 register const unsigned char *_p = p;
3437 while (_p < aligned_end) {
3438 unsigned long value = *(unsigned long *) _p;
3439 if (value & ASCII_CHAR_MASK)
3440 break;
3441 _p += SIZEOF_LONG;
3442 char_count += SIZEOF_LONG;
3443 }
3444 p = _p;
3445 if (p == end)
3446 break;
3447 }
3448 }
3449 if (*p >= 0x80) {
3450 n = utf8_code_length[*p];
3451 new_max = max_char;
3452 switch (n) {
3453 /* invalid start byte */
3454 case 0:
3455 err = 1;
3456 break;
3457 case 2:
3458 /* Code points between 0x00FF and 0x07FF inclusive.
3459 Approximate the upper bound of the code point,
3460 if this flips over 255 we can be sure it will be more
3461 than 255 and the string will need 2 bytes per code coint,
3462 if it stays under or equal to 255, we can be sure 1 byte
3463 is enough.
3464 ((*p & 0b00011111) << 6) | 0b00111111 */
3465 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3466 if (max_char < upper_bound)
3467 new_max = upper_bound;
3468 /* Ensure we track at least that we left ASCII space. */
3469 if (new_max < 128)
3470 new_max = 128;
3471 break;
3472 case 3:
3473 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3474 always > 255 and <= 65535 and will always need 2 bytes. */
3475 if (max_char < 65535)
3476 new_max = 65535;
3477 break;
3478 case 4:
3479 /* Code point will be above 0xFFFF for sure in this case. */
3480 new_max = 65537;
3481 break;
3482 /* Internal error, this should be caught by the first if */
3483 case 1:
3484 default:
3485 assert(0 && "Impossible case in utf8_max_char_and_size");
3486 err = 1;
3487 }
3488 /* Instead of number of overall bytes for this code point,
3489 n containts the number of following bytes: */
3490 --n;
3491 /* Check if the follow up chars are all valid continuation bytes */
3492 if (n >= 1) {
3493 const unsigned char *cont;
3494 if ((p + n) >= end) {
3495 if (consumed == 0)
3496 /* incomplete data, non-incremental decoding */
3497 err = 1;
3498 break;
3499 }
3500 for (cont = p + 1; cont < (p + n); ++cont) {
3501 if ((*cont & 0xc0) != 0x80) {
3502 err = 1;
3503 break;
3504 }
3505 }
3506 p += n;
3507 }
3508 else
3509 err = 1;
3510 max_char = new_max;
3511 }
3512 }
3513
3514 if (unicode_size)
3515 *unicode_size = char_count;
3516 if (has_errors)
3517 *has_errors = err;
3518 return max_char;
3519}
3520
/* Store `value` at position `index` of `buf`, converting it to the element
   type selected by `kind`.  Like PyUnicode_WRITE, but additionally
   understands PyUnicode_WCHAR_KIND, i.e. it can also write into the wstr
   field of the legacy unicode representation.  Any kind other than the
   WCHAR, 1BYTE and 2BYTE ones is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                   \
    do {                                                                  \
        switch ((int)(kind)) {                                            \
        case PyUnicode_WCHAR_KIND:                                        \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);         \
            break;                                                        \
        case PyUnicode_1BYTE_KIND:                                        \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);   \
            break;                                                        \
        case PyUnicode_2BYTE_KIND:                                        \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);               \
            break;                                                        \
        default:                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);               \
            break;                                                        \
        }                                                                 \
    } while (0)
3535
Alexander Belopolsky40018472011-02-26 01:02:56 +00003536PyObject *
3537PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538 Py_ssize_t size,
3539 const char *errors,
3540 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003541{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003544 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003545 Py_ssize_t startinpos;
3546 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003547 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003549 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 PyObject *errorHandler = NULL;
3551 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 Py_UCS4 maxchar = 0;
3553 Py_ssize_t unicode_size;
3554 Py_ssize_t i;
3555 int kind;
3556 void *data;
3557 int has_errors;
3558 Py_UNICODE *error_outptr;
3559#if SIZEOF_WCHAR_T == 2
3560 Py_ssize_t wchar_offset = 0;
3561#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562
Walter Dörwald69652032004-09-07 20:24:22 +00003563 if (size == 0) {
3564 if (consumed)
3565 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3569 consumed, &has_errors);
3570 if (has_errors) {
3571 unicode = _PyUnicode_New(size);
3572 if (!unicode)
3573 return NULL;
3574 kind = PyUnicode_WCHAR_KIND;
3575 data = PyUnicode_AS_UNICODE(unicode);
3576 assert(data != NULL);
3577 }
3578 else {
3579 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3580 if (!unicode)
3581 return NULL;
3582 /* When the string is ASCII only, just use memcpy and return.
3583 unicode_size may be != size if there is an incomplete UTF-8
3584 sequence at the end of the ASCII block. */
3585 if (maxchar < 128 && size == unicode_size) {
3586 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3587 return (PyObject *)unicode;
3588 }
3589 kind = PyUnicode_KIND(unicode);
3590 data = PyUnicode_DATA(unicode);
3591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003593 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003595 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596
3597 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003598 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599
3600 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003601 /* Fast path for runs of ASCII characters. Given that common UTF-8
3602 input will consist of an overwhelming majority of ASCII
3603 characters, we try to optimize for this case by checking
3604 as many characters as a C 'long' can contain.
3605 First, check if we can do an aligned read, as most CPUs have
3606 a penalty for unaligned reads.
3607 */
3608 if (!((size_t) s & LONG_PTR_MASK)) {
3609 /* Help register allocation */
3610 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003612 while (_s < aligned_end) {
3613 /* Read a whole long at a time (either 4 or 8 bytes),
3614 and do a fast unrolled copy if it only contains ASCII
3615 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003616 unsigned long value = *(unsigned long *) _s;
3617 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003618 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003619 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3620 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3621 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3622 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003623#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003624 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3625 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3626 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3627 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003628#endif
3629 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003630 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003631 }
3632 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003633 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003634 if (s == e)
3635 break;
3636 ch = (unsigned char)*s;
3637 }
3638 }
3639
3640 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003641 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 s++;
3643 continue;
3644 }
3645
3646 n = utf8_code_length[ch];
3647
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003648 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 if (consumed)
3650 break;
3651 else {
3652 errmsg = "unexpected end of data";
3653 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003654 endinpos = startinpos+1;
3655 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3656 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 goto utf8Error;
3658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660
3661 switch (n) {
3662
3663 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003664 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 startinpos = s-starts;
3666 endinpos = startinpos+1;
3667 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668
3669 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003670 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 startinpos = s-starts;
3672 endinpos = startinpos+1;
3673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674
3675 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003676 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003677 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003679 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 goto utf8Error;
3681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003683 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 break;
3686
3687 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003688 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3689 will result in surrogates in range d800-dfff. Surrogates are
3690 not valid UTF-8 so they are rejected.
3691 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3692 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00003693 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003694 (s[2] & 0xc0) != 0x80 ||
3695 ((unsigned char)s[0] == 0xE0 &&
3696 (unsigned char)s[1] < 0xA0) ||
3697 ((unsigned char)s[0] == 0xED &&
3698 (unsigned char)s[1] > 0x9F)) {
3699 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003701 endinpos = startinpos + 1;
3702
3703 /* if s[1] first two bits are 1 and 0, then the invalid
3704 continuation byte is s[2], so increment endinpos by 1,
3705 if not, s[1] is invalid and endinpos doesn't need to
3706 be incremented. */
3707 if ((s[1] & 0xC0) == 0x80)
3708 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 goto utf8Error;
3710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003712 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003714 break;
3715
3716 case 4:
3717 if ((s[1] & 0xc0) != 0x80 ||
3718 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00003719 (s[3] & 0xc0) != 0x80 ||
3720 ((unsigned char)s[0] == 0xF0 &&
3721 (unsigned char)s[1] < 0x90) ||
3722 ((unsigned char)s[0] == 0xF4 &&
3723 (unsigned char)s[1] > 0x8F)) {
3724 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003726 endinpos = startinpos + 1;
3727 if ((s[1] & 0xC0) == 0x80) {
3728 endinpos++;
3729 if ((s[2] & 0xC0) == 0x80)
3730 endinpos++;
3731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 goto utf8Error;
3733 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003734 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00003735 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3736 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 /* If the string is flexible or we have native UCS-4, write
3739 directly.. */
3740 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
3741 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00003742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 else {
3744 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00003745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003746 /* translate from 10000..10FFFF to 0..FFFF */
3747 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00003748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003749 /* high surrogate = top 10 bits added to D800 */
3750 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3751 (Py_UNICODE)(0xD800 + (ch >> 10)));
3752
3753 /* low surrogate = bottom 10 bits added to DC00 */
3754 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
3755 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
3756 }
3757#if SIZEOF_WCHAR_T == 2
3758 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003759#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 }
3762 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 /* If this is not yet a resizable string, make it one.. */
3767 if (kind != PyUnicode_WCHAR_KIND) {
3768 const Py_UNICODE *u;
3769 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
3770 if (!new_unicode)
3771 goto onError;
3772 u = PyUnicode_AsUnicode((PyObject *)unicode);
3773 if (!u)
3774 goto onError;
3775#if SIZEOF_WCHAR_T == 2
3776 i += wchar_offset;
3777#endif
3778 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
3779 Py_DECREF(unicode);
3780 unicode = new_unicode;
3781 kind = 0;
3782 data = PyUnicode_AS_UNICODE(new_unicode);
3783 assert(data != NULL);
3784 }
3785 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00003786 if (unicode_decode_call_errorhandler(
3787 errors, &errorHandler,
3788 "utf8", errmsg,
3789 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792 /* Update data because unicode_decode_call_errorhandler might have
3793 re-created or resized the unicode object. */
3794 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 /* Ensure the unicode_size calculation above was correct: */
3798 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
3799
Walter Dörwald69652032004-09-07 20:24:22 +00003800 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 /* Adjust length and ready string when it contained errors and
3804 is of the old resizable kind. */
3805 if (kind == PyUnicode_WCHAR_KIND) {
3806 if (_PyUnicode_Resize(&unicode, i) < 0 ||
3807 PyUnicode_READY(unicode) == -1)
3808 goto onError;
3809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 if (PyUnicode_READY(unicode) == -1) {
3814 Py_DECREF(unicode);
3815 return NULL;
3816 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 return (PyObject *)unicode;
3818
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 Py_XDECREF(errorHandler);
3821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 Py_DECREF(unicode);
3823 return NULL;
3824}
3825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00003827
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003828#ifdef __APPLE__
3829
3830/* Simplified UTF-8 decoder using surrogateescape error handler,
3831 used to decode the command line arguments on Mac OS X. */
3832
3833wchar_t*
3834_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
3835{
3836 int n;
3837 const char *e;
3838 wchar_t *unicode, *p;
3839
3840 /* Note: size will always be longer than the resulting Unicode
3841 character count */
3842 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
3843 PyErr_NoMemory();
3844 return NULL;
3845 }
3846 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
3847 if (!unicode)
3848 return NULL;
3849
3850 /* Unpack UTF-8 encoded data */
3851 p = unicode;
3852 e = s + size;
3853 while (s < e) {
3854 Py_UCS4 ch = (unsigned char)*s;
3855
3856 if (ch < 0x80) {
3857 *p++ = (wchar_t)ch;
3858 s++;
3859 continue;
3860 }
3861
3862 n = utf8_code_length[ch];
3863 if (s + n > e) {
3864 goto surrogateescape;
3865 }
3866
3867 switch (n) {
3868 case 0:
3869 case 1:
3870 goto surrogateescape;
3871
3872 case 2:
3873 if ((s[1] & 0xc0) != 0x80)
3874 goto surrogateescape;
3875 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
3876 assert ((ch > 0x007F) && (ch <= 0x07FF));
3877 *p++ = (wchar_t)ch;
3878 break;
3879
3880 case 3:
3881 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3882 will result in surrogates in range d800-dfff. Surrogates are
3883 not valid UTF-8 so they are rejected.
3884 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
3885 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
3886 if ((s[1] & 0xc0) != 0x80 ||
3887 (s[2] & 0xc0) != 0x80 ||
3888 ((unsigned char)s[0] == 0xE0 &&
3889 (unsigned char)s[1] < 0xA0) ||
3890 ((unsigned char)s[0] == 0xED &&
3891 (unsigned char)s[1] > 0x9F)) {
3892
3893 goto surrogateescape;
3894 }
3895 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
3896 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00003898 break;
3899
3900 case 4:
3901 if ((s[1] & 0xc0) != 0x80 ||
3902 (s[2] & 0xc0) != 0x80 ||
3903 (s[3] & 0xc0) != 0x80 ||
3904 ((unsigned char)s[0] == 0xF0 &&
3905 (unsigned char)s[1] < 0x90) ||
3906 ((unsigned char)s[0] == 0xF4 &&
3907 (unsigned char)s[1] > 0x8F)) {
3908 goto surrogateescape;
3909 }
3910 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
3911 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
3912 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
3913
3914#if SIZEOF_WCHAR_T == 4
3915 *p++ = (wchar_t)ch;
3916#else
3917 /* compute and append the two surrogates: */
3918
3919 /* translate from 10000..10FFFF to 0..FFFF */
3920 ch -= 0x10000;
3921
3922 /* high surrogate = top 10 bits added to D800 */
3923 *p++ = (wchar_t)(0xD800 + (ch >> 10));
3924
3925 /* low surrogate = bottom 10 bits added to DC00 */
3926 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
3927#endif
3928 break;
3929 }
3930 s += n;
3931 continue;
3932
3933 surrogateescape:
3934 *p++ = 0xDC00 + ch;
3935 s++;
3936 }
3937 *p = L'\0';
3938 return unicode;
3939}
3940
3941#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00003942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943/* Primary internal function which creates utf8 encoded bytes objects.
3944
3945 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00003946 and allocate exactly as much space needed at the end. Else allocate the
3947 maximum possible needed (4 result bytes per Unicode character), and return
3948 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003949*/
Tim Peters7e3d9612002-04-21 03:26:37 +00003950PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952{
Tim Peters602f7402002-04-27 18:03:26 +00003953#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00003954
Guido van Rossum98297ee2007-11-06 21:34:58 +00003955 Py_ssize_t i; /* index into s of next input byte */
3956 PyObject *result; /* result string object */
3957 char *p; /* next free byte in output buffer */
3958 Py_ssize_t nallocated; /* number of result bytes allocated */
3959 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00003960 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003961 PyObject *errorHandler = NULL;
3962 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 int kind;
3964 void *data;
3965 Py_ssize_t size;
3966 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
3967#if SIZEOF_WCHAR_T == 2
3968 Py_ssize_t wchar_offset = 0;
3969#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00003970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 if (!PyUnicode_Check(unicode)) {
3972 PyErr_BadArgument();
3973 return NULL;
3974 }
3975
3976 if (PyUnicode_READY(unicode) == -1)
3977 return NULL;
3978
3979 if (_PyUnicode_UTF8(unicode))
3980 return PyBytes_FromStringAndSize(_PyUnicode_UTF8(unicode),
3981 _PyUnicode_UTF8_LENGTH(unicode));
3982
3983 kind = PyUnicode_KIND(unicode);
3984 data = PyUnicode_DATA(unicode);
3985 size = PyUnicode_GET_LENGTH(unicode);
3986
Tim Peters602f7402002-04-27 18:03:26 +00003987 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988
Tim Peters602f7402002-04-27 18:03:26 +00003989 if (size <= MAX_SHORT_UNICHARS) {
3990 /* Write into the stack buffer; nallocated can't overflow.
3991 * At the end, we'll allocate exactly as much heap space as it
3992 * turns out we need.
3993 */
3994 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003995 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00003996 p = stackbuf;
3997 }
3998 else {
3999 /* Overallocate on the heap, and give the excess back at the end. */
4000 nallocated = size * 4;
4001 if (nallocated / 4 != size) /* overflow! */
4002 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004003 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004004 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004005 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004006 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004007 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004008
Tim Peters602f7402002-04-27 18:03:26 +00004009 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004011
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004012 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004013 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004015
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004017 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004018 *p++ = (char)(0xc0 | (ch >> 6));
4019 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004020 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 Py_ssize_t newpos;
4022 PyObject *rep;
4023 Py_ssize_t repsize, k, startpos;
4024 startpos = i-1;
4025#if SIZEOF_WCHAR_T == 2
4026 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004027#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 rep = unicode_encode_call_errorhandler(
4029 errors, &errorHandler, "utf-8", "surrogates not allowed",
4030 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4031 &exc, startpos, startpos+1, &newpos);
4032 if (!rep)
4033 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 if (PyBytes_Check(rep))
4036 repsize = PyBytes_GET_SIZE(rep);
4037 else
4038 repsize = PyUnicode_GET_SIZE(rep);
4039
4040 if (repsize > 4) {
4041 Py_ssize_t offset;
4042
4043 if (result == NULL)
4044 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004045 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4049 /* integer overflow */
4050 PyErr_NoMemory();
4051 goto error;
4052 }
4053 nallocated += repsize - 4;
4054 if (result != NULL) {
4055 if (_PyBytes_Resize(&result, nallocated) < 0)
4056 goto error;
4057 } else {
4058 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004059 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 goto error;
4061 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4062 }
4063 p = PyBytes_AS_STRING(result) + offset;
4064 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 if (PyBytes_Check(rep)) {
4067 char *prep = PyBytes_AS_STRING(rep);
4068 for(k = repsize; k > 0; k--)
4069 *p++ = *prep++;
4070 } else /* rep is unicode */ {
4071 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4072 Py_UNICODE c;
4073
4074 for(k=0; k<repsize; k++) {
4075 c = prep[k];
4076 if (0x80 <= c) {
4077 raise_encode_exception(&exc, "utf-8",
4078 PyUnicode_AS_UNICODE(unicode),
4079 size, i-1, i,
4080 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004081 goto error;
4082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004084 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004087 } else if (ch < 0x10000) {
4088 *p++ = (char)(0xe0 | (ch >> 12));
4089 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4090 *p++ = (char)(0x80 | (ch & 0x3f));
4091 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004092 /* Encode UCS4 Unicode ordinals */
4093 *p++ = (char)(0xf0 | (ch >> 18));
4094 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4095 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4096 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097#if SIZEOF_WCHAR_T == 2
4098 wchar_offset++;
4099#endif
Tim Peters602f7402002-04-27 18:03:26 +00004100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004102
Guido van Rossum98297ee2007-11-06 21:34:58 +00004103 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004104 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004105 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004106 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004107 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004108 }
4109 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004110 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004111 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004112 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004113 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004116 Py_XDECREF(errorHandler);
4117 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004118 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004119 error:
4120 Py_XDECREF(errorHandler);
4121 Py_XDECREF(exc);
4122 Py_XDECREF(result);
4123 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004124
Tim Peters602f7402002-04-27 18:03:26 +00004125#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126}
4127
Alexander Belopolsky40018472011-02-26 01:02:56 +00004128PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4130 Py_ssize_t size,
4131 const char *errors)
4132{
4133 PyObject *v, *unicode;
4134
4135 unicode = PyUnicode_FromUnicode(s, size);
4136 if (unicode == NULL)
4137 return NULL;
4138 v = _PyUnicode_AsUTF8String(unicode, errors);
4139 Py_DECREF(unicode);
4140 return v;
4141}
4142
4143PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004144PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147}
4148
Walter Dörwald41980ca2007-08-16 21:55:45 +00004149/* --- UTF-32 Codec ------------------------------------------------------- */
4150
4151PyObject *
4152PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 Py_ssize_t size,
4154 const char *errors,
4155 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004156{
4157 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4158}
4159
4160PyObject *
4161PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 Py_ssize_t size,
4163 const char *errors,
4164 int *byteorder,
4165 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004166{
4167 const char *starts = s;
4168 Py_ssize_t startinpos;
4169 Py_ssize_t endinpos;
4170 Py_ssize_t outpos;
4171 PyUnicodeObject *unicode;
4172 Py_UNICODE *p;
4173#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004174 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004175 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004176#else
4177 const int pairs = 0;
4178#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004179 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004180 int bo = 0; /* assume native ordering by default */
4181 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004182 /* Offsets from q for retrieving bytes in the right order. */
4183#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4184 int iorder[] = {0, 1, 2, 3};
4185#else
4186 int iorder[] = {3, 2, 1, 0};
4187#endif
4188 PyObject *errorHandler = NULL;
4189 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004190
Walter Dörwald41980ca2007-08-16 21:55:45 +00004191 q = (unsigned char *)s;
4192 e = q + size;
4193
4194 if (byteorder)
4195 bo = *byteorder;
4196
4197 /* Check for BOM marks (U+FEFF) in the input and adjust current
4198 byte order setting accordingly. In native mode, the leading BOM
4199 mark is skipped, in all other modes, it is copied to the output
4200 stream as-is (giving a ZWNBSP character). */
4201 if (bo == 0) {
4202 if (size >= 4) {
4203 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004205#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 if (bom == 0x0000FEFF) {
4207 q += 4;
4208 bo = -1;
4209 }
4210 else if (bom == 0xFFFE0000) {
4211 q += 4;
4212 bo = 1;
4213 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004214#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 if (bom == 0x0000FEFF) {
4216 q += 4;
4217 bo = 1;
4218 }
4219 else if (bom == 0xFFFE0000) {
4220 q += 4;
4221 bo = -1;
4222 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004223#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004225 }
4226
4227 if (bo == -1) {
4228 /* force LE */
4229 iorder[0] = 0;
4230 iorder[1] = 1;
4231 iorder[2] = 2;
4232 iorder[3] = 3;
4233 }
4234 else if (bo == 1) {
4235 /* force BE */
4236 iorder[0] = 3;
4237 iorder[1] = 2;
4238 iorder[2] = 1;
4239 iorder[3] = 0;
4240 }
4241
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004242 /* On narrow builds we split characters outside the BMP into two
4243 codepoints => count how much extra space we need. */
4244#ifndef Py_UNICODE_WIDE
4245 for (qq = q; qq < e; qq += 4)
4246 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4247 pairs++;
4248#endif
4249
4250 /* This might be one to much, because of a BOM */
4251 unicode = _PyUnicode_New((size+3)/4+pairs);
4252 if (!unicode)
4253 return NULL;
4254 if (size == 0)
4255 return (PyObject *)unicode;
4256
4257 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004258 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004259
Walter Dörwald41980ca2007-08-16 21:55:45 +00004260 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 Py_UCS4 ch;
4262 /* remaining bytes at the end? (size should be divisible by 4) */
4263 if (e-q<4) {
4264 if (consumed)
4265 break;
4266 errmsg = "truncated data";
4267 startinpos = ((const char *)q)-starts;
4268 endinpos = ((const char *)e)-starts;
4269 goto utf32Error;
4270 /* The remaining input chars are ignored if the callback
4271 chooses to skip the input */
4272 }
4273 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4274 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004275
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 if (ch >= 0x110000)
4277 {
4278 errmsg = "codepoint not in range(0x110000)";
4279 startinpos = ((const char *)q)-starts;
4280 endinpos = startinpos+4;
4281 goto utf32Error;
4282 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004283#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 if (ch >= 0x10000)
4285 {
4286 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4287 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4288 }
4289 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004290#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 *p++ = ch;
4292 q += 4;
4293 continue;
4294 utf32Error:
4295 outpos = p-PyUnicode_AS_UNICODE(unicode);
4296 if (unicode_decode_call_errorhandler(
4297 errors, &errorHandler,
4298 "utf32", errmsg,
4299 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4300 &unicode, &outpos, &p))
4301 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004302 }
4303
4304 if (byteorder)
4305 *byteorder = bo;
4306
4307 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004309
4310 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004311 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004312 goto onError;
4313
4314 Py_XDECREF(errorHandler);
4315 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316 if (PyUnicode_READY(unicode) == -1) {
4317 Py_DECREF(unicode);
4318 return NULL;
4319 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004320 return (PyObject *)unicode;
4321
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004323 Py_DECREF(unicode);
4324 Py_XDECREF(errorHandler);
4325 Py_XDECREF(exc);
4326 return NULL;
4327}
4328
4329PyObject *
4330PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 Py_ssize_t size,
4332 const char *errors,
4333 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004334{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004335 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004336 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004337 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004338#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004339 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004340#else
4341 const int pairs = 0;
4342#endif
4343 /* Offsets from p for storing byte pairs in the right order. */
4344#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4345 int iorder[] = {0, 1, 2, 3};
4346#else
4347 int iorder[] = {3, 2, 1, 0};
4348#endif
4349
Benjamin Peterson29060642009-01-31 22:14:21 +00004350#define STORECHAR(CH) \
4351 do { \
4352 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4353 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4354 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4355 p[iorder[0]] = (CH) & 0xff; \
4356 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004357 } while(0)
4358
4359 /* In narrow builds we can output surrogate pairs as one codepoint,
4360 so we need less space. */
4361#ifndef Py_UNICODE_WIDE
4362 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4364 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4365 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004366#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004367 nsize = (size - pairs + (byteorder == 0));
4368 bytesize = nsize * 4;
4369 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004371 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004372 if (v == NULL)
4373 return NULL;
4374
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004375 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004376 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004378 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004379 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004380
4381 if (byteorder == -1) {
4382 /* force LE */
4383 iorder[0] = 0;
4384 iorder[1] = 1;
4385 iorder[2] = 2;
4386 iorder[3] = 3;
4387 }
4388 else if (byteorder == 1) {
4389 /* force BE */
4390 iorder[0] = 3;
4391 iorder[1] = 2;
4392 iorder[2] = 1;
4393 iorder[3] = 0;
4394 }
4395
4396 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004398#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4400 Py_UCS4 ch2 = *s;
4401 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4402 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4403 s++;
4404 size--;
4405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004406 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004407#endif
4408 STORECHAR(ch);
4409 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004410
4411 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004412 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004413#undef STORECHAR
4414}
4415
Alexander Belopolsky40018472011-02-26 01:02:56 +00004416PyObject *
4417PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004418{
4419 if (!PyUnicode_Check(unicode)) {
4420 PyErr_BadArgument();
4421 return NULL;
4422 }
4423 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 PyUnicode_GET_SIZE(unicode),
4425 NULL,
4426 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004427}
4428
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429/* --- UTF-16 Codec ------------------------------------------------------- */
4430
Tim Peters772747b2001-08-09 22:21:55 +00004431PyObject *
4432PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_ssize_t size,
4434 const char *errors,
4435 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436{
Walter Dörwald69652032004-09-07 20:24:22 +00004437 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4438}
4439
Antoine Pitrouab868312009-01-10 15:40:25 +00004440/* Two masks for fast checking of whether a C 'long' may contain
4441 UTF16-encoded surrogate characters. This is an efficient heuristic,
4442 assuming that non-surrogate characters with a code point >= 0x8000 are
4443 rare in most input.
4444 FAST_CHAR_MASK is used when the input is in native byte ordering,
4445 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004446*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004447#if (SIZEOF_LONG == 8)
4448# define FAST_CHAR_MASK 0x8000800080008000L
4449# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4450#elif (SIZEOF_LONG == 4)
4451# define FAST_CHAR_MASK 0x80008000L
4452# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4453#else
4454# error C 'long' size should be either 4 or 8!
4455#endif
4456
Walter Dörwald69652032004-09-07 20:24:22 +00004457PyObject *
4458PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 Py_ssize_t size,
4460 const char *errors,
4461 int *byteorder,
4462 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t startinpos;
4466 Py_ssize_t endinpos;
4467 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 PyUnicodeObject *unicode;
4469 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004470 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004471 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004472 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004473 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004474 /* Offsets from q for retrieving byte pairs in the right order. */
4475#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4476 int ihi = 1, ilo = 0;
4477#else
4478 int ihi = 0, ilo = 1;
4479#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 PyObject *errorHandler = NULL;
4481 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482
4483 /* Note: size will always be longer than the resulting Unicode
4484 character count */
4485 unicode = _PyUnicode_New(size);
4486 if (!unicode)
4487 return NULL;
4488 if (size == 0)
4489 return (PyObject *)unicode;
4490
4491 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004492 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004493 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004494 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495
4496 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004497 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004499 /* Check for BOM marks (U+FEFF) in the input and adjust current
4500 byte order setting accordingly. In native mode, the leading BOM
4501 mark is skipped, in all other modes, it is copied to the output
4502 stream as-is (giving a ZWNBSP character). */
4503 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004504 if (size >= 2) {
4505 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004506#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 if (bom == 0xFEFF) {
4508 q += 2;
4509 bo = -1;
4510 }
4511 else if (bom == 0xFFFE) {
4512 q += 2;
4513 bo = 1;
4514 }
Tim Petersced69f82003-09-16 20:30:58 +00004515#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 if (bom == 0xFEFF) {
4517 q += 2;
4518 bo = 1;
4519 }
4520 else if (bom == 0xFFFE) {
4521 q += 2;
4522 bo = -1;
4523 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004524#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527
Tim Peters772747b2001-08-09 22:21:55 +00004528 if (bo == -1) {
4529 /* force LE */
4530 ihi = 1;
4531 ilo = 0;
4532 }
4533 else if (bo == 1) {
4534 /* force BE */
4535 ihi = 0;
4536 ilo = 1;
4537 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004538#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4539 native_ordering = ilo < ihi;
4540#else
4541 native_ordering = ilo > ihi;
4542#endif
Tim Peters772747b2001-08-09 22:21:55 +00004543
Antoine Pitrouab868312009-01-10 15:40:25 +00004544 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004545 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004547 /* First check for possible aligned read of a C 'long'. Unaligned
4548 reads are more expensive, better to defer to another iteration. */
4549 if (!((size_t) q & LONG_PTR_MASK)) {
4550 /* Fast path for runs of non-surrogate chars. */
4551 register const unsigned char *_q = q;
4552 Py_UNICODE *_p = p;
4553 if (native_ordering) {
4554 /* Native ordering is simple: as long as the input cannot
4555 possibly contain a surrogate char, do an unrolled copy
4556 of several 16-bit code points to the target object.
4557 The non-surrogate check is done on several input bytes
4558 at a time (as many as a C 'long' can contain). */
4559 while (_q < aligned_end) {
4560 unsigned long data = * (unsigned long *) _q;
4561 if (data & FAST_CHAR_MASK)
4562 break;
4563 _p[0] = ((unsigned short *) _q)[0];
4564 _p[1] = ((unsigned short *) _q)[1];
4565#if (SIZEOF_LONG == 8)
4566 _p[2] = ((unsigned short *) _q)[2];
4567 _p[3] = ((unsigned short *) _q)[3];
4568#endif
4569 _q += SIZEOF_LONG;
4570 _p += SIZEOF_LONG / 2;
4571 }
4572 }
4573 else {
4574 /* Byteswapped ordering is similar, but we must decompose
4575 the copy bytewise, and take care of zero'ing out the
4576 upper bytes if the target object is in 32-bit units
4577 (that is, in UCS-4 builds). */
4578 while (_q < aligned_end) {
4579 unsigned long data = * (unsigned long *) _q;
4580 if (data & SWAPPED_FAST_CHAR_MASK)
4581 break;
4582 /* Zero upper bytes in UCS-4 builds */
4583#if (Py_UNICODE_SIZE > 2)
4584 _p[0] = 0;
4585 _p[1] = 0;
4586#if (SIZEOF_LONG == 8)
4587 _p[2] = 0;
4588 _p[3] = 0;
4589#endif
4590#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004591 /* Issue #4916; UCS-4 builds on big endian machines must
4592 fill the two last bytes of each 4-byte unit. */
4593#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4594# define OFF 2
4595#else
4596# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004597#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004598 ((unsigned char *) _p)[OFF + 1] = _q[0];
4599 ((unsigned char *) _p)[OFF + 0] = _q[1];
4600 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4601 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4602#if (SIZEOF_LONG == 8)
4603 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4604 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4605 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4606 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4607#endif
4608#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004609 _q += SIZEOF_LONG;
4610 _p += SIZEOF_LONG / 2;
4611 }
4612 }
4613 p = _p;
4614 q = _q;
4615 if (q >= e)
4616 break;
4617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619
Benjamin Peterson14339b62009-01-31 16:36:08 +00004620 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004621
4622 if (ch < 0xD800 || ch > 0xDFFF) {
4623 *p++ = ch;
4624 continue;
4625 }
4626
4627 /* UTF-16 code pair: */
4628 if (q > e) {
4629 errmsg = "unexpected end of data";
4630 startinpos = (((const char *)q) - 2) - starts;
4631 endinpos = ((const char *)e) + 1 - starts;
4632 goto utf16Error;
4633 }
4634 if (0xD800 <= ch && ch <= 0xDBFF) {
4635 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4636 q += 2;
4637 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004638#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 *p++ = ch;
4640 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004641#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004643#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 continue;
4645 }
4646 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004647 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 startinpos = (((const char *)q)-4)-starts;
4649 endinpos = startinpos+2;
4650 goto utf16Error;
4651 }
4652
Benjamin Peterson14339b62009-01-31 16:36:08 +00004653 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 errmsg = "illegal encoding";
4655 startinpos = (((const char *)q)-2)-starts;
4656 endinpos = startinpos+2;
4657 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004658
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 utf16Error:
4660 outpos = p - PyUnicode_AS_UNICODE(unicode);
4661 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004662 errors,
4663 &errorHandler,
4664 "utf16", errmsg,
4665 &starts,
4666 (const char **)&e,
4667 &startinpos,
4668 &endinpos,
4669 &exc,
4670 (const char **)&q,
4671 &unicode,
4672 &outpos,
4673 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004676 /* remaining byte at the end? (size should be even) */
4677 if (e == q) {
4678 if (!consumed) {
4679 errmsg = "truncated data";
4680 startinpos = ((const char *)q) - starts;
4681 endinpos = ((const char *)e) + 1 - starts;
4682 outpos = p - PyUnicode_AS_UNICODE(unicode);
4683 if (unicode_decode_call_errorhandler(
4684 errors,
4685 &errorHandler,
4686 "utf16", errmsg,
4687 &starts,
4688 (const char **)&e,
4689 &startinpos,
4690 &endinpos,
4691 &exc,
4692 (const char **)&q,
4693 &unicode,
4694 &outpos,
4695 &p))
4696 goto onError;
4697 /* The remaining input chars are ignored if the callback
4698 chooses to skip the input */
4699 }
4700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701
4702 if (byteorder)
4703 *byteorder = bo;
4704
Walter Dörwald69652032004-09-07 20:24:22 +00004705 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00004707
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 /* Adjust length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004709 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 goto onError;
4711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 Py_XDECREF(errorHandler);
4713 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004714 if (PyUnicode_READY(unicode) == -1) {
4715 Py_DECREF(unicode);
4716 return NULL;
4717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 return (PyObject *)unicode;
4719
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 return NULL;
4725}
4726
Antoine Pitrouab868312009-01-10 15:40:25 +00004727#undef FAST_CHAR_MASK
4728#undef SWAPPED_FAST_CHAR_MASK
4729
Tim Peters772747b2001-08-09 22:21:55 +00004730PyObject *
4731PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 Py_ssize_t size,
4733 const char *errors,
4734 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004736 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00004737 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004738 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004739#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004740 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004741#else
4742 const int pairs = 0;
4743#endif
Tim Peters772747b2001-08-09 22:21:55 +00004744 /* Offsets from p for storing byte pairs in the right order. */
4745#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4746 int ihi = 1, ilo = 0;
4747#else
4748 int ihi = 0, ilo = 1;
4749#endif
4750
Benjamin Peterson29060642009-01-31 22:14:21 +00004751#define STORECHAR(CH) \
4752 do { \
4753 p[ihi] = ((CH) >> 8) & 0xff; \
4754 p[ilo] = (CH) & 0xff; \
4755 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00004756 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004758#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004759 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 if (s[i] >= 0x10000)
4761 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004762#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004763 /* 2 * (size + pairs + (byteorder == 0)) */
4764 if (size > PY_SSIZE_T_MAX ||
4765 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004767 nsize = size + pairs + (byteorder == 0);
4768 bytesize = nsize * 2;
4769 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004771 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 if (v == NULL)
4773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004775 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004778 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004779 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00004780
4781 if (byteorder == -1) {
4782 /* force LE */
4783 ihi = 1;
4784 ilo = 0;
4785 }
4786 else if (byteorder == 1) {
4787 /* force BE */
4788 ihi = 0;
4789 ilo = 1;
4790 }
4791
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004792 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 Py_UNICODE ch = *s++;
4794 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004795#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 if (ch >= 0x10000) {
4797 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
4798 ch = 0xD800 | ((ch-0x10000) >> 10);
4799 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004800#endif
Tim Peters772747b2001-08-09 22:21:55 +00004801 STORECHAR(ch);
4802 if (ch2)
4803 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004804 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004805
4806 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 return v;
Tim Peters772747b2001-08-09 22:21:55 +00004808#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809}
4810
Alexander Belopolsky40018472011-02-26 01:02:56 +00004811PyObject *
4812PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
4814 if (!PyUnicode_Check(unicode)) {
4815 PyErr_BadArgument();
4816 return NULL;
4817 }
4818 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 PyUnicode_GET_SIZE(unicode),
4820 NULL,
4821 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822}
4823
4824/* --- Unicode Escape Codec ----------------------------------------------- */
4825
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input still decodes to a pure ASCII string.

   Returns the number of characters the decoded string will hold, or -1
   when that cannot be guaranteed:
     - size is negative,
     - the input contains a byte above 127,
     - the input ends right after a backslash,
     - an octal, x, u, U or N escape is present (their result may fall
       outside the ASCII range).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before it takes part in any pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
4880
/* Similar to PyUnicode_WRITE but either write into the wstr field
   or treat the string as ASCII: when kind is PyUnicode_WCHAR_KIND the
   buffer is indexed as Py_UNICODE units, otherwise as single bytes and
   value is converted to unsigned char. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value into buf indexed as Py_UNICODE units.
   NOTE: the assertion reads a variable named `kind` from the scope in
   which the macro is expanded; it is not a macro parameter.
   The whole expansion is parenthesized so the comma expression stays a
   single operand wherever the macro is used. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
4894
4895
Fredrik Lundh06d12682001-01-24 07:59:11 +00004896static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00004897
Alexander Belopolsky40018472011-02-26 01:02:56 +00004898PyObject *
4899PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004900 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02004901 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004904 Py_ssize_t startinpos;
4905 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004906 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004908 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004910 char* message;
4911 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 PyObject *errorHandler = NULL;
4913 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004914 Py_ssize_t ascii_length;
4915 Py_ssize_t i;
4916 int kind;
4917 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004919 ascii_length = length_of_escaped_ascii_string(s, size);
4920
4921 /* After length_of_escaped_ascii_string() there are two alternatives,
4922 either the string is pure ASCII with named escapes like \n, etc.
4923 and we determined it's exact size (common case)
4924 or it contains \x, \u, ... escape sequences. then we create a
4925 legacy wchar string and resize it at the end of this function. */
4926 if (ascii_length >= 0) {
4927 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
4928 if (!v)
4929 goto onError;
4930 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
4931 kind = PyUnicode_1BYTE_KIND;
4932 data = PyUnicode_DATA(v);
4933 }
4934 else {
4935 /* Escaped strings will always be longer than the resulting
4936 Unicode string, so we start with size here and then reduce the
4937 length after conversion to the true value.
4938 (but if the error callback returns a long replacement string
4939 we'll have to allocate more space) */
4940 v = _PyUnicode_New(size);
4941 if (!v)
4942 goto onError;
4943 kind = PyUnicode_WCHAR_KIND;
4944 data = PyUnicode_AS_UNICODE(v);
4945 }
4946
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 if (size == 0)
4948 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00004951
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 while (s < end) {
4953 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00004954 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004957 if (kind == PyUnicode_WCHAR_KIND) {
4958 assert(i < _PyUnicode_WSTR_LENGTH(v));
4959 }
4960 else {
4961 /* The only case in which i == ascii_length is a backslash
4962 followed by a newline. */
4963 assert(i <= ascii_length);
4964 }
4965
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966 /* Non-escape characters are interpreted as Unicode ordinals */
4967 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004968 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 continue;
4970 }
4971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004972 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 /* \ - Escapes */
4974 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004975 c = *s++;
4976 if (s > end)
4977 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004978
4979 if (kind == PyUnicode_WCHAR_KIND) {
4980 assert(i < _PyUnicode_WSTR_LENGTH(v));
4981 }
4982 else {
4983 /* The only case in which i == ascii_length is a backslash
4984 followed by a newline. */
4985 assert(i < ascii_length || (i == ascii_length && c == '\n'));
4986 }
4987
Guido van Rossum8ce8a782007-11-01 19:42:39 +00004988 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004992 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
4993 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
4994 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
4995 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
4996 /* FF */
4997 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
4998 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
4999 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5000 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5001 /* VT */
5002 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5003 /* BEL, not classic C */
5004 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005007 case '0': case '1': case '2': case '3':
5008 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005009 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005010 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005011 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005012 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005013 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005015 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 break;
5017
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 /* hex escapes */
5019 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005021 digits = 2;
5022 message = "truncated \\xXX escape";
5023 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005027 digits = 4;
5028 message = "truncated \\uXXXX escape";
5029 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005032 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005033 digits = 8;
5034 message = "truncated \\UXXXXXXXX escape";
5035 hexescape:
5036 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 if (s+digits>end) {
5039 endinpos = size;
5040 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 errors, &errorHandler,
5042 "unicodeescape", "end of string in escape sequence",
5043 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005046 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 goto nextByte;
5048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005049 for (j = 0; j < digits; ++j) {
5050 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005051 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052 endinpos = (s+j+1)-starts;
5053 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 errors, &errorHandler,
5056 "unicodeescape", message,
5057 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005058 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005059 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005060 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005062 }
5063 chr = (chr<<4) & ~0xF;
5064 if (c >= '0' && c <= '9')
5065 chr += c - '0';
5066 else if (c >= 'a' && c <= 'f')
5067 chr += 10 + c - 'a';
5068 else
5069 chr += 10 + c - 'A';
5070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005072 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005073 /* _decoding_error will have already written into the
5074 target buffer. */
5075 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005076 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005077 /* when we get here, chr is a 32-bit unicode character */
5078 if (chr <= 0xffff)
5079 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005081 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005082 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005083 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005084#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005085 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005086#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005087 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5089 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005090#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005091 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005093 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005094 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 errors, &errorHandler,
5096 "unicodeescape", "illegal Unicode character",
5097 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005098 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005099 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005100 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005101 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005102 break;
5103
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005105 case 'N':
5106 message = "malformed \\N character escape";
5107 if (ucnhash_CAPI == NULL) {
5108 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005109 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5110 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005111 if (ucnhash_CAPI == NULL)
5112 goto ucnhashError;
5113 }
5114 if (*s == '{') {
5115 const char *start = s+1;
5116 /* look for the closing brace */
5117 while (*s != '}' && s < end)
5118 s++;
5119 if (s > start && s < end && *s == '}') {
5120 /* found a name. look it up in the unicode database */
5121 message = "unknown Unicode character name";
5122 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005123 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5124 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005125 goto store;
5126 }
5127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005129 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005130 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 errors, &errorHandler,
5132 "unicodeescape", message,
5133 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005134 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005135 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005137 break;
5138
5139 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005140 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 message = "\\ at end of string";
5143 s--;
5144 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005145 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 errors, &errorHandler,
5148 "unicodeescape", message,
5149 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005151 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005152 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005153 }
5154 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5156 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005157 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005158 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005161 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163 /* Ensure the length prediction worked in case of ASCII strings */
5164 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5165
5166 if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 ||
5167 PyUnicode_READY(v) == -1))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005169 Py_XDECREF(errorHandler);
5170 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005172
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005174 PyErr_SetString(
5175 PyExc_UnicodeError,
5176 "\\N escapes not supported (can't load unicodedata module)"
5177 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005178 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005179 Py_XDECREF(errorHandler);
5180 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005181 return NULL;
5182
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 Py_XDECREF(errorHandler);
5186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 return NULL;
5188}
5189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005190#undef WRITE_ASCII_OR_WSTR
5191#undef WRITE_WSTR
5192
/* Return a Unicode-Escape string version of the Unicode object.

   Note: earlier wording described a `quotes` flag that enclosed the
   result in u"" or u'' quotes as appropriate.  The function below takes
   only the buffer and its size, so no such flag exists here.

*/
5199
/* Lowercase hexadecimal digit characters; the escape encoder below indexes
   this table with a value masked to four bits. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00005200static const char *hexdigits = "0123456789abcdef";
5201
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyObject *
5203PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005204 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005206 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005209#ifdef Py_UNICODE_WIDE
5210 const Py_ssize_t expandsize = 10;
5211#else
5212 const Py_ssize_t expandsize = 6;
5213#endif
5214
Thomas Wouters89f507f2006-12-13 04:49:30 +00005215 /* XXX(nnorwitz): rather than over-allocating, it would be
5216 better to choose a different scheme. Perhaps scan the
5217 first N-chars of the string and allocate based on that size.
5218 */
5219 /* Initial allocation is based on the longest-possible unichr
5220 escape.
5221
5222 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5223 unichr, so in this case it's the longest unichr escape. In
5224 narrow (UTF-16) builds this is five chars per source unichr
5225 since there are two unichrs in the surrogate pair, so in narrow
5226 (UTF-16) builds it's not the longest unichr escape.
5227
5228 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5229 so in the narrow (UTF-16) build case it's the longest unichr
5230 escape.
5231 */
5232
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005233 if (size == 0)
5234 return PyBytes_FromStringAndSize(NULL, 0);
5235
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005236 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005238
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005239 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 2
5241 + expandsize*size
5242 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 if (repr == NULL)
5244 return NULL;
5245
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005246 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248 while (size-- > 0) {
5249 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005250
Walter Dörwald79e913e2007-05-12 11:08:06 +00005251 /* Escape backslashes */
5252 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 *p++ = '\\';
5254 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005255 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005256 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005257
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005258#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005259 /* Map 21-bit characters to '\U00xxxxxx' */
5260 else if (ch >= 0x10000) {
5261 *p++ = '\\';
5262 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005263 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5264 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5265 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5266 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5267 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5268 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5269 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5270 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005272 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005273#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5275 else if (ch >= 0xD800 && ch < 0xDC00) {
5276 Py_UNICODE ch2;
5277 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005278
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 ch2 = *s++;
5280 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005281 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5283 *p++ = '\\';
5284 *p++ = 'U';
5285 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5286 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5287 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5288 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5289 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5290 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5291 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5292 *p++ = hexdigits[ucs & 0x0000000F];
5293 continue;
5294 }
5295 /* Fall through: isolated surrogates are copied as-is */
5296 s--;
5297 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005299#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005300
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005302 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 *p++ = '\\';
5304 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005305 *p++ = hexdigits[(ch >> 12) & 0x000F];
5306 *p++ = hexdigits[(ch >> 8) & 0x000F];
5307 *p++ = hexdigits[(ch >> 4) & 0x000F];
5308 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005310
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005311 /* Map special whitespace to '\t', \n', '\r' */
5312 else if (ch == '\t') {
5313 *p++ = '\\';
5314 *p++ = 't';
5315 }
5316 else if (ch == '\n') {
5317 *p++ = '\\';
5318 *p++ = 'n';
5319 }
5320 else if (ch == '\r') {
5321 *p++ = '\\';
5322 *p++ = 'r';
5323 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005324
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005325 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005326 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005328 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005329 *p++ = hexdigits[(ch >> 4) & 0x000F];
5330 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005331 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 /* Copy everything else as-is */
5334 else
5335 *p++ = (char) ch;
5336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005338 assert(p - PyBytes_AS_STRING(repr) > 0);
5339 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5340 return NULL;
5341 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342}
5343
Alexander Belopolsky40018472011-02-26 01:02:56 +00005344PyObject *
5345PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005347 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 if (!PyUnicode_Check(unicode)) {
5349 PyErr_BadArgument();
5350 return NULL;
5351 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005352 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5353 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005354 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355}
5356
5357/* --- Raw Unicode Escape Codec ------------------------------------------- */
5358
Alexander Belopolsky40018472011-02-26 01:02:56 +00005359PyObject *
5360PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005361 Py_ssize_t size,
5362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005365 Py_ssize_t startinpos;
5366 Py_ssize_t endinpos;
5367 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 const char *end;
5371 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 PyObject *errorHandler = NULL;
5373 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005374
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 /* Escaped strings will always be longer than the resulting
5376 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 length after conversion to the true value. (But decoding error
5378 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 v = _PyUnicode_New(size);
5380 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 end = s + size;
5386 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 unsigned char c;
5388 Py_UCS4 x;
5389 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005390 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 /* Non-escape characters are interpreted as Unicode ordinals */
5393 if (*s != '\\') {
5394 *p++ = (unsigned char)*s++;
5395 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 startinpos = s-starts;
5398
5399 /* \u-escapes are only interpreted iff the number of leading
5400 backslashes if odd */
5401 bs = s;
5402 for (;s < end;) {
5403 if (*s != '\\')
5404 break;
5405 *p++ = (unsigned char)*s++;
5406 }
5407 if (((s - bs) & 1) == 0 ||
5408 s >= end ||
5409 (*s != 'u' && *s != 'U')) {
5410 continue;
5411 }
5412 p--;
5413 count = *s=='u' ? 4 : 8;
5414 s++;
5415
5416 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5417 outpos = p-PyUnicode_AS_UNICODE(v);
5418 for (x = 0, i = 0; i < count; ++i, ++s) {
5419 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005420 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 endinpos = s-starts;
5422 if (unicode_decode_call_errorhandler(
5423 errors, &errorHandler,
5424 "rawunicodeescape", "truncated \\uXXXX",
5425 &starts, &end, &startinpos, &endinpos, &exc, &s,
5426 &v, &outpos, &p))
5427 goto onError;
5428 goto nextByte;
5429 }
5430 x = (x<<4) & ~0xF;
5431 if (c >= '0' && c <= '9')
5432 x += c - '0';
5433 else if (c >= 'a' && c <= 'f')
5434 x += 10 + c - 'a';
5435 else
5436 x += 10 + c - 'A';
5437 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005438 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 /* UCS-2 character */
5440 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005441 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 /* UCS-4 character. Either store directly, or as
5443 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005444#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005446#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 x -= 0x10000L;
5448 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5449 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005450#endif
5451 } else {
5452 endinpos = s-starts;
5453 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005454 if (unicode_decode_call_errorhandler(
5455 errors, &errorHandler,
5456 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 &starts, &end, &startinpos, &endinpos, &exc, &s,
5458 &v, &outpos, &p))
5459 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005460 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 nextByte:
5462 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005464 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 if (PyUnicode_READY(v) == -1) {
5469 Py_DECREF(v);
5470 return NULL;
5471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005473
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 Py_XDECREF(errorHandler);
5477 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 return NULL;
5479}
5480
Alexander Belopolsky40018472011-02-26 01:02:56 +00005481PyObject *
5482PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005483 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005485 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 char *p;
5487 char *q;
5488
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005489#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005490 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005491#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005492 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005493#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005494
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005495 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005497
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005498 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 if (repr == NULL)
5500 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005501 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005504 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 while (size-- > 0) {
5506 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005507#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 /* Map 32-bit characters to '\Uxxxxxxxx' */
5509 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005510 *p++ = '\\';
5511 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005512 *p++ = hexdigits[(ch >> 28) & 0xf];
5513 *p++ = hexdigits[(ch >> 24) & 0xf];
5514 *p++ = hexdigits[(ch >> 20) & 0xf];
5515 *p++ = hexdigits[(ch >> 16) & 0xf];
5516 *p++ = hexdigits[(ch >> 12) & 0xf];
5517 *p++ = hexdigits[(ch >> 8) & 0xf];
5518 *p++ = hexdigits[(ch >> 4) & 0xf];
5519 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005520 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005521 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005522#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5524 if (ch >= 0xD800 && ch < 0xDC00) {
5525 Py_UNICODE ch2;
5526 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005527
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 ch2 = *s++;
5529 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005530 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5532 *p++ = '\\';
5533 *p++ = 'U';
5534 *p++ = hexdigits[(ucs >> 28) & 0xf];
5535 *p++ = hexdigits[(ucs >> 24) & 0xf];
5536 *p++ = hexdigits[(ucs >> 20) & 0xf];
5537 *p++ = hexdigits[(ucs >> 16) & 0xf];
5538 *p++ = hexdigits[(ucs >> 12) & 0xf];
5539 *p++ = hexdigits[(ucs >> 8) & 0xf];
5540 *p++ = hexdigits[(ucs >> 4) & 0xf];
5541 *p++ = hexdigits[ucs & 0xf];
5542 continue;
5543 }
5544 /* Fall through: isolated surrogates are copied as-is */
5545 s--;
5546 size++;
5547 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005548#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 /* Map 16-bit characters to '\uxxxx' */
5550 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 *p++ = '\\';
5552 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005553 *p++ = hexdigits[(ch >> 12) & 0xf];
5554 *p++ = hexdigits[(ch >> 8) & 0xf];
5555 *p++ = hexdigits[(ch >> 4) & 0xf];
5556 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* Copy everything else as-is */
5559 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 *p++ = (char) ch;
5561 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005562 size = p - q;
5563
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005564 assert(size > 0);
5565 if (_PyBytes_Resize(&repr, size) < 0)
5566 return NULL;
5567 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568}
5569
Alexander Belopolsky40018472011-02-26 01:02:56 +00005570PyObject *
5571PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005573 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005575 PyErr_BadArgument();
5576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005578 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5579 PyUnicode_GET_SIZE(unicode));
5580
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005581 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582}
5583
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005584/* --- Unicode Internal Codec ------------------------------------------- */
5585
Alexander Belopolsky40018472011-02-26 01:02:56 +00005586PyObject *
5587_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005588 Py_ssize_t size,
5589 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005590{
5591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005592 Py_ssize_t startinpos;
5593 Py_ssize_t endinpos;
5594 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005595 PyUnicodeObject *v;
5596 Py_UNICODE *p;
5597 const char *end;
5598 const char *reason;
5599 PyObject *errorHandler = NULL;
5600 PyObject *exc = NULL;
5601
Neal Norwitzd43069c2006-01-08 01:12:10 +00005602#ifdef Py_UNICODE_WIDE
5603 Py_UNICODE unimax = PyUnicode_GetMax();
5604#endif
5605
Thomas Wouters89f507f2006-12-13 04:49:30 +00005606 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005607 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5608 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5611 as string was created with the old API. */
5612 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005614 p = PyUnicode_AS_UNICODE(v);
5615 end = s + size;
5616
5617 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005618 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005619 /* We have to sanity check the raw data, otherwise doom looms for
5620 some malformed UCS-4 data. */
5621 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005622#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005623 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005624#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005625 end-s < Py_UNICODE_SIZE
5626 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005628 startinpos = s - starts;
5629 if (end-s < Py_UNICODE_SIZE) {
5630 endinpos = end-starts;
5631 reason = "truncated input";
5632 }
5633 else {
5634 endinpos = s - starts + Py_UNICODE_SIZE;
5635 reason = "illegal code point (> 0x10FFFF)";
5636 }
5637 outpos = p - PyUnicode_AS_UNICODE(v);
5638 if (unicode_decode_call_errorhandler(
5639 errors, &errorHandler,
5640 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005641 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005642 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005643 goto onError;
5644 }
5645 }
5646 else {
5647 p++;
5648 s += Py_UNICODE_SIZE;
5649 }
5650 }
5651
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005652 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005653 goto onError;
5654 Py_XDECREF(errorHandler);
5655 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 if (PyUnicode_READY(v) == -1) {
5657 Py_DECREF(v);
5658 return NULL;
5659 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005660 return (PyObject *)v;
5661
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005663 Py_XDECREF(v);
5664 Py_XDECREF(errorHandler);
5665 Py_XDECREF(exc);
5666 return NULL;
5667}
5668
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669/* --- Latin-1 Codec ------------------------------------------------------ */
5670
Alexander Belopolsky40018472011-02-26 01:02:56 +00005671PyObject *
5672PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005673 Py_ssize_t size,
5674 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005677 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678}
5679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681static void
5682make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 const char *encoding,
5684 const Py_UNICODE *unicode, Py_ssize_t size,
5685 Py_ssize_t startpos, Py_ssize_t endpos,
5686 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 *exceptionObject = PyUnicodeEncodeError_Create(
5690 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
5694 goto onError;
5695 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
5696 goto onError;
5697 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
5698 goto onError;
5699 return;
5700 onError:
5701 Py_DECREF(*exceptionObject);
5702 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
5704}
5705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005707static void
5708raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005709 const char *encoding,
5710 const Py_UNICODE *unicode, Py_ssize_t size,
5711 Py_ssize_t startpos, Py_ssize_t endpos,
5712 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713{
5714 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718}
5719
5720/* error handling callback helper:
5721 build arguments, call the callback and check the arguments,
5722 put the result into newpos and return the replacement string, which
5723 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005724static PyObject *
5725unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005726 PyObject **errorHandler,
5727 const char *encoding, const char *reason,
5728 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5729 Py_ssize_t startpos, Py_ssize_t endpos,
5730 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005732 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733
5734 PyObject *restuple;
5735 PyObject *resunicode;
5736
5737 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 }
5742
5743 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747
5748 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005753 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 Py_DECREF(restuple);
5755 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005757 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 &resunicode, newpos)) {
5759 Py_DECREF(restuple);
5760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005762 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
5763 PyErr_SetString(PyExc_TypeError, &argparse[3]);
5764 Py_DECREF(restuple);
5765 return NULL;
5766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005769 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5771 Py_DECREF(restuple);
5772 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005773 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 Py_INCREF(resunicode);
5775 Py_DECREF(restuple);
5776 return resunicode;
5777}
5778
Alexander Belopolsky40018472011-02-26 01:02:56 +00005779static PyObject *
5780unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005781 Py_ssize_t size,
5782 const char *errors,
5783 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784{
5785 /* output object */
5786 PyObject *res;
5787 /* pointers to the beginning and end+1 of input */
5788 const Py_UNICODE *startp = p;
5789 const Py_UNICODE *endp = p + size;
5790 /* pointer to the beginning of the unencodable characters */
5791 /* const Py_UNICODE *badp = NULL; */
5792 /* pointer into the output */
5793 char *str;
5794 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005796 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
5797 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 PyObject *errorHandler = NULL;
5799 PyObject *exc = NULL;
5800 /* the following variable is used for caching string comparisons
5801 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5802 int known_errorHandler = -1;
5803
5804 /* allocate enough for a simple encoding without
5805 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005806 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00005807 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005808 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005810 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005811 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 ressize = size;
5813
5814 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 /* can we encode this? */
5818 if (c<limit) {
5819 /* no overflow check, because we know that the space is enough */
5820 *str++ = (char)c;
5821 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 else {
5824 Py_ssize_t unicodepos = p-startp;
5825 Py_ssize_t requiredsize;
5826 PyObject *repunicode;
5827 Py_ssize_t repsize;
5828 Py_ssize_t newpos;
5829 Py_ssize_t respos;
5830 Py_UNICODE *uni2;
5831 /* startpos for collecting unencodable chars */
5832 const Py_UNICODE *collstart = p;
5833 const Py_UNICODE *collend = p;
5834 /* find all unecodable characters */
5835 while ((collend < endp) && ((*collend)>=limit))
5836 ++collend;
5837 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
5838 if (known_errorHandler==-1) {
5839 if ((errors==NULL) || (!strcmp(errors, "strict")))
5840 known_errorHandler = 1;
5841 else if (!strcmp(errors, "replace"))
5842 known_errorHandler = 2;
5843 else if (!strcmp(errors, "ignore"))
5844 known_errorHandler = 3;
5845 else if (!strcmp(errors, "xmlcharrefreplace"))
5846 known_errorHandler = 4;
5847 else
5848 known_errorHandler = 0;
5849 }
5850 switch (known_errorHandler) {
5851 case 1: /* strict */
5852 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
5853 goto onError;
5854 case 2: /* replace */
5855 while (collstart++<collend)
5856 *str++ = '?'; /* fall through */
5857 case 3: /* ignore */
5858 p = collend;
5859 break;
5860 case 4: /* xmlcharrefreplace */
5861 respos = str - PyBytes_AS_STRING(res);
5862 /* determine replacement size (temporarily (mis)uses p) */
5863 for (p = collstart, repsize = 0; p < collend; ++p) {
5864 if (*p<10)
5865 repsize += 2+1+1;
5866 else if (*p<100)
5867 repsize += 2+2+1;
5868 else if (*p<1000)
5869 repsize += 2+3+1;
5870 else if (*p<10000)
5871 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005872#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 else
5874 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00005875#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 else if (*p<100000)
5877 repsize += 2+5+1;
5878 else if (*p<1000000)
5879 repsize += 2+6+1;
5880 else
5881 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005882#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 }
5884 requiredsize = respos+repsize+(endp-collend);
5885 if (requiredsize > ressize) {
5886 if (requiredsize<2*ressize)
5887 requiredsize = 2*ressize;
5888 if (_PyBytes_Resize(&res, requiredsize))
5889 goto onError;
5890 str = PyBytes_AS_STRING(res) + respos;
5891 ressize = requiredsize;
5892 }
5893 /* generate replacement (temporarily (mis)uses p) */
5894 for (p = collstart; p < collend; ++p) {
5895 str += sprintf(str, "&#%d;", (int)*p);
5896 }
5897 p = collend;
5898 break;
5899 default:
5900 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5901 encoding, reason, startp, size, &exc,
5902 collstart-startp, collend-startp, &newpos);
5903 if (repunicode == NULL)
5904 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005905 if (PyBytes_Check(repunicode)) {
5906 /* Directly copy bytes result to output. */
5907 repsize = PyBytes_Size(repunicode);
5908 if (repsize > 1) {
5909 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005910 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005911 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
5912 Py_DECREF(repunicode);
5913 goto onError;
5914 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00005915 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005916 ressize += repsize-1;
5917 }
5918 memcpy(str, PyBytes_AsString(repunicode), repsize);
5919 str += repsize;
5920 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005921 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005922 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005923 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 /* need more space? (at least enough for what we
5925 have+the replacement+the rest of the string, so
5926 we won't have to check space for encodable characters) */
5927 respos = str - PyBytes_AS_STRING(res);
5928 repsize = PyUnicode_GET_SIZE(repunicode);
5929 requiredsize = respos+repsize+(endp-collend);
5930 if (requiredsize > ressize) {
5931 if (requiredsize<2*ressize)
5932 requiredsize = 2*ressize;
5933 if (_PyBytes_Resize(&res, requiredsize)) {
5934 Py_DECREF(repunicode);
5935 goto onError;
5936 }
5937 str = PyBytes_AS_STRING(res) + respos;
5938 ressize = requiredsize;
5939 }
5940 /* check if there is anything unencodable in the replacement
5941 and copy it to the output */
5942 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
5943 c = *uni2;
5944 if (c >= limit) {
5945 raise_encode_exception(&exc, encoding, startp, size,
5946 unicodepos, unicodepos+1, reason);
5947 Py_DECREF(repunicode);
5948 goto onError;
5949 }
5950 *str = (char)c;
5951 }
5952 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005953 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005954 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005955 }
5956 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005957 /* Resize if we allocated to much */
5958 size = str - PyBytes_AS_STRING(res);
5959 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00005960 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005961 if (_PyBytes_Resize(&res, size) < 0)
5962 goto onError;
5963 }
5964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 Py_XDECREF(errorHandler);
5966 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 return res;
5968
5969 onError:
5970 Py_XDECREF(res);
5971 Py_XDECREF(errorHandler);
5972 Py_XDECREF(exc);
5973 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974}
5975
Alexander Belopolsky40018472011-02-26 01:02:56 +00005976PyObject *
5977PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005978 Py_ssize_t size,
5979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982}
5983
Alexander Belopolsky40018472011-02-26 01:02:56 +00005984PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986{
5987 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 PyErr_BadArgument();
5989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005991 if (PyUnicode_READY(unicode) == -1)
5992 return NULL;
5993 /* Fast path: if it is a one-byte string, construct
5994 bytes object directly. */
5995 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
5996 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
5997 PyUnicode_GET_LENGTH(unicode));
5998 /* Non-Latin-1 characters present. Defer to above function to
5999 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006002 errors);
6003}
6004
6005PyObject*
6006PyUnicode_AsLatin1String(PyObject *unicode)
6007{
6008 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009}
6010
6011/* --- 7-bit ASCII Codec -------------------------------------------------- */
6012
Alexander Belopolsky40018472011-02-26 01:02:56 +00006013PyObject *
6014PyUnicode_DecodeASCII(const char *s,
6015 Py_ssize_t size,
6016 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 PyUnicodeObject *v;
6020 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006021 Py_ssize_t startinpos;
6022 Py_ssize_t endinpos;
6023 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006025 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 PyObject *errorHandler = NULL;
6027 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006028 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006031 if (size == 1 && *(unsigned char*)s < 128)
6032 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6033
6034 /* Fast path. Assume the input actually *is* ASCII, and allocate
6035 a single-block Unicode object with that assumption. If there is
6036 an error, drop the object and start over. */
6037 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6038 if (v == NULL)
6039 goto onError;
6040 d = PyUnicode_1BYTE_DATA(v);
6041 for (i = 0; i < size; i++) {
6042 unsigned char ch = ((unsigned char*)s)[i];
6043 if (ch < 128)
6044 d[i] = ch;
6045 else
6046 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006048 if (i == size)
6049 return (PyObject*)v;
6050 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 v = _PyUnicode_New(size);
6053 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 e = s + size;
6059 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 register unsigned char c = (unsigned char)*s;
6061 if (c < 128) {
6062 *p++ = c;
6063 ++s;
6064 }
6065 else {
6066 startinpos = s-starts;
6067 endinpos = startinpos + 1;
6068 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6069 if (unicode_decode_call_errorhandler(
6070 errors, &errorHandler,
6071 "ascii", "ordinal not in range(128)",
6072 &starts, &e, &startinpos, &endinpos, &exc, &s,
6073 &v, &outpos, &p))
6074 goto onError;
6075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006077 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6079 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 Py_XDECREF(errorHandler);
6081 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082 if (PyUnicode_READY(v) == -1) {
6083 Py_DECREF(v);
6084 return NULL;
6085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 Py_XDECREF(errorHandler);
6091 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 return NULL;
6093}
6094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
6096PyUnicode_EncodeASCII(const Py_UNICODE *p,
6097 Py_ssize_t size,
6098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101}
6102
Alexander Belopolsky40018472011-02-26 01:02:56 +00006103PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006104_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
6106 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 PyErr_BadArgument();
6108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006110 if (PyUnicode_READY(unicode) == -1)
6111 return NULL;
6112 /* Fast path: if it is an ASCII-only string, construct bytes object
6113 directly. Else defer to above function to raise the exception. */
6114 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6115 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6116 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006119 errors);
6120}
6121
6122PyObject *
6123PyUnicode_AsASCIIString(PyObject *unicode)
6124{
6125 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126}
6127
Victor Stinner99b95382011-07-04 14:23:54 +02006128#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006129
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006130/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006131
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006132#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006133#define NEED_RETRY
6134#endif
6135
6136/* XXX This code is limited to "true" double-byte encodings, as
6137 a) it assumes an incomplete character consists of a single byte, and
6138 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006140
/* Report whether the byte at s[offset] should be treated as a DBCS
   lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position returned by CharPrev() is examined, and 1 is returned when
   that position is the byte itself, when the byte there is not a lead
   byte, or when it lies exactly two bytes back; 0 in any other case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
6152
6153/*
6154 * Decode MBCS string into unicode object. If 'final' is set, converts
6155 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6156 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006157static int
6158decode_mbcs(PyUnicodeObject **v,
6159 const char *s, /* MBCS string */
6160 int size, /* sizeof MBCS string */
6161 int final,
6162 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006163{
6164 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006165 Py_ssize_t n;
6166 DWORD usize;
6167 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006168
6169 assert(size >= 0);
6170
Victor Stinner554f3f02010-06-16 23:33:54 +00006171 /* check and handle 'errors' arg */
6172 if (errors==NULL || strcmp(errors, "strict")==0)
6173 flags = MB_ERR_INVALID_CHARS;
6174 else if (strcmp(errors, "ignore")==0)
6175 flags = 0;
6176 else {
6177 PyErr_Format(PyExc_ValueError,
6178 "mbcs encoding does not support errors='%s'",
6179 errors);
6180 return -1;
6181 }
6182
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006183 /* Skip trailing lead-byte unless 'final' is set */
6184 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006186
6187 /* First get the size of the result */
6188 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006189 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6190 if (usize==0)
6191 goto mbcs_decode_error;
6192 } else
6193 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006194
6195 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 /* Create unicode object */
6197 *v = _PyUnicode_New(usize);
6198 if (*v == NULL)
6199 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006200 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006201 }
6202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Extend unicode object */
6204 n = PyUnicode_GET_SIZE(*v);
6205 if (_PyUnicode_Resize(v, n + usize) < 0)
6206 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006207 }
6208
6209 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006210 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006212 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6213 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006215 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006216 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006217
6218mbcs_decode_error:
6219 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6220 we raise a UnicodeDecodeError - else it is a 'generic'
6221 windows error
6222 */
6223 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6224 /* Ideally, we should get reason from FormatMessage - this
6225 is the Windows 2000 English version of the message
6226 */
6227 PyObject *exc = NULL;
6228 const char *reason = "No mapping for the Unicode character exists "
6229 "in the target multi-byte code page.";
6230 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6231 if (exc != NULL) {
6232 PyCodec_StrictErrors(exc);
6233 Py_DECREF(exc);
6234 }
6235 } else {
6236 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6237 }
6238 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006239}
6240
Alexander Belopolsky40018472011-02-26 01:02:56 +00006241PyObject *
6242PyUnicode_DecodeMBCSStateful(const char *s,
6243 Py_ssize_t size,
6244 const char *errors,
6245 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006246{
6247 PyUnicodeObject *v = NULL;
6248 int done;
6249
6250 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006252
6253#ifdef NEED_RETRY
6254 retry:
6255 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006256 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006257 else
6258#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006259 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006260
6261 if (done < 0) {
6262 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006264 }
6265
6266 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006268
6269#ifdef NEED_RETRY
6270 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 s += done;
6272 size -= done;
6273 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006274 }
6275#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006276 if (PyUnicode_READY(v) == -1) {
6277 Py_DECREF(v);
6278 return NULL;
6279 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006280 return (PyObject *)v;
6281}
6282
Alexander Belopolsky40018472011-02-26 01:02:56 +00006283PyObject *
6284PyUnicode_DecodeMBCS(const char *s,
6285 Py_ssize_t size,
6286 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006288 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6289}
6290
6291/*
6292 * Convert unicode into string object (MBCS).
6293 * Returns 0 if succeed, -1 otherwise.
6294 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006295static int
6296encode_mbcs(PyObject **repr,
6297 const Py_UNICODE *p, /* unicode */
6298 int size, /* size of unicode */
6299 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006300{
Victor Stinner554f3f02010-06-16 23:33:54 +00006301 BOOL usedDefaultChar = FALSE;
6302 BOOL *pusedDefaultChar;
6303 int mbcssize;
6304 Py_ssize_t n;
6305 PyObject *exc = NULL;
6306 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006307
6308 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006309
Victor Stinner554f3f02010-06-16 23:33:54 +00006310 /* check and handle 'errors' arg */
6311 if (errors==NULL || strcmp(errors, "strict")==0) {
6312 flags = WC_NO_BEST_FIT_CHARS;
6313 pusedDefaultChar = &usedDefaultChar;
6314 } else if (strcmp(errors, "replace")==0) {
6315 flags = 0;
6316 pusedDefaultChar = NULL;
6317 } else {
6318 PyErr_Format(PyExc_ValueError,
6319 "mbcs encoding does not support errors='%s'",
6320 errors);
6321 return -1;
6322 }
6323
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006324 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006325 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006326 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6327 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 if (mbcssize == 0) {
6329 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6330 return -1;
6331 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006332 /* If we used a default char, then we failed! */
6333 if (pusedDefaultChar && *pusedDefaultChar)
6334 goto mbcs_encode_error;
6335 } else {
6336 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006337 }
6338
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006339 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* Create string object */
6341 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6342 if (*repr == NULL)
6343 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006344 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006345 }
6346 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 /* Extend string object */
6348 n = PyBytes_Size(*repr);
6349 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6350 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006351 }
6352
6353 /* Do the conversion */
6354 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006356 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6357 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6359 return -1;
6360 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006361 if (pusedDefaultChar && *pusedDefaultChar)
6362 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006363 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006364 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006365
6366mbcs_encode_error:
6367 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6368 Py_XDECREF(exc);
6369 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006370}
6371
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372PyObject *
6373PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6374 Py_ssize_t size,
6375 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006376{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006377 PyObject *repr = NULL;
6378 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006379
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006380#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006382 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006383 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006384 else
6385#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006386 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006387
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006388 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 Py_XDECREF(repr);
6390 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006391 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006392
6393#ifdef NEED_RETRY
6394 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 p += INT_MAX;
6396 size -= INT_MAX;
6397 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006398 }
6399#endif
6400
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006401 return repr;
6402}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006403
Alexander Belopolsky40018472011-02-26 01:02:56 +00006404PyObject *
6405PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006406{
6407 if (!PyUnicode_Check(unicode)) {
6408 PyErr_BadArgument();
6409 return NULL;
6410 }
6411 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 PyUnicode_GET_SIZE(unicode),
6413 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006414}
6415
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006416#undef NEED_RETRY
6417
Victor Stinner99b95382011-07-04 14:23:54 +02006418#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420/* --- Character Mapping Codec -------------------------------------------- */
6421
Alexander Belopolsky40018472011-02-26 01:02:56 +00006422PyObject *
6423PyUnicode_DecodeCharmap(const char *s,
6424 Py_ssize_t size,
6425 PyObject *mapping,
6426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006429 Py_ssize_t startinpos;
6430 Py_ssize_t endinpos;
6431 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 PyUnicodeObject *v;
6434 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006435 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 PyObject *errorHandler = NULL;
6437 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006438 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006439 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 /* Default to Latin-1 */
6442 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444
6445 v = _PyUnicode_New(size);
6446 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006452 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 mapstring = PyUnicode_AS_UNICODE(mapping);
6454 maplen = PyUnicode_GET_SIZE(mapping);
6455 while (s < e) {
6456 unsigned char ch = *s;
6457 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 if (ch < maplen)
6460 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 if (x == 0xfffe) {
6463 /* undefined mapping */
6464 outpos = p-PyUnicode_AS_UNICODE(v);
6465 startinpos = s-starts;
6466 endinpos = startinpos+1;
6467 if (unicode_decode_call_errorhandler(
6468 errors, &errorHandler,
6469 "charmap", "character maps to <undefined>",
6470 &starts, &e, &startinpos, &endinpos, &exc, &s,
6471 &v, &outpos, &p)) {
6472 goto onError;
6473 }
6474 continue;
6475 }
6476 *p++ = x;
6477 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006478 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006479 }
6480 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 while (s < e) {
6482 unsigned char ch = *s;
6483 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6486 w = PyLong_FromLong((long)ch);
6487 if (w == NULL)
6488 goto onError;
6489 x = PyObject_GetItem(mapping, w);
6490 Py_DECREF(w);
6491 if (x == NULL) {
6492 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6493 /* No mapping found means: mapping is undefined. */
6494 PyErr_Clear();
6495 x = Py_None;
6496 Py_INCREF(x);
6497 } else
6498 goto onError;
6499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006500
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 /* Apply mapping */
6502 if (PyLong_Check(x)) {
6503 long value = PyLong_AS_LONG(x);
6504 if (value < 0 || value > 65535) {
6505 PyErr_SetString(PyExc_TypeError,
6506 "character mapping must be in range(65536)");
6507 Py_DECREF(x);
6508 goto onError;
6509 }
6510 *p++ = (Py_UNICODE)value;
6511 }
6512 else if (x == Py_None) {
6513 /* undefined mapping */
6514 outpos = p-PyUnicode_AS_UNICODE(v);
6515 startinpos = s-starts;
6516 endinpos = startinpos+1;
6517 if (unicode_decode_call_errorhandler(
6518 errors, &errorHandler,
6519 "charmap", "character maps to <undefined>",
6520 &starts, &e, &startinpos, &endinpos, &exc, &s,
6521 &v, &outpos, &p)) {
6522 Py_DECREF(x);
6523 goto onError;
6524 }
6525 Py_DECREF(x);
6526 continue;
6527 }
6528 else if (PyUnicode_Check(x)) {
6529 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006530
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 if (targetsize == 1)
6532 /* 1-1 mapping */
6533 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006534
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 else if (targetsize > 1) {
6536 /* 1-n mapping */
6537 if (targetsize > extrachars) {
6538 /* resize first */
6539 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6540 Py_ssize_t needed = (targetsize - extrachars) + \
6541 (targetsize << 2);
6542 extrachars += needed;
6543 /* XXX overflow detection missing */
6544 if (_PyUnicode_Resize(&v,
6545 PyUnicode_GET_SIZE(v) + needed) < 0) {
6546 Py_DECREF(x);
6547 goto onError;
6548 }
6549 p = PyUnicode_AS_UNICODE(v) + oldpos;
6550 }
6551 Py_UNICODE_COPY(p,
6552 PyUnicode_AS_UNICODE(x),
6553 targetsize);
6554 p += targetsize;
6555 extrachars -= targetsize;
6556 }
6557 /* 1-0 mapping: skip the character */
6558 }
6559 else {
6560 /* wrong return value */
6561 PyErr_SetString(PyExc_TypeError,
6562 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006563 Py_DECREF(x);
6564 goto onError;
6565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 Py_DECREF(x);
6567 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 }
6570 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6572 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006573 Py_XDECREF(errorHandler);
6574 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006575 if (PyUnicode_READY(v) == -1) {
6576 Py_DECREF(v);
6577 return NULL;
6578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006580
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 Py_XDECREF(errorHandler);
6583 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 Py_XDECREF(v);
6585 return NULL;
6586}
6587
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006588/* Charmap encoding: the lookup table */
6589
Alexander Belopolsky40018472011-02-26 01:02:56 +00006590struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 PyObject_HEAD
6592 unsigned char level1[32];
6593 int count2, count3;
6594 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006595};
6596
6597static PyObject*
6598encoding_map_size(PyObject *obj, PyObject* args)
6599{
6600 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006601 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006603}
6604
6605static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 PyDoc_STR("Return the size (in bytes) of this object") },
6608 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006609};
6610
6611static void
6612encoding_map_dealloc(PyObject* o)
6613{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006615}
6616
6617static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 "EncodingMap", /*tp_name*/
6620 sizeof(struct encoding_map), /*tp_basicsize*/
6621 0, /*tp_itemsize*/
6622 /* methods */
6623 encoding_map_dealloc, /*tp_dealloc*/
6624 0, /*tp_print*/
6625 0, /*tp_getattr*/
6626 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006627 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 0, /*tp_repr*/
6629 0, /*tp_as_number*/
6630 0, /*tp_as_sequence*/
6631 0, /*tp_as_mapping*/
6632 0, /*tp_hash*/
6633 0, /*tp_call*/
6634 0, /*tp_str*/
6635 0, /*tp_getattro*/
6636 0, /*tp_setattro*/
6637 0, /*tp_as_buffer*/
6638 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6639 0, /*tp_doc*/
6640 0, /*tp_traverse*/
6641 0, /*tp_clear*/
6642 0, /*tp_richcompare*/
6643 0, /*tp_weaklistoffset*/
6644 0, /*tp_iter*/
6645 0, /*tp_iternext*/
6646 encoding_map_methods, /*tp_methods*/
6647 0, /*tp_members*/
6648 0, /*tp_getset*/
6649 0, /*tp_base*/
6650 0, /*tp_dict*/
6651 0, /*tp_descr_get*/
6652 0, /*tp_descr_set*/
6653 0, /*tp_dictoffset*/
6654 0, /*tp_init*/
6655 0, /*tp_alloc*/
6656 0, /*tp_new*/
6657 0, /*tp_free*/
6658 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006659};
6660
6661PyObject*
6662PyUnicode_BuildEncodingMap(PyObject* string)
6663{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006664 PyObject *result;
6665 struct encoding_map *mresult;
6666 int i;
6667 int need_dict = 0;
6668 unsigned char level1[32];
6669 unsigned char level2[512];
6670 unsigned char *mlevel1, *mlevel2, *mlevel3;
6671 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672 int kind;
6673 void *data;
6674 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006677 PyErr_BadArgument();
6678 return NULL;
6679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006680 kind = PyUnicode_KIND(string);
6681 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006682 memset(level1, 0xFF, sizeof level1);
6683 memset(level2, 0xFF, sizeof level2);
6684
6685 /* If there isn't a one-to-one mapping of NULL to \0,
6686 or if there are non-BMP characters, we need to use
6687 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006688 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006689 need_dict = 1;
6690 for (i = 1; i < 256; i++) {
6691 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692 ch = PyUnicode_READ(kind, data, i);
6693 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006694 need_dict = 1;
6695 break;
6696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006697 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006698 /* unmapped character */
6699 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 l1 = ch >> 11;
6701 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006702 if (level1[l1] == 0xFF)
6703 level1[l1] = count2++;
6704 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00006705 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006706 }
6707
6708 if (count2 >= 0xFF || count3 >= 0xFF)
6709 need_dict = 1;
6710
6711 if (need_dict) {
6712 PyObject *result = PyDict_New();
6713 PyObject *key, *value;
6714 if (!result)
6715 return NULL;
6716 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006717 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00006718 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006719 if (!key || !value)
6720 goto failed1;
6721 if (PyDict_SetItem(result, key, value) == -1)
6722 goto failed1;
6723 Py_DECREF(key);
6724 Py_DECREF(value);
6725 }
6726 return result;
6727 failed1:
6728 Py_XDECREF(key);
6729 Py_XDECREF(value);
6730 Py_DECREF(result);
6731 return NULL;
6732 }
6733
6734 /* Create a three-level trie */
6735 result = PyObject_MALLOC(sizeof(struct encoding_map) +
6736 16*count2 + 128*count3 - 1);
6737 if (!result)
6738 return PyErr_NoMemory();
6739 PyObject_Init(result, &EncodingMapType);
6740 mresult = (struct encoding_map*)result;
6741 mresult->count2 = count2;
6742 mresult->count3 = count3;
6743 mlevel1 = mresult->level1;
6744 mlevel2 = mresult->level23;
6745 mlevel3 = mresult->level23 + 16*count2;
6746 memcpy(mlevel1, level1, 32);
6747 memset(mlevel2, 0xFF, 16*count2);
6748 memset(mlevel3, 0, 128*count3);
6749 count3 = 0;
6750 for (i = 1; i < 256; i++) {
6751 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006752 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006753 /* unmapped character */
6754 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006755 o1 = PyUnicode_READ(kind, data, i)>>11;
6756 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006757 i2 = 16*mlevel1[o1] + o2;
6758 if (mlevel2[i2] == 0xFF)
6759 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006760 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006761 i3 = 128*mlevel2[i2] + o3;
6762 mlevel3[i3] = i;
6763 }
6764 return result;
6765}
6766
6767static int
6768encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
6769{
6770 struct encoding_map *map = (struct encoding_map*)mapping;
6771 int l1 = c>>11;
6772 int l2 = (c>>7) & 0xF;
6773 int l3 = c & 0x7F;
6774 int i;
6775
6776#ifdef Py_UNICODE_WIDE
6777 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006779 }
6780#endif
6781 if (c == 0)
6782 return 0;
6783 /* level 1*/
6784 i = map->level1[l1];
6785 if (i == 0xFF) {
6786 return -1;
6787 }
6788 /* level 2*/
6789 i = map->level23[16*i+l2];
6790 if (i == 0xFF) {
6791 return -1;
6792 }
6793 /* level 3 */
6794 i = map->level23[16*map->count2 + 128*i + l3];
6795 if (i == 0) {
6796 return -1;
6797 }
6798 return i;
6799}
6800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801/* Lookup the character ch in the mapping. If the character
6802 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00006803 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006804static PyObject *
6805charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
Christian Heimes217cfd12007-12-02 14:31:20 +00006807 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006808 PyObject *x;
6809
6810 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 x = PyObject_GetItem(mapping, w);
6813 Py_DECREF(w);
6814 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6816 /* No mapping found means: mapping is undefined. */
6817 PyErr_Clear();
6818 x = Py_None;
6819 Py_INCREF(x);
6820 return x;
6821 } else
6822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00006824 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00006826 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 long value = PyLong_AS_LONG(x);
6828 if (value < 0 || value > 255) {
6829 PyErr_SetString(PyExc_TypeError,
6830 "character mapping must be in range(256)");
6831 Py_DECREF(x);
6832 return NULL;
6833 }
6834 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006836 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 /* wrong return value */
6840 PyErr_Format(PyExc_TypeError,
6841 "character mapping must return integer, bytes or None, not %.400s",
6842 x->ob_type->tp_name);
6843 Py_DECREF(x);
6844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 }
6846}
6847
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006848static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00006849charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006850{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006851 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
6852 /* exponentially overallocate to minimize reallocations */
6853 if (requiredsize < 2*outsize)
6854 requiredsize = 2*outsize;
6855 if (_PyBytes_Resize(outobj, requiredsize))
6856 return -1;
6857 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006858}
6859
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00006864 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006865 space is available. Return a new reference to the object that
6866 was put in the output buffer, or Py_None, if the mapping was undefined
6867 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00006868 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006869static charmapencode_result
6870charmapencode_output(Py_UNICODE c, PyObject *mapping,
6871 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006873 PyObject *rep;
6874 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00006875 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876
Christian Heimes90aa7642007-12-19 02:45:37 +00006877 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006878 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006880 if (res == -1)
6881 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 if (outsize<requiredsize)
6883 if (charmapencode_resize(outobj, outpos, requiredsize))
6884 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00006885 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 outstart[(*outpos)++] = (char)res;
6887 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006888 }
6889
6890 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006893 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 Py_DECREF(rep);
6895 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006896 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 if (PyLong_Check(rep)) {
6898 Py_ssize_t requiredsize = *outpos+1;
6899 if (outsize<requiredsize)
6900 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6901 Py_DECREF(rep);
6902 return enc_EXCEPTION;
6903 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006904 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 else {
6908 const char *repchars = PyBytes_AS_STRING(rep);
6909 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
6910 Py_ssize_t requiredsize = *outpos+repsize;
6911 if (outsize<requiredsize)
6912 if (charmapencode_resize(outobj, outpos, requiredsize)) {
6913 Py_DECREF(rep);
6914 return enc_EXCEPTION;
6915 }
Christian Heimes72b710a2008-05-26 13:28:38 +00006916 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 memcpy(outstart + *outpos, repchars, repsize);
6918 *outpos += repsize;
6919 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006921 Py_DECREF(rep);
6922 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923}
6924
6925/* handle an error in PyUnicode_EncodeCharmap
6926 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006927static int
6928charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00006929 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00006931 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00006932 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933{
6934 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935 Py_ssize_t repsize;
6936 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 Py_UNICODE *uni2;
6938 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006939 Py_ssize_t collstartpos = *inpos;
6940 Py_ssize_t collendpos = *inpos+1;
6941 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006942 char *encoding = "charmap";
6943 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006944 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 /* find all unencodable characters */
6947 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006948 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00006949 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 int res = encoding_map_lookup(p[collendpos], mapping);
6951 if (res != -1)
6952 break;
6953 ++collendpos;
6954 continue;
6955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006956
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 rep = charmapencode_lookup(p[collendpos], mapping);
6958 if (rep==NULL)
6959 return -1;
6960 else if (rep!=Py_None) {
6961 Py_DECREF(rep);
6962 break;
6963 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006964 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 }
6967 /* cache callback name lookup
6968 * (if not done yet, i.e. it's the first error) */
6969 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 if ((errors==NULL) || (!strcmp(errors, "strict")))
6971 *known_errorHandler = 1;
6972 else if (!strcmp(errors, "replace"))
6973 *known_errorHandler = 2;
6974 else if (!strcmp(errors, "ignore"))
6975 *known_errorHandler = 3;
6976 else if (!strcmp(errors, "xmlcharrefreplace"))
6977 *known_errorHandler = 4;
6978 else
6979 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006980 }
6981 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006982 case 1: /* strict */
6983 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6984 return -1;
6985 case 2: /* replace */
6986 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 x = charmapencode_output('?', mapping, res, respos);
6988 if (x==enc_EXCEPTION) {
6989 return -1;
6990 }
6991 else if (x==enc_FAILED) {
6992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
6993 return -1;
6994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006995 }
6996 /* fall through */
6997 case 3: /* ignore */
6998 *inpos = collendpos;
6999 break;
7000 case 4: /* xmlcharrefreplace */
7001 /* generate replacement (temporarily (mis)uses p) */
7002 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 char buffer[2+29+1+1];
7004 char *cp;
7005 sprintf(buffer, "&#%d;", (int)p[collpos]);
7006 for (cp = buffer; *cp; ++cp) {
7007 x = charmapencode_output(*cp, mapping, res, respos);
7008 if (x==enc_EXCEPTION)
7009 return -1;
7010 else if (x==enc_FAILED) {
7011 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7012 return -1;
7013 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007014 }
7015 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007016 *inpos = collendpos;
7017 break;
7018 default:
7019 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 encoding, reason, p, size, exceptionObject,
7021 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007022 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007024 if (PyBytes_Check(repunicode)) {
7025 /* Directly copy bytes result to output. */
7026 Py_ssize_t outsize = PyBytes_Size(*res);
7027 Py_ssize_t requiredsize;
7028 repsize = PyBytes_Size(repunicode);
7029 requiredsize = *respos + repsize;
7030 if (requiredsize > outsize)
7031 /* Make room for all additional bytes. */
7032 if (charmapencode_resize(res, respos, requiredsize)) {
7033 Py_DECREF(repunicode);
7034 return -1;
7035 }
7036 memcpy(PyBytes_AsString(*res) + *respos,
7037 PyBytes_AsString(repunicode), repsize);
7038 *respos += repsize;
7039 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007040 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007041 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007042 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007043 /* generate replacement */
7044 repsize = PyUnicode_GET_SIZE(repunicode);
7045 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 x = charmapencode_output(*uni2, mapping, res, respos);
7047 if (x==enc_EXCEPTION) {
7048 return -1;
7049 }
7050 else if (x==enc_FAILED) {
7051 Py_DECREF(repunicode);
7052 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7053 return -1;
7054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007055 }
7056 *inpos = newpos;
7057 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058 }
7059 return 0;
7060}
7061
Alexander Belopolsky40018472011-02-26 01:02:56 +00007062PyObject *
7063PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7064 Py_ssize_t size,
7065 PyObject *mapping,
7066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 /* output object */
7069 PyObject *res = NULL;
7070 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007071 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007073 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 PyObject *errorHandler = NULL;
7075 PyObject *exc = NULL;
7076 /* the following variable is used for caching string comparisons
7077 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7078 * 3=ignore, 4=xmlcharrefreplace */
7079 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080
7081 /* Default to Latin-1 */
7082 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007085 /* allocate enough for a simple encoding without
7086 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007087 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 if (res == NULL)
7089 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007090 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007093 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 /* try to encode it */
7095 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7096 if (x==enc_EXCEPTION) /* error */
7097 goto onError;
7098 if (x==enc_FAILED) { /* unencodable character */
7099 if (charmap_encoding_error(p, size, &inpos, mapping,
7100 &exc,
7101 &known_errorHandler, &errorHandler, errors,
7102 &res, &respos)) {
7103 goto onError;
7104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007105 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 else
7107 /* done with this character => adjust input position */
7108 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007112 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007113 if (_PyBytes_Resize(&res, respos) < 0)
7114 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007115
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007116 Py_XDECREF(exc);
7117 Py_XDECREF(errorHandler);
7118 return res;
7119
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121 Py_XDECREF(res);
7122 Py_XDECREF(exc);
7123 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 return NULL;
7125}
7126
Alexander Belopolsky40018472011-02-26 01:02:56 +00007127PyObject *
7128PyUnicode_AsCharmapString(PyObject *unicode,
7129 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
7131 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 PyErr_BadArgument();
7133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 }
7135 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 PyUnicode_GET_SIZE(unicode),
7137 mapping,
7138 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139}
7140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007142static void
7143make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007144 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145 Py_ssize_t startpos, Py_ssize_t endpos,
7146 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007149 *exceptionObject = _PyUnicodeTranslateError_Create(
7150 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 }
7152 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7154 goto onError;
7155 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7156 goto onError;
7157 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7158 goto onError;
7159 return;
7160 onError:
7161 Py_DECREF(*exceptionObject);
7162 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 }
7164}
7165
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007166/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static void
7168raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007169 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007170 Py_ssize_t startpos, Py_ssize_t endpos,
7171 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007172{
7173 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007174 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177}
7178
7179/* error handling callback helper:
7180 build arguments, call the callback and check the arguments,
7181 put the result into newpos and return the replacement string, which
7182 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007183static PyObject *
7184unicode_translate_call_errorhandler(const char *errors,
7185 PyObject **errorHandler,
7186 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007187 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007188 Py_ssize_t startpos, Py_ssize_t endpos,
7189 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007191 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007193 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 PyObject *restuple;
7195 PyObject *resunicode;
7196
7197 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201 }
7202
7203 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007204 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007207
7208 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007212 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007213 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 Py_DECREF(restuple);
7215 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 }
7217 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 &resunicode, &i_newpos)) {
7219 Py_DECREF(restuple);
7220 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007221 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007222 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007224 else
7225 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7228 Py_DECREF(restuple);
7229 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007230 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007231 Py_INCREF(resunicode);
7232 Py_DECREF(restuple);
7233 return resunicode;
7234}
7235
7236/* Lookup the character ch in the mapping and put the result in result,
7237 which must be decrefed by the caller.
7238 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007239static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007240charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007241{
Christian Heimes217cfd12007-12-02 14:31:20 +00007242 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 PyObject *x;
7244
7245 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007247 x = PyObject_GetItem(mapping, w);
7248 Py_DECREF(w);
7249 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7251 /* No mapping found means: use 1:1 mapping. */
7252 PyErr_Clear();
7253 *result = NULL;
7254 return 0;
7255 } else
7256 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 }
7258 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 *result = x;
7260 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007262 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 long value = PyLong_AS_LONG(x);
7264 long max = PyUnicode_GetMax();
7265 if (value < 0 || value > max) {
7266 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007267 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 Py_DECREF(x);
7269 return -1;
7270 }
7271 *result = x;
7272 return 0;
7273 }
7274 else if (PyUnicode_Check(x)) {
7275 *result = x;
7276 return 0;
7277 }
7278 else {
7279 /* wrong return value */
7280 PyErr_SetString(PyExc_TypeError,
7281 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007282 Py_DECREF(x);
7283 return -1;
7284 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285}
7286/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 if not reallocate and adjust various state variables.
7288 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007293 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007294 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 /* exponentially overallocate to minimize reallocations */
7296 if (requiredsize < 2 * oldsize)
7297 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007298 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7299 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007301 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302 }
7303 return 0;
7304}
7305/* lookup the character, put the result in the output string and adjust
7306 various state variables. Return a new reference to the object that
7307 was put in the output buffer in *result, or Py_None, if the mapping was
7308 undefined (in which case no character was written).
7309 The called must decref result.
7310 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007311static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007312charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7313 PyObject *mapping, Py_UCS4 **output,
7314 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007315 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7318 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007320 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007322 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 }
7324 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007326 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 }
7330 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007331 Py_ssize_t repsize;
7332 if (PyUnicode_READY(*res) == -1)
7333 return -1;
7334 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 if (repsize==1) {
7336 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007337 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 }
7339 else if (repsize!=0) {
7340 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007341 Py_ssize_t requiredsize = *opos +
7342 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007344 Py_ssize_t i;
7345 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007347 for(i = 0; i < repsize; i++)
7348 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350 }
7351 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 return 0;
7354}
7355
Alexander Belopolsky40018472011-02-26 01:02:56 +00007356PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007357_PyUnicode_TranslateCharmap(PyObject *input,
7358 PyObject *mapping,
7359 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 /* input object */
7362 char *idata;
7363 Py_ssize_t size, i;
7364 int kind;
7365 /* output buffer */
7366 Py_UCS4 *output = NULL;
7367 Py_ssize_t osize;
7368 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007369 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007370 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007371 char *reason = "character maps to <undefined>";
7372 PyObject *errorHandler = NULL;
7373 PyObject *exc = NULL;
7374 /* the following variable is used for caching string comparisons
7375 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7376 * 3=ignore, 4=xmlcharrefreplace */
7377 int known_errorHandler = -1;
7378
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 PyErr_BadArgument();
7381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384 if (PyUnicode_READY(input) == -1)
7385 return NULL;
7386 idata = (char*)PyUnicode_DATA(input);
7387 kind = PyUnicode_KIND(input);
7388 size = PyUnicode_GET_LENGTH(input);
7389 i = 0;
7390
7391 if (size == 0) {
7392 Py_INCREF(input);
7393 return input;
7394 }
7395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 /* allocate enough for a simple 1:1 translation without
7397 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007398 osize = size;
7399 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7400 opos = 0;
7401 if (output == NULL) {
7402 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007406 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 /* try to encode it */
7408 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007409 if (charmaptranslate_output(input, i, mapping,
7410 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 Py_XDECREF(x);
7412 goto onError;
7413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007414 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007416 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 else { /* untranslatable character */
7418 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7419 Py_ssize_t repsize;
7420 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007421 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007423 Py_ssize_t collstart = i;
7424 Py_ssize_t collend = i+1;
7425 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428 while (collend < size) {
7429 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 goto onError;
7431 Py_XDECREF(x);
7432 if (x!=Py_None)
7433 break;
7434 ++collend;
7435 }
7436 /* cache callback name lookup
7437 * (if not done yet, i.e. it's the first error) */
7438 if (known_errorHandler==-1) {
7439 if ((errors==NULL) || (!strcmp(errors, "strict")))
7440 known_errorHandler = 1;
7441 else if (!strcmp(errors, "replace"))
7442 known_errorHandler = 2;
7443 else if (!strcmp(errors, "ignore"))
7444 known_errorHandler = 3;
7445 else if (!strcmp(errors, "xmlcharrefreplace"))
7446 known_errorHandler = 4;
7447 else
7448 known_errorHandler = 0;
7449 }
7450 switch (known_errorHandler) {
7451 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007452 raise_translate_exception(&exc, input, collstart,
7453 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007454 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 case 2: /* replace */
7456 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 for (coll = collstart; coll<collend; coll++)
7458 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 /* fall through */
7460 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007461 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 break;
7463 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007464 /* generate replacement (temporarily (mis)uses i) */
7465 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 char buffer[2+29+1+1];
7467 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7469 if (charmaptranslate_makespace(&output, &osize,
7470 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 goto onError;
7472 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 break;
7477 default:
7478 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007479 reason, input, &exc,
7480 collstart, collend, &newpos);
7481 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 goto onError;
7483 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007484 repsize = PyUnicode_GET_LENGTH(repunicode);
7485 if (charmaptranslate_makespace(&output, &osize,
7486 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 Py_DECREF(repunicode);
7488 goto onError;
7489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007490 for (uni2 = 0; repsize-->0; ++uni2)
7491 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7492 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 }
7496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007497 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7498 if (!res)
7499 goto onError;
7500 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007501 Py_XDECREF(exc);
7502 Py_XDECREF(errorHandler);
7503 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007506 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 Py_XDECREF(exc);
7508 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 return NULL;
7510}
7511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512/* Deprecated. Use PyUnicode_Translate instead. */
7513PyObject *
7514PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7515 Py_ssize_t size,
7516 PyObject *mapping,
7517 const char *errors)
7518{
7519 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7520 if (!unicode)
7521 return NULL;
7522 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7523}
7524
Alexander Belopolsky40018472011-02-26 01:02:56 +00007525PyObject *
7526PyUnicode_Translate(PyObject *str,
7527 PyObject *mapping,
7528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529{
7530 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007531
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 str = PyUnicode_FromObject(str);
7533 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007535 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 Py_DECREF(str);
7537 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007538
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 Py_XDECREF(str);
7541 return NULL;
7542}
Tim Petersced69f82003-09-16 20:30:58 +00007543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007544static Py_UCS4
7545fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7546{
7547 /* No need to call PyUnicode_READY(self) because this function is only
7548 called as a callback from fixup() which does it already. */
7549 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7550 const int kind = PyUnicode_KIND(self);
7551 void *data = PyUnicode_DATA(self);
7552 Py_UCS4 maxchar = 0, ch, fixed;
7553 Py_ssize_t i;
7554
7555 for (i = 0; i < len; ++i) {
7556 ch = PyUnicode_READ(kind, data, i);
7557 fixed = 0;
7558 if (ch > 127) {
7559 if (Py_UNICODE_ISSPACE(ch))
7560 fixed = ' ';
7561 else {
7562 const int decimal = Py_UNICODE_TODECIMAL(ch);
7563 if (decimal >= 0)
7564 fixed = '0' + decimal;
7565 }
7566 if (fixed != 0) {
7567 if (fixed > maxchar)
7568 maxchar = fixed;
7569 PyUnicode_WRITE(kind, data, i, fixed);
7570 }
7571 else if (ch > maxchar)
7572 maxchar = ch;
7573 }
7574 else if (ch > maxchar)
7575 maxchar = ch;
7576 }
7577
7578 return maxchar;
7579}
7580
7581PyObject *
7582_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7583{
7584 if (!PyUnicode_Check(unicode)) {
7585 PyErr_BadInternalCall();
7586 return NULL;
7587 }
7588 if (PyUnicode_READY(unicode) == -1)
7589 return NULL;
7590 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7591 /* If the string is already ASCII, just return the same string */
7592 Py_INCREF(unicode);
7593 return unicode;
7594 }
7595 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7596}
7597
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007598PyObject *
7599PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7600 Py_ssize_t length)
7601{
7602 PyObject *result;
7603 Py_UNICODE *p; /* write pointer into result */
7604 Py_ssize_t i;
7605 /* Copy to a new string */
7606 result = (PyObject *)_PyUnicode_New(length);
7607 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7608 if (result == NULL)
7609 return result;
7610 p = PyUnicode_AS_UNICODE(result);
7611 /* Iterate over code points */
7612 for (i = 0; i < length; i++) {
7613 Py_UNICODE ch =s[i];
7614 if (ch > 127) {
7615 int decimal = Py_UNICODE_TODECIMAL(ch);
7616 if (decimal >= 0)
7617 p[i] = '0' + decimal;
7618 }
7619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7621 Py_DECREF(result);
7622 return NULL;
7623 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007624 return result;
7625}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007626/* --- Decimal Encoder ---------------------------------------------------- */
7627
Alexander Belopolsky40018472011-02-26 01:02:56 +00007628int
7629PyUnicode_EncodeDecimal(Py_UNICODE *s,
7630 Py_ssize_t length,
7631 char *output,
7632 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007633{
7634 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635 PyObject *errorHandler = NULL;
7636 PyObject *exc = NULL;
7637 const char *encoding = "decimal";
7638 const char *reason = "invalid decimal Unicode string";
7639 /* the following variable is used for caching string comparisons
7640 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7641 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007642
7643 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 PyErr_BadArgument();
7645 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007646 }
7647
7648 p = s;
7649 end = s + length;
7650 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 register Py_UNICODE ch = *p;
7652 int decimal;
7653 PyObject *repunicode;
7654 Py_ssize_t repsize;
7655 Py_ssize_t newpos;
7656 Py_UNICODE *uni2;
7657 Py_UNICODE *collstart;
7658 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007659
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007661 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 ++p;
7663 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 decimal = Py_UNICODE_TODECIMAL(ch);
7666 if (decimal >= 0) {
7667 *output++ = '0' + decimal;
7668 ++p;
7669 continue;
7670 }
7671 if (0 < ch && ch < 256) {
7672 *output++ = (char)ch;
7673 ++p;
7674 continue;
7675 }
7676 /* All other characters are considered unencodable */
7677 collstart = p;
7678 collend = p+1;
7679 while (collend < end) {
7680 if ((0 < *collend && *collend < 256) ||
7681 !Py_UNICODE_ISSPACE(*collend) ||
7682 Py_UNICODE_TODECIMAL(*collend))
7683 break;
7684 }
7685 /* cache callback name lookup
7686 * (if not done yet, i.e. it's the first error) */
7687 if (known_errorHandler==-1) {
7688 if ((errors==NULL) || (!strcmp(errors, "strict")))
7689 known_errorHandler = 1;
7690 else if (!strcmp(errors, "replace"))
7691 known_errorHandler = 2;
7692 else if (!strcmp(errors, "ignore"))
7693 known_errorHandler = 3;
7694 else if (!strcmp(errors, "xmlcharrefreplace"))
7695 known_errorHandler = 4;
7696 else
7697 known_errorHandler = 0;
7698 }
7699 switch (known_errorHandler) {
7700 case 1: /* strict */
7701 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
7702 goto onError;
7703 case 2: /* replace */
7704 for (p = collstart; p < collend; ++p)
7705 *output++ = '?';
7706 /* fall through */
7707 case 3: /* ignore */
7708 p = collend;
7709 break;
7710 case 4: /* xmlcharrefreplace */
7711 /* generate replacement (temporarily (mis)uses p) */
7712 for (p = collstart; p < collend; ++p)
7713 output += sprintf(output, "&#%d;", (int)*p);
7714 p = collend;
7715 break;
7716 default:
7717 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
7718 encoding, reason, s, length, &exc,
7719 collstart-s, collend-s, &newpos);
7720 if (repunicode == NULL)
7721 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007722 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007723 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007724 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
7725 Py_DECREF(repunicode);
7726 goto onError;
7727 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 /* generate replacement */
7729 repsize = PyUnicode_GET_SIZE(repunicode);
7730 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7731 Py_UNICODE ch = *uni2;
7732 if (Py_UNICODE_ISSPACE(ch))
7733 *output++ = ' ';
7734 else {
7735 decimal = Py_UNICODE_TODECIMAL(ch);
7736 if (decimal >= 0)
7737 *output++ = '0' + decimal;
7738 else if (0 < ch && ch < 256)
7739 *output++ = (char)ch;
7740 else {
7741 Py_DECREF(repunicode);
7742 raise_encode_exception(&exc, encoding,
7743 s, length, collstart-s, collend-s, reason);
7744 goto onError;
7745 }
7746 }
7747 }
7748 p = s + newpos;
7749 Py_DECREF(repunicode);
7750 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00007751 }
7752 /* 0-terminate the output string */
7753 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007754 Py_XDECREF(exc);
7755 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007756 return 0;
7757
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 Py_XDECREF(exc);
7760 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00007761 return -1;
7762}
7763
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764/* --- Helpers ------------------------------------------------------------ */
7765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766#include "stringlib/ucs1lib.h"
7767#include "stringlib/fastsearch.h"
7768#include "stringlib/partition.h"
7769#include "stringlib/split.h"
7770#include "stringlib/count.h"
7771#include "stringlib/find.h"
7772#include "stringlib/localeutil.h"
7773#include "stringlib/undef.h"
7774
7775#include "stringlib/ucs2lib.h"
7776#include "stringlib/fastsearch.h"
7777#include "stringlib/partition.h"
7778#include "stringlib/split.h"
7779#include "stringlib/count.h"
7780#include "stringlib/find.h"
7781#include "stringlib/localeutil.h"
7782#include "stringlib/undef.h"
7783
7784#include "stringlib/ucs4lib.h"
7785#include "stringlib/fastsearch.h"
7786#include "stringlib/partition.h"
7787#include "stringlib/split.h"
7788#include "stringlib/count.h"
7789#include "stringlib/find.h"
7790#include "stringlib/localeutil.h"
7791#include "stringlib/undef.h"
7792
7793static Py_ssize_t
7794any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
7795 const Py_UCS1*, Py_ssize_t,
7796 Py_ssize_t, Py_ssize_t),
7797 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
7798 const Py_UCS2*, Py_ssize_t,
7799 Py_ssize_t, Py_ssize_t),
7800 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
7801 const Py_UCS4*, Py_ssize_t,
7802 Py_ssize_t, Py_ssize_t),
7803 PyObject* s1, PyObject* s2,
7804 Py_ssize_t start,
7805 Py_ssize_t end)
7806{
7807 int kind1, kind2, kind;
7808 void *buf1, *buf2;
7809 Py_ssize_t len1, len2, result;
7810
7811 kind1 = PyUnicode_KIND(s1);
7812 kind2 = PyUnicode_KIND(s2);
7813 kind = kind1 > kind2 ? kind1 : kind2;
7814 buf1 = PyUnicode_DATA(s1);
7815 buf2 = PyUnicode_DATA(s2);
7816 if (kind1 != kind)
7817 buf1 = _PyUnicode_AsKind(s1, kind);
7818 if (!buf1)
7819 return -2;
7820 if (kind2 != kind)
7821 buf2 = _PyUnicode_AsKind(s2, kind);
7822 if (!buf2) {
7823 if (kind1 != kind) PyMem_Free(buf1);
7824 return -2;
7825 }
7826 len1 = PyUnicode_GET_LENGTH(s1);
7827 len2 = PyUnicode_GET_LENGTH(s2);
7828
7829 switch(kind) {
7830 case PyUnicode_1BYTE_KIND:
7831 result = ucs1(buf1, len1, buf2, len2, start, end);
7832 break;
7833 case PyUnicode_2BYTE_KIND:
7834 result = ucs2(buf1, len1, buf2, len2, start, end);
7835 break;
7836 case PyUnicode_4BYTE_KIND:
7837 result = ucs4(buf1, len1, buf2, len2, start, end);
7838 break;
7839 default:
7840 assert(0); result = -2;
7841 }
7842
7843 if (kind1 != kind)
7844 PyMem_Free(buf1);
7845 if (kind2 != kind)
7846 PyMem_Free(buf2);
7847
7848 return result;
7849}
7850
7851Py_ssize_t
7852_PyUnicode_InsertThousandsGrouping(int kind, void *data,
7853 Py_ssize_t n_buffer,
7854 void *digits, Py_ssize_t n_digits,
7855 Py_ssize_t min_width,
7856 const char *grouping,
7857 const char *thousands_sep)
7858{
7859 switch(kind) {
7860 case PyUnicode_1BYTE_KIND:
7861 return _PyUnicode_ucs1_InsertThousandsGrouping(
7862 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
7863 min_width, grouping, thousands_sep);
7864 case PyUnicode_2BYTE_KIND:
7865 return _PyUnicode_ucs2_InsertThousandsGrouping(
7866 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
7867 min_width, grouping, thousands_sep);
7868 case PyUnicode_4BYTE_KIND:
7869 return _PyUnicode_ucs4_InsertThousandsGrouping(
7870 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
7871 min_width, grouping, thousands_sep);
7872 }
7873 assert(0);
7874 return -1;
7875}
7876
7877
Eric Smith8c663262007-08-25 02:26:07 +00007878#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00007879#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007880
Thomas Wouters477c8d52006-05-27 19:21:47 +00007881#include "stringlib/count.h"
7882#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00007883
/* helper macro to fixup start/end slice values

   Normalizes a (start, end) pair against a sequence of length `len`:
     - `end` larger than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it, and is set to 0
       if it is still negative afterwards.
   `start` is not clamped against `len` or `end`, so start > end is
   possible after expansion and must be handled by the caller.

   Usage notes:
     - `start` and `end` are assigned to, so they must be modifiable lvalues.
     - `len` appears several times in the expansion; pass an expression
       without side effects.
     - The expansion is two bare `if` statements (no do { } while (0)
       wrapper), so do not use it as the unbraced body of an if/else.
   NOTE(review): the stringlib headers included above may define a macro of
   the same name -- confirm before changing the replacement list, since a
   differing redefinition would not be compatible. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007898
Alexander Belopolsky40018472011-02-26 01:02:56 +00007899Py_ssize_t
7900PyUnicode_Count(PyObject *str,
7901 PyObject *substr,
7902 Py_ssize_t start,
7903 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007905 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007906 PyUnicodeObject* str_obj;
7907 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 int kind1, kind2, kind;
7909 void *buf1 = NULL, *buf2 = NULL;
7910 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00007911
Thomas Wouters477c8d52006-05-27 19:21:47 +00007912 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007915 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007916 if (!sub_obj || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 Py_DECREF(str_obj);
7918 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 }
Tim Petersced69f82003-09-16 20:30:58 +00007920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007921 kind1 = PyUnicode_KIND(str_obj);
7922 kind2 = PyUnicode_KIND(sub_obj);
7923 kind = kind1 > kind2 ? kind1 : kind2;
7924 buf1 = PyUnicode_DATA(str_obj);
7925 if (kind1 != kind)
7926 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
7927 if (!buf1)
7928 goto onError;
7929 buf2 = PyUnicode_DATA(sub_obj);
7930 if (kind2 != kind)
7931 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
7932 if (!buf2)
7933 goto onError;
7934 len1 = PyUnicode_GET_LENGTH(str_obj);
7935 len2 = PyUnicode_GET_LENGTH(sub_obj);
7936
7937 ADJUST_INDICES(start, end, len1);
7938 switch(kind) {
7939 case PyUnicode_1BYTE_KIND:
7940 result = ucs1lib_count(
7941 ((Py_UCS1*)buf1) + start, end - start,
7942 buf2, len2, PY_SSIZE_T_MAX
7943 );
7944 break;
7945 case PyUnicode_2BYTE_KIND:
7946 result = ucs2lib_count(
7947 ((Py_UCS2*)buf1) + start, end - start,
7948 buf2, len2, PY_SSIZE_T_MAX
7949 );
7950 break;
7951 case PyUnicode_4BYTE_KIND:
7952 result = ucs4lib_count(
7953 ((Py_UCS4*)buf1) + start, end - start,
7954 buf2, len2, PY_SSIZE_T_MAX
7955 );
7956 break;
7957 default:
7958 assert(0); result = 0;
7959 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007960
7961 Py_DECREF(sub_obj);
7962 Py_DECREF(str_obj);
7963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 if (kind1 != kind)
7965 PyMem_Free(buf1);
7966 if (kind2 != kind)
7967 PyMem_Free(buf2);
7968
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007970 onError:
7971 Py_DECREF(sub_obj);
7972 Py_DECREF(str_obj);
7973 if (kind1 != kind && buf1)
7974 PyMem_Free(buf1);
7975 if (kind2 != kind && buf2)
7976 PyMem_Free(buf2);
7977 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978}
7979
Alexander Belopolsky40018472011-02-26 01:02:56 +00007980Py_ssize_t
7981PyUnicode_Find(PyObject *str,
7982 PyObject *sub,
7983 Py_ssize_t start,
7984 Py_ssize_t end,
7985 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007987 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00007988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007990 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007992 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 Py_DECREF(str);
7995 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Tim Petersced69f82003-09-16 20:30:58 +00007997
Thomas Wouters477c8d52006-05-27 19:21:47 +00007998 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999 result = any_find_slice(
8000 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8001 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008002 );
8003 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004 result = any_find_slice(
8005 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8006 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008007 );
8008
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008010 Py_DECREF(sub);
8011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 return result;
8013}
8014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015Py_ssize_t
8016PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8017 Py_ssize_t start, Py_ssize_t end,
8018 int direction)
8019{
8020 char *result;
8021 int kind;
8022 if (PyUnicode_READY(str) == -1)
8023 return -2;
8024 if (end > PyUnicode_GET_LENGTH(str))
8025 end = PyUnicode_GET_LENGTH(str);
8026 kind = PyUnicode_KIND(str);
8027 result = findchar(PyUnicode_1BYTE_DATA(str)
8028 + PyUnicode_KIND_SIZE(kind, start),
8029 kind,
8030 end-start, ch, direction);
8031 if (!result)
8032 return -1;
8033 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8034}
8035
Alexander Belopolsky40018472011-02-26 01:02:56 +00008036static int
8037tailmatch(PyUnicodeObject *self,
8038 PyUnicodeObject *substring,
8039 Py_ssize_t start,
8040 Py_ssize_t end,
8041 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008043 int kind_self;
8044 int kind_sub;
8045 void *data_self;
8046 void *data_sub;
8047 Py_ssize_t offset;
8048 Py_ssize_t i;
8049 Py_ssize_t end_sub;
8050
8051 if (PyUnicode_READY(self) == -1 ||
8052 PyUnicode_READY(substring) == -1)
8053 return 0;
8054
8055 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 return 1;
8057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8059 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008063 kind_self = PyUnicode_KIND(self);
8064 data_self = PyUnicode_DATA(self);
8065 kind_sub = PyUnicode_KIND(substring);
8066 data_sub = PyUnicode_DATA(substring);
8067 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8068
8069 if (direction > 0)
8070 offset = end;
8071 else
8072 offset = start;
8073
8074 if (PyUnicode_READ(kind_self, data_self, offset) ==
8075 PyUnicode_READ(kind_sub, data_sub, 0) &&
8076 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8077 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8078 /* If both are of the same kind, memcmp is sufficient */
8079 if (kind_self == kind_sub) {
8080 return ! memcmp((char *)data_self +
8081 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8082 data_sub,
8083 PyUnicode_GET_LENGTH(substring) *
8084 PyUnicode_CHARACTER_SIZE(substring));
8085 }
8086 /* otherwise we have to compare each character by first accesing it */
8087 else {
8088 /* We do not need to compare 0 and len(substring)-1 because
8089 the if statement above ensured already that they are equal
8090 when we end up here. */
8091 // TODO: honor direction and do a forward or backwards search
8092 for (i = 1; i < end_sub; ++i) {
8093 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8094 PyUnicode_READ(kind_sub, data_sub, i))
8095 return 0;
8096 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099 }
8100
8101 return 0;
8102}
8103
Alexander Belopolsky40018472011-02-26 01:02:56 +00008104Py_ssize_t
8105PyUnicode_Tailmatch(PyObject *str,
8106 PyObject *substr,
8107 Py_ssize_t start,
8108 Py_ssize_t end,
8109 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 str = PyUnicode_FromObject(str);
8114 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 substr = PyUnicode_FromObject(substr);
8117 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 Py_DECREF(str);
8119 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 }
Tim Petersced69f82003-09-16 20:30:58 +00008121
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 (PyUnicodeObject *)substr,
8124 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 Py_DECREF(str);
8126 Py_DECREF(substr);
8127 return result;
8128}
8129
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130/* Apply fixfct filter to the Unicode object self and return a
8131 reference to the modified object */
8132
Alexander Belopolsky40018472011-02-26 01:02:56 +00008133static PyObject *
8134fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 PyObject *u;
8138 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 if (PyUnicode_READY(self) == -1)
8141 return NULL;
8142 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8143 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8144 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008148 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8149 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 /* fix functions return the new maximum character in a string,
8152 if the kind of the resulting unicode object does not change,
8153 everything is fine. Otherwise we need to change the string kind
8154 and re-run the fix function. */
8155 maxchar_new = fixfct((PyUnicodeObject*)u);
8156 if (maxchar_new == 0)
8157 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8158 else if (maxchar_new <= 127)
8159 maxchar_new = 127;
8160 else if (maxchar_new <= 255)
8161 maxchar_new = 255;
8162 else if (maxchar_new <= 65535)
8163 maxchar_new = 65535;
8164 else
8165 maxchar_new = 1114111; /* 0x10ffff */
8166
8167 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 /* fixfct should return TRUE if it modified the buffer. If
8169 FALSE, return a reference to the original buffer instead
8170 (to save space, not time) */
8171 Py_INCREF(self);
8172 Py_DECREF(u);
8173 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 else if (maxchar_new == maxchar_old) {
8176 return u;
8177 }
8178 else {
8179 /* In case the maximum character changed, we need to
8180 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008181 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 if (v == NULL) {
8183 Py_DECREF(u);
8184 return NULL;
8185 }
8186 if (maxchar_new > maxchar_old) {
8187 /* If the maxchar increased so that the kind changed, not all
8188 characters are representable anymore and we need to fix the
8189 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008190 if (PyUnicode_CopyCharacters(v, 0,
8191 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008192 PyUnicode_GET_LENGTH(self)) < 0)
8193 {
8194 Py_DECREF(u);
8195 return NULL;
8196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 maxchar_old = fixfct((PyUnicodeObject*)v);
8198 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8199 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008200 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008201 if (PyUnicode_CopyCharacters(v, 0,
8202 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008203 PyUnicode_GET_LENGTH(self)) < 0)
8204 {
8205 Py_DECREF(u);
8206 return NULL;
8207 }
8208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209
8210 Py_DECREF(u);
8211 return v;
8212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213}
8214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008215static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008216fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 /* No need to call PyUnicode_READY(self) because this function is only
8219 called as a callback from fixup() which does it already. */
8220 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8221 const int kind = PyUnicode_KIND(self);
8222 void *data = PyUnicode_DATA(self);
8223 int touched = 0;
8224 Py_UCS4 maxchar = 0;
8225 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 for (i = 0; i < len; ++i) {
8228 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8229 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8230 if (up != ch) {
8231 if (up > maxchar)
8232 maxchar = up;
8233 PyUnicode_WRITE(kind, data, i, up);
8234 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 else if (ch > maxchar)
8237 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 }
8239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 if (touched)
8241 return maxchar;
8242 else
8243 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244}
8245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8250 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8251 const int kind = PyUnicode_KIND(self);
8252 void *data = PyUnicode_DATA(self);
8253 int touched = 0;
8254 Py_UCS4 maxchar = 0;
8255 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 for(i = 0; i < len; ++i) {
8258 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8259 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8260 if (lo != ch) {
8261 if (lo > maxchar)
8262 maxchar = lo;
8263 PyUnicode_WRITE(kind, data, i, lo);
8264 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 else if (ch > maxchar)
8267 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 }
8269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 if (touched)
8271 return maxchar;
8272 else
8273 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274}
8275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008277fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008279 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8280 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8281 const int kind = PyUnicode_KIND(self);
8282 void *data = PyUnicode_DATA(self);
8283 int touched = 0;
8284 Py_UCS4 maxchar = 0;
8285 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 for(i = 0; i < len; ++i) {
8288 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8289 Py_UCS4 nu = 0;
8290
8291 if (Py_UNICODE_ISUPPER(ch))
8292 nu = Py_UNICODE_TOLOWER(ch);
8293 else if (Py_UNICODE_ISLOWER(ch))
8294 nu = Py_UNICODE_TOUPPER(ch);
8295
8296 if (nu != 0) {
8297 if (nu > maxchar)
8298 maxchar = nu;
8299 PyUnicode_WRITE(kind, data, i, nu);
8300 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 else if (ch > maxchar)
8303 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 if (touched)
8307 return maxchar;
8308 else
8309 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310}
8311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008313fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8316 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8317 const int kind = PyUnicode_KIND(self);
8318 void *data = PyUnicode_DATA(self);
8319 int touched = 0;
8320 Py_UCS4 maxchar = 0;
8321 Py_ssize_t i = 0;
8322 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008323
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008324 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326
8327 ch = PyUnicode_READ(kind, data, i);
8328 if (!Py_UNICODE_ISUPPER(ch)) {
8329 maxchar = Py_UNICODE_TOUPPER(ch);
8330 PyUnicode_WRITE(kind, data, i, maxchar);
8331 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 ++i;
8334 for(; i < len; ++i) {
8335 ch = PyUnicode_READ(kind, data, i);
8336 if (!Py_UNICODE_ISLOWER(ch)) {
8337 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8338 if (lo > maxchar)
8339 maxchar = lo;
8340 PyUnicode_WRITE(kind, data, i, lo);
8341 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 else if (ch > maxchar)
8344 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346
8347 if (touched)
8348 return maxchar;
8349 else
8350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351}
8352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8357 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8358 const int kind = PyUnicode_KIND(self);
8359 void *data = PyUnicode_DATA(self);
8360 Py_UCS4 maxchar = 0;
8361 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 int previous_is_cased;
8363
8364 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 if (len == 1) {
8366 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8367 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8368 if (ti != ch) {
8369 PyUnicode_WRITE(kind, data, i, ti);
8370 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 }
8372 else
8373 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 for(; i < len; ++i) {
8377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8378 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008379
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 nu = Py_UNICODE_TOTITLE(ch);
8384
8385 if (nu > maxchar)
8386 maxchar = nu;
8387 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008388
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 if (Py_UNICODE_ISLOWER(ch) ||
8390 Py_UNICODE_ISUPPER(ch) ||
8391 Py_UNICODE_ISTITLE(ch))
8392 previous_is_cased = 1;
8393 else
8394 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397}
8398
Tim Peters8ce9f162004-08-27 01:49:32 +00008399PyObject *
8400PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008403 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008405 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008406 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8407 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008408 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 Py_ssize_t sz, i, res_offset;
8410 Py_UCS4 maxchar = 0;
8411 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412
Tim Peters05eba1f2004-08-27 21:32:02 +00008413 fseq = PySequence_Fast(seq, "");
8414 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008415 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008416 }
8417
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008418 /* NOTE: the following code can't call back into Python code,
8419 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008420 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008421
Tim Peters05eba1f2004-08-27 21:32:02 +00008422 seqlen = PySequence_Fast_GET_SIZE(fseq);
8423 /* If empty sequence, return u"". */
8424 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008427 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008428 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008429 /* If singleton sequence with an exact Unicode, return that. */
8430 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 item = items[0];
8432 if (PyUnicode_CheckExact(item)) {
8433 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 goto Done;
8436 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008437 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008438 else {
8439 /* Set up sep and seplen */
8440 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441 /* fall back to a blank space separator */
8442 sep = PyUnicode_FromOrdinal(' ');
8443 if (!sep || PyUnicode_READY(sep) == -1)
8444 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008445 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008446 else {
8447 if (!PyUnicode_Check(separator)) {
8448 PyErr_Format(PyExc_TypeError,
8449 "separator: expected str instance,"
8450 " %.80s found",
8451 Py_TYPE(separator)->tp_name);
8452 goto onError;
8453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 if (PyUnicode_READY(separator) == -1)
8455 goto onError;
8456 sep = separator;
8457 seplen = PyUnicode_GET_LENGTH(separator);
8458 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8459 /* inc refcount to keep this code path symetric with the
8460 above case of a blank separator */
8461 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008462 }
8463 }
8464
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008465 /* There are at least two things to join, or else we have a subclass
8466 * of str in the sequence.
8467 * Do a pre-pass to figure out the total amount of space we'll
8468 * need (sz), and see whether all argument are strings.
8469 */
8470 sz = 0;
8471 for (i = 0; i < seqlen; i++) {
8472 const Py_ssize_t old_sz = sz;
8473 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 if (!PyUnicode_Check(item)) {
8475 PyErr_Format(PyExc_TypeError,
8476 "sequence item %zd: expected str instance,"
8477 " %.80s found",
8478 i, Py_TYPE(item)->tp_name);
8479 goto onError;
8480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 if (PyUnicode_READY(item) == -1)
8482 goto onError;
8483 sz += PyUnicode_GET_LENGTH(item);
8484 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8485 if (item_maxchar > maxchar)
8486 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008487 if (i != 0)
8488 sz += seplen;
8489 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8490 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008492 goto onError;
8493 }
8494 }
Tim Petersced69f82003-09-16 20:30:58 +00008495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008497 if (res == NULL)
8498 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008499
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008500 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008502 Py_ssize_t itemlen;
8503 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 /* Copy item, and maybe the separator. */
8506 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008507 if (PyUnicode_CopyCharacters(res, res_offset,
8508 sep, 0, seplen) < 0)
8509 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008512 if (PyUnicode_CopyCharacters(res, res_offset,
8513 item, 0, itemlen) < 0)
8514 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008518
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008520 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_XDECREF(sep);
8522 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008525 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008527 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 return NULL;
8529}
8530
/* FILL(kind, data, value, start, length)

   Store the code point `value` into `length` consecutive character slots of
   the buffer `data`, beginning at character index `start`.  `kind` selects
   the slot width: PyUnicode_1BYTE_KIND uses bytes, PyUnicode_2BYTE_KIND uses
   Py_UCS2, and any other kind is treated as Py_UCS4.  `kind` must not be
   PyUnicode_WCHAR_KIND.

   Every argument is evaluated exactly once, so expressions with side
   effects are safe to pass.  The locals introduced here all end in an
   underscore; do not pass arguments that use those same names. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int kind_ = (kind); \
        const Py_UCS4 value_ = (Py_UCS4)(value); \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        Py_ssize_t i_; \
        assert(kind_ != PyUnicode_WCHAR_KIND); \
        switch (kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + start_; \
            memset(to_, (unsigned char)value_, (size_t)length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = (Py_UCS2)value_; \
            break; \
        } \
        default: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = value_; \
            break; \
        } \
        } \
    } while (0)
8553
Alexander Belopolsky40018472011-02-26 01:02:56 +00008554static PyUnicodeObject *
8555pad(PyUnicodeObject *self,
8556 Py_ssize_t left,
8557 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 PyObject *u;
8561 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008562 int kind;
8563 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
8565 if (left < 0)
8566 left = 0;
8567 if (right < 0)
8568 right = 0;
8569
Tim Peters7a29bd52001-09-12 03:03:31 +00008570 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 Py_INCREF(self);
8572 return self;
8573 }
8574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8576 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008577 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8578 return NULL;
8579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8581 if (fill > maxchar)
8582 maxchar = fill;
8583 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008584 if (!u)
8585 return NULL;
8586
8587 kind = PyUnicode_KIND(u);
8588 data = PyUnicode_DATA(u);
8589 if (left)
8590 FILL(kind, data, fill, 0, left);
8591 if (right)
8592 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008593 if (PyUnicode_CopyCharacters(u, left,
8594 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008595 _PyUnicode_LENGTH(self)) < 0)
8596 {
8597 Py_DECREF(u);
8598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
8600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605PyObject *
8606PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 switch(PyUnicode_KIND(string)) {
8615 case PyUnicode_1BYTE_KIND:
8616 list = ucs1lib_splitlines(
8617 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8618 PyUnicode_GET_LENGTH(string), keepends);
8619 break;
8620 case PyUnicode_2BYTE_KIND:
8621 list = ucs2lib_splitlines(
8622 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8623 PyUnicode_GET_LENGTH(string), keepends);
8624 break;
8625 case PyUnicode_4BYTE_KIND:
8626 list = ucs4lib_splitlines(
8627 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8628 PyUnicode_GET_LENGTH(string), keepends);
8629 break;
8630 default:
8631 assert(0);
8632 list = 0;
8633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 Py_DECREF(string);
8635 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636}
8637
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638static PyObject *
8639split(PyUnicodeObject *self,
8640 PyUnicodeObject *substring,
8641 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 int kind1, kind2, kind;
8644 void *buf1, *buf2;
8645 Py_ssize_t len1, len2;
8646 PyObject* out;
8647
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008649 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 if (PyUnicode_READY(self) == -1)
8652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 if (substring == NULL)
8655 switch(PyUnicode_KIND(self)) {
8656 case PyUnicode_1BYTE_KIND:
8657 return ucs1lib_split_whitespace(
8658 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8659 PyUnicode_GET_LENGTH(self), maxcount
8660 );
8661 case PyUnicode_2BYTE_KIND:
8662 return ucs2lib_split_whitespace(
8663 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8664 PyUnicode_GET_LENGTH(self), maxcount
8665 );
8666 case PyUnicode_4BYTE_KIND:
8667 return ucs4lib_split_whitespace(
8668 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8669 PyUnicode_GET_LENGTH(self), maxcount
8670 );
8671 default:
8672 assert(0);
8673 return NULL;
8674 }
8675
8676 if (PyUnicode_READY(substring) == -1)
8677 return NULL;
8678
8679 kind1 = PyUnicode_KIND(self);
8680 kind2 = PyUnicode_KIND(substring);
8681 kind = kind1 > kind2 ? kind1 : kind2;
8682 buf1 = PyUnicode_DATA(self);
8683 buf2 = PyUnicode_DATA(substring);
8684 if (kind1 != kind)
8685 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8686 if (!buf1)
8687 return NULL;
8688 if (kind2 != kind)
8689 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8690 if (!buf2) {
8691 if (kind1 != kind) PyMem_Free(buf1);
8692 return NULL;
8693 }
8694 len1 = PyUnicode_GET_LENGTH(self);
8695 len2 = PyUnicode_GET_LENGTH(substring);
8696
8697 switch(kind) {
8698 case PyUnicode_1BYTE_KIND:
8699 out = ucs1lib_split(
8700 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8701 break;
8702 case PyUnicode_2BYTE_KIND:
8703 out = ucs2lib_split(
8704 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8705 break;
8706 case PyUnicode_4BYTE_KIND:
8707 out = ucs4lib_split(
8708 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8709 break;
8710 default:
8711 out = NULL;
8712 }
8713 if (kind1 != kind)
8714 PyMem_Free(buf1);
8715 if (kind2 != kind)
8716 PyMem_Free(buf2);
8717 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720static PyObject *
8721rsplit(PyUnicodeObject *self,
8722 PyUnicodeObject *substring,
8723 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 int kind1, kind2, kind;
8726 void *buf1, *buf2;
8727 Py_ssize_t len1, len2;
8728 PyObject* out;
8729
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008730 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008731 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 if (PyUnicode_READY(self) == -1)
8734 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 if (substring == NULL)
8737 switch(PyUnicode_KIND(self)) {
8738 case PyUnicode_1BYTE_KIND:
8739 return ucs1lib_rsplit_whitespace(
8740 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8741 PyUnicode_GET_LENGTH(self), maxcount
8742 );
8743 case PyUnicode_2BYTE_KIND:
8744 return ucs2lib_rsplit_whitespace(
8745 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8746 PyUnicode_GET_LENGTH(self), maxcount
8747 );
8748 case PyUnicode_4BYTE_KIND:
8749 return ucs4lib_rsplit_whitespace(
8750 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8751 PyUnicode_GET_LENGTH(self), maxcount
8752 );
8753 default:
8754 assert(0);
8755 return NULL;
8756 }
8757
8758 if (PyUnicode_READY(substring) == -1)
8759 return NULL;
8760
8761 kind1 = PyUnicode_KIND(self);
8762 kind2 = PyUnicode_KIND(substring);
8763 kind = kind1 > kind2 ? kind1 : kind2;
8764 buf1 = PyUnicode_DATA(self);
8765 buf2 = PyUnicode_DATA(substring);
8766 if (kind1 != kind)
8767 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
8768 if (!buf1)
8769 return NULL;
8770 if (kind2 != kind)
8771 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
8772 if (!buf2) {
8773 if (kind1 != kind) PyMem_Free(buf1);
8774 return NULL;
8775 }
8776 len1 = PyUnicode_GET_LENGTH(self);
8777 len2 = PyUnicode_GET_LENGTH(substring);
8778
8779 switch(kind) {
8780 case PyUnicode_1BYTE_KIND:
8781 out = ucs1lib_rsplit(
8782 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8783 break;
8784 case PyUnicode_2BYTE_KIND:
8785 out = ucs2lib_rsplit(
8786 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8787 break;
8788 case PyUnicode_4BYTE_KIND:
8789 out = ucs4lib_rsplit(
8790 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
8791 break;
8792 default:
8793 out = NULL;
8794 }
8795 if (kind1 != kind)
8796 PyMem_Free(buf1);
8797 if (kind2 != kind)
8798 PyMem_Free(buf2);
8799 return out;
8800}
8801
8802static Py_ssize_t
8803anylib_find(int kind, void *buf1, Py_ssize_t len1,
8804 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
8805{
8806 switch(kind) {
8807 case PyUnicode_1BYTE_KIND:
8808 return ucs1lib_find(buf1, len1, buf2, len2, offset);
8809 case PyUnicode_2BYTE_KIND:
8810 return ucs2lib_find(buf1, len1, buf2, len2, offset);
8811 case PyUnicode_4BYTE_KIND:
8812 return ucs4lib_find(buf1, len1, buf2, len2, offset);
8813 }
8814 assert(0);
8815 return -1;
8816}
8817
8818static Py_ssize_t
8819anylib_count(int kind, void* sbuf, Py_ssize_t slen,
8820 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
8821{
8822 switch(kind) {
8823 case PyUnicode_1BYTE_KIND:
8824 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
8825 case PyUnicode_2BYTE_KIND:
8826 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
8827 case PyUnicode_4BYTE_KIND:
8828 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
8829 }
8830 assert(0);
8831 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008832}
8833
Alexander Belopolsky40018472011-02-26 01:02:56 +00008834static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835replace(PyObject *self, PyObject *str1,
8836 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 PyObject *u;
8839 char *sbuf = PyUnicode_DATA(self);
8840 char *buf1 = PyUnicode_DATA(str1);
8841 char *buf2 = PyUnicode_DATA(str2);
8842 int srelease = 0, release1 = 0, release2 = 0;
8843 int skind = PyUnicode_KIND(self);
8844 int kind1 = PyUnicode_KIND(str1);
8845 int kind2 = PyUnicode_KIND(str2);
8846 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
8847 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
8848 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849
8850 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008853 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 if (skind < kind1)
8856 /* substring too wide to be present */
8857 goto nothing;
8858
8859 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00008860 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008861 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008863 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008865 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 Py_UCS4 u1, u2, maxchar;
8867 int mayshrink, rkind;
8868 u1 = PyUnicode_READ_CHAR(str1, 0);
8869 if (!findchar(sbuf, PyUnicode_KIND(self),
8870 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00008871 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 u2 = PyUnicode_READ_CHAR(str2, 0);
8873 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8874 /* Replacing u1 with u2 may cause a maxchar reduction in the
8875 result string. */
8876 mayshrink = maxchar > 127;
8877 if (u2 > maxchar) {
8878 maxchar = u2;
8879 mayshrink = 0;
8880 }
8881 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008882 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008884 if (PyUnicode_CopyCharacters(u, 0,
8885 (PyObject*)self, 0, slen) < 0)
8886 {
8887 Py_DECREF(u);
8888 return NULL;
8889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 rkind = PyUnicode_KIND(u);
8891 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
8892 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008893 if (--maxcount < 0)
8894 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 if (mayshrink) {
8898 PyObject *tmp = u;
8899 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
8900 PyUnicode_GET_LENGTH(tmp));
8901 Py_DECREF(tmp);
8902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 int rkind = skind;
8905 char *res;
8906 if (kind1 < rkind) {
8907 /* widen substring */
8908 buf1 = _PyUnicode_AsKind(str1, rkind);
8909 if (!buf1) goto error;
8910 release1 = 1;
8911 }
8912 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008913 if (i < 0)
8914 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 if (rkind > kind2) {
8916 /* widen replacement */
8917 buf2 = _PyUnicode_AsKind(str2, rkind);
8918 if (!buf2) goto error;
8919 release2 = 1;
8920 }
8921 else if (rkind < kind2) {
8922 /* widen self and buf1 */
8923 rkind = kind2;
8924 if (release1) PyMem_Free(buf1);
8925 sbuf = _PyUnicode_AsKind(self, rkind);
8926 if (!sbuf) goto error;
8927 srelease = 1;
8928 buf1 = _PyUnicode_AsKind(str1, rkind);
8929 if (!buf1) goto error;
8930 release1 = 1;
8931 }
8932 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
8933 if (!res) {
8934 PyErr_NoMemory();
8935 goto error;
8936 }
8937 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008938 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8940 buf2,
8941 PyUnicode_KIND_SIZE(rkind, len2));
8942 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008943
8944 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
8946 slen-i,
8947 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008948 if (i == -1)
8949 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
8951 buf2,
8952 PyUnicode_KIND_SIZE(rkind, len2));
8953 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
8956 u = PyUnicode_FromKindAndData(rkind, res, slen);
8957 PyMem_Free(res);
8958 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 Py_ssize_t n, i, j, ires;
8963 Py_ssize_t product, new_size;
8964 int rkind = skind;
8965 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 if (kind1 < rkind) {
8968 buf1 = _PyUnicode_AsKind(str1, rkind);
8969 if (!buf1) goto error;
8970 release1 = 1;
8971 }
8972 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008973 if (n == 0)
8974 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 if (kind2 < rkind) {
8976 buf2 = _PyUnicode_AsKind(str2, rkind);
8977 if (!buf2) goto error;
8978 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 else if (kind2 > rkind) {
8981 rkind = kind2;
8982 sbuf = _PyUnicode_AsKind(self, rkind);
8983 if (!sbuf) goto error;
8984 srelease = 1;
8985 if (release1) PyMem_Free(buf1);
8986 buf1 = _PyUnicode_AsKind(str1, rkind);
8987 if (!buf1) goto error;
8988 release1 = 1;
8989 }
8990 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
8991 PyUnicode_GET_LENGTH(str1))); */
8992 product = n * (len2-len1);
8993 if ((product / (len2-len1)) != n) {
8994 PyErr_SetString(PyExc_OverflowError,
8995 "replace string is too long");
8996 goto error;
8997 }
8998 new_size = slen + product;
8999 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9000 PyErr_SetString(PyExc_OverflowError,
9001 "replace string is too long");
9002 goto error;
9003 }
9004 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9005 if (!res)
9006 goto error;
9007 ires = i = 0;
9008 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009009 while (n-- > 0) {
9010 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 j = anylib_find(rkind,
9012 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9013 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009014 if (j == -1)
9015 break;
9016 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009017 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9019 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9020 PyUnicode_KIND_SIZE(rkind, j-i));
9021 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009022 }
9023 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 if (len2 > 0) {
9025 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9026 buf2,
9027 PyUnicode_KIND_SIZE(rkind, len2));
9028 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9035 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9036 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009037 } else {
9038 /* interleave */
9039 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9041 buf2,
9042 PyUnicode_KIND_SIZE(rkind, len2));
9043 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009044 if (--n <= 0)
9045 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9047 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9048 PyUnicode_KIND_SIZE(rkind, 1));
9049 ires++;
9050 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9053 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9054 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (srelease)
9059 PyMem_FREE(sbuf);
9060 if (release1)
9061 PyMem_FREE(buf1);
9062 if (release2)
9063 PyMem_FREE(buf2);
9064 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009067 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 if (srelease)
9069 PyMem_FREE(sbuf);
9070 if (release1)
9071 PyMem_FREE(buf1);
9072 if (release2)
9073 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009074 if (PyUnicode_CheckExact(self)) {
9075 Py_INCREF(self);
9076 return (PyObject *) self;
9077 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009078 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079 error:
9080 if (srelease && sbuf)
9081 PyMem_FREE(sbuf);
9082 if (release1 && buf1)
9083 PyMem_FREE(buf1);
9084 if (release2 && buf2)
9085 PyMem_FREE(buf2);
9086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087}
9088
9089/* --- Unicode Object Methods --------------------------------------------- */
9090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009091PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093\n\
9094Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009095characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096
9097static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009098unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 return fixup(self, fixtitle);
9101}
9102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009103PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105\n\
9106Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009107have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108
9109static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009110unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112 return fixup(self, fixcapitalize);
9113}
9114
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of a capwords() method.

   Splits self into a list of words, replaces each word by the result of
   fixup(word, fixcapitalize), and joins the list again.  Returns the
   joined string, or NULL if splitting or capitalizing a word fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous entry, so drop
           the old word explicitly before overwriting the slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9152
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009153/* Argument converter. Coerces to a single unicode character */
9154
9155static int
9156convert_uc(PyObject *obj, void *addr)
9157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009159 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009160
Benjamin Peterson14339b62009-01-31 16:36:08 +00009161 uniobj = PyUnicode_FromObject(obj);
9162 if (uniobj == NULL) {
9163 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009165 return 0;
9166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009168 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009170 Py_DECREF(uniobj);
9171 return 0;
9172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (PyUnicode_READY(uniobj)) {
9174 Py_DECREF(uniobj);
9175 return 0;
9176 }
9177 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009178 Py_DECREF(uniobj);
9179 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009180}
9181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009182PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009185Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009186done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187
9188static PyObject *
9189unicode_center(PyUnicodeObject *self, PyObject *args)
9190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009191 Py_ssize_t marg, left;
9192 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 Py_UCS4 fillchar = ' ';
9194
9195 if (PyUnicode_READY(self) == -1)
9196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197
Thomas Woutersde017742006-02-16 19:34:37 +00009198 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199 return NULL;
9200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 Py_INCREF(self);
9203 return (PyObject*) self;
9204 }
9205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 left = marg / 2 + (marg & width & 1);
9208
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009209 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210}
9211
Marc-André Lemburge5034372000-08-08 08:04:29 +00009212#if 0
9213
9214/* This code should go into some future Unicode collation support
9215 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009216 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009217
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009218/* speedy UTF-16 code point order comparison */
9219/* gleaned from: */
9220/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9221
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009222static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009223{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009224 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009225 0, 0, 0, 0, 0, 0, 0, 0,
9226 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009227 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009228};
9229
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230static int
9231unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9232{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009233 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 Py_UNICODE *s1 = str1->str;
9236 Py_UNICODE *s2 = str2->str;
9237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 len1 = str1->_base._base.length;
9239 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009242 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009243
9244 c1 = *s1++;
9245 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009246
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 if (c1 > (1<<11) * 26)
9248 c1 += utf16Fixup[c1>>11];
9249 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009250 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009251 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009252
9253 if (c1 != c2)
9254 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009255
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009256 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257 }
9258
9259 return (len1 < len2) ? -1 : (len1 != len2);
9260}
9261
Marc-André Lemburge5034372000-08-08 08:04:29 +00009262#else
9263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264/* This function assumes that str1 and str2 are readied by the caller. */
9265
Marc-André Lemburge5034372000-08-08 08:04:29 +00009266static int
9267unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 int kind1, kind2;
9270 void *data1, *data2;
9271 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 kind1 = PyUnicode_KIND(str1);
9274 kind2 = PyUnicode_KIND(str2);
9275 data1 = PyUnicode_DATA(str1);
9276 data2 = PyUnicode_DATA(str2);
9277 len1 = PyUnicode_GET_LENGTH(str1);
9278 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 for (i = 0; i < len1 && i < len2; ++i) {
9281 Py_UCS4 c1, c2;
9282 c1 = PyUnicode_READ(kind1, data1, i);
9283 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009284
9285 if (c1 != c2)
9286 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009287 }
9288
9289 return (len1 < len2) ? -1 : (len1 != len2);
9290}
9291
9292#endif
9293
Alexander Belopolsky40018472011-02-26 01:02:56 +00009294int
9295PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9298 if (PyUnicode_READY(left) == -1 ||
9299 PyUnicode_READY(right) == -1)
9300 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009301 return unicode_compare((PyUnicodeObject *)left,
9302 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009304 PyErr_Format(PyExc_TypeError,
9305 "Can't compare %.100s and %.100s",
9306 left->ob_type->tp_name,
9307 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 return -1;
9309}
9310
Martin v. Löwis5b222132007-06-10 09:51:05 +00009311int
9312PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 Py_ssize_t i;
9315 int kind;
9316 void *data;
9317 Py_UCS4 chr;
9318
Martin v. Löwis5b222132007-06-10 09:51:05 +00009319 assert(PyUnicode_Check(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (PyUnicode_READY(uni) == -1)
9321 return -1;
9322 kind = PyUnicode_KIND(uni);
9323 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009324 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9326 if (chr != str[i])
9327 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009328 /* This check keeps Python strings that end in '\0' from comparing equal
9329 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009332 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009334 return 0;
9335}
9336
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009337
Benjamin Peterson29060642009-01-31 22:14:21 +00009338#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009339 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009340
Alexander Belopolsky40018472011-02-26 01:02:56 +00009341PyObject *
9342PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009343{
9344 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009346 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9347 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (PyUnicode_READY(left) == -1 ||
9349 PyUnicode_READY(right) == -1)
9350 return NULL;
9351 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9352 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009353 if (op == Py_EQ) {
9354 Py_INCREF(Py_False);
9355 return Py_False;
9356 }
9357 if (op == Py_NE) {
9358 Py_INCREF(Py_True);
9359 return Py_True;
9360 }
9361 }
9362 if (left == right)
9363 result = 0;
9364 else
9365 result = unicode_compare((PyUnicodeObject *)left,
9366 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009367
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009368 /* Convert the return value to a Boolean */
9369 switch (op) {
9370 case Py_EQ:
9371 v = TEST_COND(result == 0);
9372 break;
9373 case Py_NE:
9374 v = TEST_COND(result != 0);
9375 break;
9376 case Py_LE:
9377 v = TEST_COND(result <= 0);
9378 break;
9379 case Py_GE:
9380 v = TEST_COND(result >= 0);
9381 break;
9382 case Py_LT:
9383 v = TEST_COND(result == -1);
9384 break;
9385 case Py_GT:
9386 v = TEST_COND(result == 1);
9387 break;
9388 default:
9389 PyErr_BadArgument();
9390 return NULL;
9391 }
9392 Py_INCREF(v);
9393 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009395
Brian Curtindfc80e32011-08-10 20:28:54 -05009396 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009397}
9398
Alexander Belopolsky40018472011-02-26 01:02:56 +00009399int
9400PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009401{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009402 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 int kind1, kind2, kind;
9404 void *buf1, *buf2;
9405 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009406 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009407
9408 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009409 sub = PyUnicode_FromObject(element);
9410 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 PyErr_Format(PyExc_TypeError,
9412 "'in <string>' requires string as left operand, not %s",
9413 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009414 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 if (PyUnicode_READY(sub) == -1)
9417 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009418
Thomas Wouters477c8d52006-05-27 19:21:47 +00009419 str = PyUnicode_FromObject(container);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 if (!str || PyUnicode_READY(container) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009421 Py_DECREF(sub);
9422 return -1;
9423 }
9424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 kind1 = PyUnicode_KIND(str);
9426 kind2 = PyUnicode_KIND(sub);
9427 kind = kind1 > kind2 ? kind1 : kind2;
9428 buf1 = PyUnicode_DATA(str);
9429 buf2 = PyUnicode_DATA(sub);
9430 if (kind1 != kind)
9431 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9432 if (!buf1) {
9433 Py_DECREF(sub);
9434 return -1;
9435 }
9436 if (kind2 != kind)
9437 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9438 if (!buf2) {
9439 Py_DECREF(sub);
9440 if (kind1 != kind) PyMem_Free(buf1);
9441 return -1;
9442 }
9443 len1 = PyUnicode_GET_LENGTH(str);
9444 len2 = PyUnicode_GET_LENGTH(sub);
9445
9446 switch(kind) {
9447 case PyUnicode_1BYTE_KIND:
9448 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9452 break;
9453 case PyUnicode_4BYTE_KIND:
9454 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9455 break;
9456 default:
9457 result = -1;
9458 assert(0);
9459 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009460
9461 Py_DECREF(str);
9462 Py_DECREF(sub);
9463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 if (kind1 != kind)
9465 PyMem_Free(buf1);
9466 if (kind2 != kind)
9467 PyMem_Free(buf2);
9468
Guido van Rossum403d68b2000-03-13 15:55:09 +00009469 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009470}
9471
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472/* Concat to string or Unicode object giving a new Unicode object. */
9473
Alexander Belopolsky40018472011-02-26 01:02:56 +00009474PyObject *
9475PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 PyObject *u = NULL, *v = NULL, *w;
9478 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479
9480 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487
9488 /* Shortcuts */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 if (v == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 if (u == (PyObject*)unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
9497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (PyUnicode_READY(u) == -1 || PyUnicode_READY(v) == -1)
9499 goto onError;
9500
9501 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009502 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 w = PyUnicode_New(
9506 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9507 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009510 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9511 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009512 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009513 v, 0,
9514 PyUnicode_GET_LENGTH(v)) < 0)
9515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 Py_DECREF(u);
9517 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521 Py_XDECREF(u);
9522 Py_XDECREF(v);
9523 return NULL;
9524}
9525
Walter Dörwald1ab83302007-05-18 17:15:44 +00009526void
9527PyUnicode_Append(PyObject **pleft, PyObject *right)
9528{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009529 PyObject *new;
9530 if (*pleft == NULL)
9531 return;
9532 if (right == NULL || !PyUnicode_Check(*pleft)) {
9533 Py_DECREF(*pleft);
9534 *pleft = NULL;
9535 return;
9536 }
9537 new = PyUnicode_Concat(*pleft, right);
9538 Py_DECREF(*pleft);
9539 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009540}
9541
9542void
9543PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9544{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009545 PyUnicode_Append(pleft, right);
9546 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009547}
9548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009549PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009552Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009553string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009554interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555
9556static PyObject *
9557unicode_count(PyUnicodeObject *self, PyObject *args)
9558{
9559 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009560 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009561 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 int kind1, kind2, kind;
9564 void *buf1, *buf2;
9565 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566
Jesus Ceaac451502011-04-20 17:09:23 +02009567 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9568 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 kind1 = PyUnicode_KIND(self);
9572 kind2 = PyUnicode_KIND(substring);
9573 kind = kind1 > kind2 ? kind1 : kind2;
9574 buf1 = PyUnicode_DATA(self);
9575 buf2 = PyUnicode_DATA(substring);
9576 if (kind1 != kind)
9577 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9578 if (!buf1) {
9579 Py_DECREF(substring);
9580 return NULL;
9581 }
9582 if (kind2 != kind)
9583 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9584 if (!buf2) {
9585 Py_DECREF(substring);
9586 if (kind1 != kind) PyMem_Free(buf1);
9587 return NULL;
9588 }
9589 len1 = PyUnicode_GET_LENGTH(self);
9590 len2 = PyUnicode_GET_LENGTH(substring);
9591
9592 ADJUST_INDICES(start, end, len1);
9593 switch(kind) {
9594 case PyUnicode_1BYTE_KIND:
9595 iresult = ucs1lib_count(
9596 ((Py_UCS1*)buf1) + start, end - start,
9597 buf2, len2, PY_SSIZE_T_MAX
9598 );
9599 break;
9600 case PyUnicode_2BYTE_KIND:
9601 iresult = ucs2lib_count(
9602 ((Py_UCS2*)buf1) + start, end - start,
9603 buf2, len2, PY_SSIZE_T_MAX
9604 );
9605 break;
9606 case PyUnicode_4BYTE_KIND:
9607 iresult = ucs4lib_count(
9608 ((Py_UCS4*)buf1) + start, end - start,
9609 buf2, len2, PY_SSIZE_T_MAX
9610 );
9611 break;
9612 default:
9613 assert(0); iresult = 0;
9614 }
9615
9616 result = PyLong_FromSsize_t(iresult);
9617
9618 if (kind1 != kind)
9619 PyMem_Free(buf1);
9620 if (kind2 != kind)
9621 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
9623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009624
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 return result;
9626}
9627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009628PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00009629 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00009631Encode S using the codec registered for encoding. Default encoding\n\
9632is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00009633handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009634a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
9635'xmlcharrefreplace' as well as any other name registered with\n\
9636codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637
9638static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00009639unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640{
Benjamin Peterson308d6372009-09-18 21:42:35 +00009641 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 char *encoding = NULL;
9643 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00009644
Benjamin Peterson308d6372009-09-18 21:42:35 +00009645 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
9646 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00009648 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00009649}
9650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009651PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653\n\
9654Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009655If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656
9657static PyObject*
9658unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
9659{
9660 Py_UNICODE *e;
9661 Py_UNICODE *p;
9662 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009663 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 PyUnicodeObject *u;
9666 int tabsize = 8;
9667
9668 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
9672 return NULL;
9673
Thomas Wouters7e474022000-07-16 12:04:32 +00009674 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009675 i = 0; /* chars up to and including most recent \n or \r */
9676 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
9678 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 if (tabsize > 0) {
9681 incr = tabsize - (j % tabsize); /* cannot overflow */
9682 if (j > PY_SSIZE_T_MAX - incr)
9683 goto overflow1;
9684 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 if (j > PY_SSIZE_T_MAX - 1)
9689 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690 j++;
9691 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 if (i > PY_SSIZE_T_MAX - j)
9693 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009695 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696 }
9697 }
9698
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009699 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009701
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 /* Second pass: create output string and fill it */
9703 u = _PyUnicode_New(i + j);
9704 if (!u)
9705 return NULL;
9706
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009707 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 q = _PyUnicode_WSTR(u); /* next output char */
9709 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009713 if (tabsize > 0) {
9714 i = tabsize - (j % tabsize);
9715 j += i;
9716 while (i--) {
9717 if (q >= qe)
9718 goto overflow2;
9719 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009720 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009722 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 else {
9724 if (q >= qe)
9725 goto overflow2;
9726 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009727 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 if (*p == '\n' || *p == '\r')
9729 j = 0;
9730 }
9731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (PyUnicode_READY(u) == -1) {
9733 Py_DECREF(u);
9734 return NULL;
9735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00009737
9738 overflow2:
9739 Py_DECREF(u);
9740 overflow1:
9741 PyErr_SetString(PyExc_OverflowError, "new string is too long");
9742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743}
9744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009745PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747\n\
9748Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08009749such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750arguments start and end are interpreted as in slice notation.\n\
9751\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009752Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753
9754static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756{
Jesus Ceaac451502011-04-20 17:09:23 +02009757 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009758 Py_ssize_t start;
9759 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009760 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
Jesus Ceaac451502011-04-20 17:09:23 +02009762 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
9763 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 if (PyUnicode_READY(self) == -1)
9767 return NULL;
9768 if (PyUnicode_READY(substring) == -1)
9769 return NULL;
9770
9771 result = any_find_slice(
9772 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9773 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775
9776 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 if (result == -2)
9779 return NULL;
9780
Christian Heimes217cfd12007-12-02 14:31:20 +00009781 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782}
9783
9784static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009785unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 Py_UCS4 ch;
9788
9789 if (PyUnicode_READY(self) == -1)
9790 return NULL;
9791 if (index < 0 || index >= _PyUnicode_LENGTH(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792 PyErr_SetString(PyExc_IndexError, "string index out of range");
9793 return NULL;
9794 }
9795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 ch = PyUnicode_READ(PyUnicode_KIND(self), PyUnicode_DATA(self), index);
9797 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798}
9799
Guido van Rossumc2504932007-09-18 19:42:40 +00009800/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +01009801 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00009802static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00009803unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Guido van Rossumc2504932007-09-18 19:42:40 +00009805 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +01009806 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (_PyUnicode_HASH(self) != -1)
9809 return _PyUnicode_HASH(self);
9810 if (PyUnicode_READY(self) == -1)
9811 return -1;
9812 len = PyUnicode_GET_LENGTH(self);
9813
9814 /* The hash function as a macro, gets expanded three times below. */
9815#define HASH(P) \
9816 x = (Py_uhash_t)*P << 7; \
9817 while (--len >= 0) \
9818 x = (1000003*x) ^ (Py_uhash_t)*P++;
9819
9820 switch (PyUnicode_KIND(self)) {
9821 case PyUnicode_1BYTE_KIND: {
9822 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
9823 HASH(c);
9824 break;
9825 }
9826 case PyUnicode_2BYTE_KIND: {
9827 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
9828 HASH(s);
9829 break;
9830 }
9831 default: {
9832 Py_UCS4 *l;
9833 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
9834 "Impossible switch case in unicode_hash");
9835 l = PyUnicode_4BYTE_DATA(self);
9836 HASH(l);
9837 break;
9838 }
9839 }
9840 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
9841
Guido van Rossumc2504932007-09-18 19:42:40 +00009842 if (x == -1)
9843 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +00009845 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009849PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009850 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009852Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853
9854static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009857 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02009858 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00009859 Py_ssize_t start;
9860 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
Jesus Ceaac451502011-04-20 17:09:23 +02009862 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
9863 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (PyUnicode_READY(self) == -1)
9867 return NULL;
9868 if (PyUnicode_READY(substring) == -1)
9869 return NULL;
9870
9871 result = any_find_slice(
9872 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
9873 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00009874 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875
9876 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 if (result == -2)
9879 return NULL;
9880
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 if (result < 0) {
9882 PyErr_SetString(PyExc_ValueError, "substring not found");
9883 return NULL;
9884 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009885
Christian Heimes217cfd12007-12-02 14:31:20 +00009886 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009889PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00009892Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009893at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
9895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009896unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 Py_ssize_t i, length;
9899 int kind;
9900 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 int cased;
9902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (PyUnicode_READY(self) == -1)
9904 return NULL;
9905 length = PyUnicode_GET_LENGTH(self);
9906 kind = PyUnicode_KIND(self);
9907 data = PyUnicode_DATA(self);
9908
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (length == 1)
9911 return PyBool_FromLong(
9912 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009914 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009916 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009917
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 for (i = 0; i < length; i++) {
9920 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009921
Benjamin Peterson29060642009-01-31 22:14:21 +00009922 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
9923 return PyBool_FromLong(0);
9924 else if (!cased && Py_UNICODE_ISLOWER(ch))
9925 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009927 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928}
9929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009930PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009933Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009934at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935
9936static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009937unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_ssize_t i, length;
9940 int kind;
9941 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942 int cased;
9943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (PyUnicode_READY(self) == -1)
9945 return NULL;
9946 length = PyUnicode_GET_LENGTH(self);
9947 kind = PyUnicode_KIND(self);
9948 data = PyUnicode_DATA(self);
9949
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 if (length == 1)
9952 return PyBool_FromLong(
9953 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009955 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00009958
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 for (i = 0; i < length; i++) {
9961 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009962
Benjamin Peterson29060642009-01-31 22:14:21 +00009963 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
9964 return PyBool_FromLong(0);
9965 else if (!cased && Py_UNICODE_ISUPPER(ch))
9966 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00009968 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969}
9970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009971PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00009974Return True if S is a titlecased string and there is at least one\n\
9975character in S, i.e. upper- and titlecase characters may only\n\
9976follow uncased characters and lowercase characters only cased ones.\n\
9977Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
9979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009980unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 Py_ssize_t i, length;
9983 int kind;
9984 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 int cased, previous_is_cased;
9986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 if (PyUnicode_READY(self) == -1)
9988 return NULL;
9989 length = PyUnicode_GET_LENGTH(self);
9990 kind = PyUnicode_KIND(self);
9991 data = PyUnicode_DATA(self);
9992
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 if (length == 1) {
9995 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
9996 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
9997 (Py_UNICODE_ISUPPER(ch) != 0));
9998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010000 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010003
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 cased = 0;
10005 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 for (i = 0; i < length; i++) {
10007 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010008
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10010 if (previous_is_cased)
10011 return PyBool_FromLong(0);
10012 previous_is_cased = 1;
10013 cased = 1;
10014 }
10015 else if (Py_UNICODE_ISLOWER(ch)) {
10016 if (!previous_is_cased)
10017 return PyBool_FromLong(0);
10018 previous_is_cased = 1;
10019 cased = 1;
10020 }
10021 else
10022 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010024 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025}
10026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010027PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010030Return True if all characters in S are whitespace\n\
10031and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032
10033static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010034unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 Py_ssize_t i, length;
10037 int kind;
10038 void *data;
10039
10040 if (PyUnicode_READY(self) == -1)
10041 return NULL;
10042 length = PyUnicode_GET_LENGTH(self);
10043 kind = PyUnicode_KIND(self);
10044 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (length == 1)
10048 return PyBool_FromLong(
10049 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010051 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 for (i = 0; i < length; i++) {
10056 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010057 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010060 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061}
10062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010063PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010065\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010066Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010067and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010068
10069static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010070unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_ssize_t i, length;
10073 int kind;
10074 void *data;
10075
10076 if (PyUnicode_READY(self) == -1)
10077 return NULL;
10078 length = PyUnicode_GET_LENGTH(self);
10079 kind = PyUnicode_KIND(self);
10080 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010081
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 if (length == 1)
10084 return PyBool_FromLong(
10085 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010086
10087 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010089 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 for (i = 0; i < length; i++) {
10092 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010093 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010094 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010095 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010096}
10097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010098PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010100\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010101Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010102and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010103
10104static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010105unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 int kind;
10108 void *data;
10109 Py_ssize_t len, i;
10110
10111 if (PyUnicode_READY(self) == -1)
10112 return NULL;
10113
10114 kind = PyUnicode_KIND(self);
10115 data = PyUnicode_DATA(self);
10116 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010117
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010118 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (len == 1) {
10120 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10121 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10122 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010123
10124 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 for (i = 0; i < len; i++) {
10129 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010130 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010132 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010133 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010134}
10135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010136PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010139Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010140False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010143unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 Py_ssize_t i, length;
10146 int kind;
10147 void *data;
10148
10149 if (PyUnicode_READY(self) == -1)
10150 return NULL;
10151 length = PyUnicode_GET_LENGTH(self);
10152 kind = PyUnicode_KIND(self);
10153 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (length == 1)
10157 return PyBool_FromLong(
10158 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010160 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 for (i = 0; i < length; i++) {
10165 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010168 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169}
10170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010171PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010172 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010174Return True if all characters in S are digits\n\
10175and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176
10177static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010178unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 Py_ssize_t i, length;
10181 int kind;
10182 void *data;
10183
10184 if (PyUnicode_READY(self) == -1)
10185 return NULL;
10186 length = PyUnicode_GET_LENGTH(self);
10187 kind = PyUnicode_KIND(self);
10188 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (length == 1) {
10192 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10193 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010196 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010198 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 for (i = 0; i < length; i++) {
10201 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010204 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205}
10206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010207PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010208 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010210Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010211False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
10213static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010214unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 Py_ssize_t i, length;
10217 int kind;
10218 void *data;
10219
10220 if (PyUnicode_READY(self) == -1)
10221 return NULL;
10222 length = PyUnicode_GET_LENGTH(self);
10223 kind = PyUnicode_KIND(self);
10224 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (length == 1)
10228 return PyBool_FromLong(
10229 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010231 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 for (i = 0; i < length; i++) {
10236 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010237 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010239 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240}
10241
Martin v. Löwis47383402007-08-15 07:32:56 +000010242int
10243PyUnicode_IsIdentifier(PyObject *self)
10244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 int kind;
10246 void *data;
10247 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010248 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (PyUnicode_READY(self) == -1) {
10251 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010252 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 }
10254
10255 /* Special case for empty strings */
10256 if (PyUnicode_GET_LENGTH(self) == 0)
10257 return 0;
10258 kind = PyUnicode_KIND(self);
10259 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010260
10261 /* PEP 3131 says that the first character must be in
10262 XID_Start and subsequent characters in XID_Continue,
10263 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010264 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010265 letters, digits, underscore). However, given the current
10266 definition of XID_Start and XID_Continue, it is sufficient
10267 to check just for these, except that _ must be allowed
10268 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010270 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010271 return 0;
10272
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010273 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010275 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010276 return 1;
10277}
10278
10279PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010281\n\
10282Return True if S is a valid identifier according\n\
10283to the language definition.");
10284
10285static PyObject*
10286unicode_isidentifier(PyObject *self)
10287{
10288 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10289}
10290
Georg Brandl559e5d72008-06-11 18:37:52 +000010291PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010292 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010293\n\
10294Return True if all characters in S are considered\n\
10295printable in repr() or S is empty, False otherwise.");
10296
10297static PyObject*
10298unicode_isprintable(PyObject *self)
10299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 Py_ssize_t i, length;
10301 int kind;
10302 void *data;
10303
10304 if (PyUnicode_READY(self) == -1)
10305 return NULL;
10306 length = PyUnicode_GET_LENGTH(self);
10307 kind = PyUnicode_KIND(self);
10308 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010309
10310 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (length == 1)
10312 return PyBool_FromLong(
10313 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 for (i = 0; i < length; i++) {
10316 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010317 Py_RETURN_FALSE;
10318 }
10319 }
10320 Py_RETURN_TRUE;
10321}
10322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010323PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010324 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325\n\
10326Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010327iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328
10329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010330unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010332 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333}
10334
Martin v. Löwis18e16552006-02-15 17:27:45 +000010335static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336unicode_length(PyUnicodeObject *self)
10337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (PyUnicode_READY(self) == -1)
10339 return -1;
10340 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341}
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010346Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010347done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348
10349static PyObject *
10350unicode_ljust(PyUnicodeObject *self, PyObject *args)
10351{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010352 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_UCS4 fillchar = ' ';
10354
10355 if (PyUnicode_READY(self) == -1)
10356 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010357
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010358 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359 return NULL;
10360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 Py_INCREF(self);
10363 return (PyObject*) self;
10364 }
10365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367}
10368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010372Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373
10374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010375unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377 return fixup(self, fixlower);
10378}
10379
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10388
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010389/* externally visible for str.strip(unicode) */
10390PyObject *
10391_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 void *data;
10394 int kind;
10395 Py_ssize_t i, j, len;
10396 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10399 return NULL;
10400
10401 kind = PyUnicode_KIND(self);
10402 data = PyUnicode_DATA(self);
10403 len = PyUnicode_GET_LENGTH(self);
10404 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10405 PyUnicode_DATA(sepobj),
10406 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010407
Benjamin Peterson14339b62009-01-31 16:36:08 +000010408 i = 0;
10409 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 while (i < len &&
10411 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010412 i++;
10413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010414 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010415
Benjamin Peterson14339b62009-01-31 16:36:08 +000010416 j = len;
10417 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 do {
10419 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 } while (j >= i &&
10421 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010424
Benjamin Peterson14339b62009-01-31 16:36:08 +000010425 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 Py_INCREF(self);
10427 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010428 }
10429 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 return PyUnicode_Substring((PyObject*)self, i, j);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010431}
10432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433/* Assumes an already ready self string. */
10434
10435static PyObject *
10436substring(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t len)
10437{
10438 const int kind = PyUnicode_KIND(self);
10439 void *data = PyUnicode_DATA(self);
10440 Py_UCS4 maxchar = 0;
10441 Py_ssize_t i;
10442 PyObject *unicode;
10443
10444 if (start < 0 || len < 0 || (start + len) > PyUnicode_GET_LENGTH(self)) {
10445 PyErr_BadInternalCall();
10446 return NULL;
10447 }
10448
10449 if (len == PyUnicode_GET_LENGTH(self) && PyUnicode_CheckExact(self)) {
10450 Py_INCREF(self);
10451 return (PyObject*)self;
10452 }
10453
10454 for (i = 0; i < len; ++i) {
10455 const Py_UCS4 ch = PyUnicode_READ(kind, data, start + i);
10456 if (ch > maxchar)
10457 maxchar = ch;
10458 }
10459
10460 unicode = PyUnicode_New(len, maxchar);
10461 if (unicode == NULL)
10462 return NULL;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010463 if (PyUnicode_CopyCharacters(unicode, 0,
10464 (PyObject*)self, start, len) < 0)
10465 {
10466 Py_DECREF(unicode);
10467 return NULL;
10468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 return unicode;
10470}
10471
10472PyObject*
10473PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10474{
10475 unsigned char *data;
10476 int kind;
10477
10478 if (start == 0 && end == PyUnicode_GET_LENGTH(self)
10479 && PyUnicode_CheckExact(self))
10480 {
10481 Py_INCREF(self);
10482 return (PyObject *)self;
10483 }
10484
10485 if ((end - start) == 1)
10486 return unicode_getitem((PyUnicodeObject*)self, start);
10487
10488 if (PyUnicode_READY(self) == -1)
10489 return NULL;
10490 kind = PyUnicode_KIND(self);
10491 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010492 return PyUnicode_FromKindAndData(kind,
10493 data + PyUnicode_KIND_SIZE(kind, start),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 end-start);
10495}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
10497static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010498do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 int kind;
10501 void *data;
10502 Py_ssize_t len, i, j;
10503
10504 if (PyUnicode_READY(self) == -1)
10505 return NULL;
10506
10507 kind = PyUnicode_KIND(self);
10508 data = PyUnicode_DATA(self);
10509 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010510
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511 i = 0;
10512 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010514 i++;
10515 }
10516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010517
Benjamin Peterson14339b62009-01-31 16:36:08 +000010518 j = len;
10519 if (striptype != LEFTSTRIP) {
10520 do {
10521 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 j++;
10524 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010525
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
10527 Py_INCREF(self);
10528 return (PyObject*)self;
10529 }
10530 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 return substring(self, i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532}
10533
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010534
10535static PyObject *
10536do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10537{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010538 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010539
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10541 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010542
Benjamin Peterson14339b62009-01-31 16:36:08 +000010543 if (sep != NULL && sep != Py_None) {
10544 if (PyUnicode_Check(sep))
10545 return _PyUnicode_XStrip(self, striptype, sep);
10546 else {
10547 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010548 "%s arg must be None or str",
10549 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010550 return NULL;
10551 }
10552 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010553
Benjamin Peterson14339b62009-01-31 16:36:08 +000010554 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010555}
10556
10557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010558PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010559 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010560\n\
10561Return a copy of the string S with leading and trailing\n\
10562whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010563If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010564
10565static PyObject *
10566unicode_strip(PyUnicodeObject *self, PyObject *args)
10567{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010568 if (PyTuple_GET_SIZE(args) == 0)
10569 return do_strip(self, BOTHSTRIP); /* Common case */
10570 else
10571 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010572}
10573
10574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010575PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010577\n\
10578Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010579If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010580
10581static PyObject *
10582unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10583{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584 if (PyTuple_GET_SIZE(args) == 0)
10585 return do_strip(self, LEFTSTRIP); /* Common case */
10586 else
10587 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010588}
10589
10590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010591PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010593\n\
10594Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010595If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010596
10597static PyObject *
10598unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10599{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010600 if (PyTuple_GET_SIZE(args) == 0)
10601 return do_strip(self, RIGHTSTRIP); /* Common case */
10602 else
10603 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010604}
10605
10606
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010608unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609{
10610 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 Py_ssize_t nchars, n;
10612 size_t nbytes, char_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
Georg Brandl222de0f2009-04-12 12:01:50 +000010614 if (len < 1) {
10615 Py_INCREF(unicode_empty);
10616 return (PyObject *)unicode_empty;
10617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618
Tim Peters7a29bd52001-09-12 03:03:31 +000010619 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 /* no repeat, return original string */
10621 Py_INCREF(str);
10622 return (PyObject*) str;
10623 }
Tim Peters8f422462000-09-09 06:13:41 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (PyUnicode_READY(str) == -1)
10626 return NULL;
10627
Tim Peters8f422462000-09-09 06:13:41 +000010628 /* ensure # of chars needed doesn't overflow int and # of bytes
10629 * needed doesn't overflow size_t
10630 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 nchars = len * PyUnicode_GET_LENGTH(str);
10632 if (nchars / len != PyUnicode_GET_LENGTH(str)) {
Tim Peters8f422462000-09-09 06:13:41 +000010633 PyErr_SetString(PyExc_OverflowError,
10634 "repeated string is too long");
10635 return NULL;
10636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 char_size = PyUnicode_CHARACTER_SIZE(str);
10638 nbytes = (nchars + 1) * char_size;
10639 if (nbytes / char_size != (size_t)(nchars + 1)) {
Tim Peters8f422462000-09-09 06:13:41 +000010640 PyErr_SetString(PyExc_OverflowError,
10641 "repeated string is too long");
10642 return NULL;
10643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 if (!u)
10646 return NULL;
10647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_GET_LENGTH(str) == 1) {
10649 const int kind = PyUnicode_KIND(str);
10650 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10651 void *to = PyUnicode_DATA(u);
10652 for (n = 0; n < len; ++n)
10653 PyUnicode_WRITE(kind, to, n, fill_char);
10654 }
10655 else {
10656 /* number of characters copied this far */
10657 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10658 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10659 char *to = (char *) PyUnicode_DATA(u);
10660 Py_MEMCPY(to, PyUnicode_DATA(str),
10661 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 n = (done <= nchars-done) ? done : nchars-done;
10664 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667 }
10668
10669 return (PyObject*) u;
10670}
10671
Alexander Belopolsky40018472011-02-26 01:02:56 +000010672PyObject *
10673PyUnicode_Replace(PyObject *obj,
10674 PyObject *subobj,
10675 PyObject *replobj,
10676 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677{
10678 PyObject *self;
10679 PyObject *str1;
10680 PyObject *str2;
10681 PyObject *result;
10682
10683 self = PyUnicode_FromObject(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (self == NULL || PyUnicode_READY(obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 str1 = PyUnicode_FromObject(subobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (str1 == NULL || PyUnicode_READY(obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 Py_DECREF(self);
10689 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 }
10691 str2 = PyUnicode_FromObject(replobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (str2 == NULL || PyUnicode_READY(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 Py_DECREF(self);
10694 Py_DECREF(str1);
10695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 Py_DECREF(self);
10699 Py_DECREF(str1);
10700 Py_DECREF(str2);
10701 return result;
10702}
10703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010704PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000010705 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706\n\
10707Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000010708old replaced by new. If the optional argument count is\n\
10709given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 PyObject *str1;
10715 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010716 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 PyObject *result;
10718
Martin v. Löwis18e16552006-02-15 17:27:45 +000010719 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 str1 = PyUnicode_FromObject(str1);
10724 if (str1 == NULL || PyUnicode_READY(str1) == -1)
10725 return NULL;
10726 str2 = PyUnicode_FromObject(str2);
10727 if (str2 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 Py_DECREF(str1);
10729 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000010730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
10732 result = replace(self, str1, str2, maxcount);
10733
10734 Py_DECREF(str1);
10735 Py_DECREF(str2);
10736 return result;
10737}
10738
Alexander Belopolsky40018472011-02-26 01:02:56 +000010739static PyObject *
10740unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741{
Walter Dörwald79e913e2007-05-12 11:08:06 +000010742 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 Py_ssize_t isize;
10744 Py_ssize_t osize, squote, dquote, i, o;
10745 Py_UCS4 max, quote;
10746 int ikind, okind;
10747 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000010748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000010750 return NULL;
10751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 isize = PyUnicode_GET_LENGTH(unicode);
10753 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 /* Compute length of output, quote characters, and
10756 maximum character */
10757 osize = 2; /* quotes */
10758 max = 127;
10759 squote = dquote = 0;
10760 ikind = PyUnicode_KIND(unicode);
10761 for (i = 0; i < isize; i++) {
10762 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
10763 switch (ch) {
10764 case '\'': squote++; osize++; break;
10765 case '"': dquote++; osize++; break;
10766 case '\\': case '\t': case '\r': case '\n':
10767 osize += 2; break;
10768 default:
10769 /* Fast-path ASCII */
10770 if (ch < ' ' || ch == 0x7f)
10771 osize += 4; /* \xHH */
10772 else if (ch < 0x7f)
10773 osize++;
10774 else if (Py_UNICODE_ISPRINTABLE(ch)) {
10775 osize++;
10776 max = ch > max ? ch : max;
10777 }
10778 else if (ch < 0x100)
10779 osize += 4; /* \xHH */
10780 else if (ch < 0x10000)
10781 osize += 6; /* \uHHHH */
10782 else
10783 osize += 10; /* \uHHHHHHHH */
10784 }
10785 }
10786
10787 quote = '\'';
10788 if (squote) {
10789 if (dquote)
10790 /* Both squote and dquote present. Use squote,
10791 and escape them */
10792 osize += squote;
10793 else
10794 quote = '"';
10795 }
10796
10797 repr = PyUnicode_New(osize, max);
10798 if (repr == NULL)
10799 return NULL;
10800 okind = PyUnicode_KIND(repr);
10801 odata = PyUnicode_DATA(repr);
10802
10803 PyUnicode_WRITE(okind, odata, 0, quote);
10804 PyUnicode_WRITE(okind, odata, osize-1, quote);
10805
10806 for (i = 0, o = 1; i < isize; i++) {
10807 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010808
10809 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if ((ch == quote) || (ch == '\\')) {
10811 PyUnicode_WRITE(okind, odata, o++, '\\');
10812 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010813 continue;
10814 }
10815
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010817 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 PyUnicode_WRITE(okind, odata, o++, '\\');
10819 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010820 }
10821 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 PyUnicode_WRITE(okind, odata, o++, '\\');
10823 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010824 }
10825 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 PyUnicode_WRITE(okind, odata, o++, '\\');
10827 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000010828 }
10829
10830 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000010831 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 PyUnicode_WRITE(okind, odata, o++, '\\');
10833 PyUnicode_WRITE(okind, odata, o++, 'x');
10834 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10835 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000010836 }
10837
Georg Brandl559e5d72008-06-11 18:37:52 +000010838 /* Copy ASCII characters as-is */
10839 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010841 }
10842
Benjamin Peterson29060642009-01-31 22:14:21 +000010843 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000010844 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010845 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000010846 (categories Z* and C* except ASCII space)
10847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010849 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (ch <= 0xff) {
10851 PyUnicode_WRITE(okind, odata, o++, '\\');
10852 PyUnicode_WRITE(okind, odata, o++, 'x');
10853 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
10854 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010855 }
10856 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 else if (ch >= 0x10000) {
10858 PyUnicode_WRITE(okind, odata, o++, '\\');
10859 PyUnicode_WRITE(okind, odata, o++, 'U');
10860 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
10861 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
10862 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
10863 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
10864 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10865 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10866 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10867 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010868 }
10869 /* Map 16-bit characters to '\uxxxx' */
10870 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 PyUnicode_WRITE(okind, odata, o++, '\\');
10872 PyUnicode_WRITE(okind, odata, o++, 'u');
10873 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
10874 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
10875 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
10876 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000010877 }
10878 }
10879 /* Copy characters as-is */
10880 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000010882 }
10883 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000010884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000010886 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887}
10888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010889PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891\n\
10892Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010893such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894arguments start and end are interpreted as in slice notation.\n\
10895\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010896Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
10898static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900{
Jesus Ceaac451502011-04-20 17:09:23 +020010901 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010902 Py_ssize_t start;
10903 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
Jesus Ceaac451502011-04-20 17:09:23 +020010906 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
10907 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (PyUnicode_READY(self) == -1)
10911 return NULL;
10912 if (PyUnicode_READY(substring) == -1)
10913 return NULL;
10914
10915 result = any_find_slice(
10916 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10917 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010918 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
10920 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 if (result == -2)
10923 return NULL;
10924
Christian Heimes217cfd12007-12-02 14:31:20 +000010925 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926}
10927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010928PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010929 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010931Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
10933static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935{
Jesus Ceaac451502011-04-20 17:09:23 +020010936 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010937 Py_ssize_t start;
10938 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010939 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
Jesus Ceaac451502011-04-20 17:09:23 +020010941 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
10942 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 if (PyUnicode_READY(self) == -1)
10946 return NULL;
10947 if (PyUnicode_READY(substring) == -1)
10948 return NULL;
10949
10950 result = any_find_slice(
10951 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
10952 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010953 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
10955 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (result == -2)
10958 return NULL;
10959
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 if (result < 0) {
10961 PyErr_SetString(PyExc_ValueError, "substring not found");
10962 return NULL;
10963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964
Christian Heimes217cfd12007-12-02 14:31:20 +000010965 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966}
10967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010968PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010971Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010972done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
10974static PyObject *
10975unicode_rjust(PyUnicodeObject *self, PyObject *args)
10976{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010977 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 Py_UCS4 fillchar = ' ';
10979
10980 if (PyUnicode_READY(self) == -1)
10981 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010982
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010983 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984 return NULL;
10985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 Py_INCREF(self);
10988 return (PyObject*) self;
10989 }
10990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992}
10993
Alexander Belopolsky40018472011-02-26 01:02:56 +000010994PyObject *
10995PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
10997 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000010998
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999 s = PyUnicode_FromObject(s);
11000 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011001 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 if (sep != NULL) {
11003 sep = PyUnicode_FromObject(sep);
11004 if (sep == NULL) {
11005 Py_DECREF(s);
11006 return NULL;
11007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 }
11009
11010 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11011
11012 Py_DECREF(s);
11013 Py_XDECREF(sep);
11014 return result;
11015}
11016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011017PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019\n\
11020Return a list of the words in S, using sep as the\n\
11021delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011022splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011023whitespace string is a separator and empty strings are\n\
11024removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025
11026static PyObject*
11027unicode_split(PyUnicodeObject *self, PyObject *args)
11028{
11029 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011030 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
Martin v. Löwis18e16552006-02-15 17:27:45 +000011032 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 return NULL;
11034
11035 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041}
11042
Thomas Wouters477c8d52006-05-27 19:21:47 +000011043PyObject *
11044PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11045{
11046 PyObject* str_obj;
11047 PyObject* sep_obj;
11048 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 int kind1, kind2, kind;
11050 void *buf1 = NULL, *buf2 = NULL;
11051 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011052
11053 str_obj = PyUnicode_FromObject(str_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 if (!str_obj || PyUnicode_READY(str_in) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011058 Py_DECREF(str_obj);
11059 return NULL;
11060 }
11061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 kind1 = PyUnicode_KIND(str_in);
11063 kind2 = PyUnicode_KIND(sep_obj);
11064 kind = kind1 > kind2 ? kind1 : kind2;
11065 buf1 = PyUnicode_DATA(str_in);
11066 if (kind1 != kind)
11067 buf1 = _PyUnicode_AsKind(str_in, kind);
11068 if (!buf1)
11069 goto onError;
11070 buf2 = PyUnicode_DATA(sep_obj);
11071 if (kind2 != kind)
11072 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11073 if (!buf2)
11074 goto onError;
11075 len1 = PyUnicode_GET_LENGTH(str_obj);
11076 len2 = PyUnicode_GET_LENGTH(sep_obj);
11077
11078 switch(PyUnicode_KIND(str_in)) {
11079 case PyUnicode_1BYTE_KIND:
11080 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11081 break;
11082 case PyUnicode_2BYTE_KIND:
11083 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11084 break;
11085 case PyUnicode_4BYTE_KIND:
11086 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11087 break;
11088 default:
11089 assert(0);
11090 out = 0;
11091 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
11093 Py_DECREF(sep_obj);
11094 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (kind1 != kind)
11096 PyMem_Free(buf1);
11097 if (kind2 != kind)
11098 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099
11100 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 onError:
11102 Py_DECREF(sep_obj);
11103 Py_DECREF(str_obj);
11104 if (kind1 != kind && buf1)
11105 PyMem_Free(buf1);
11106 if (kind2 != kind && buf2)
11107 PyMem_Free(buf2);
11108 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011109}
11110
11111
11112PyObject *
11113PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11114{
11115 PyObject* str_obj;
11116 PyObject* sep_obj;
11117 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 int kind1, kind2, kind;
11119 void *buf1 = NULL, *buf2 = NULL;
11120 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121
11122 str_obj = PyUnicode_FromObject(str_in);
11123 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011124 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011125 sep_obj = PyUnicode_FromObject(sep_in);
11126 if (!sep_obj) {
11127 Py_DECREF(str_obj);
11128 return NULL;
11129 }
11130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 kind1 = PyUnicode_KIND(str_in);
11132 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011133 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 buf1 = PyUnicode_DATA(str_in);
11135 if (kind1 != kind)
11136 buf1 = _PyUnicode_AsKind(str_in, kind);
11137 if (!buf1)
11138 goto onError;
11139 buf2 = PyUnicode_DATA(sep_obj);
11140 if (kind2 != kind)
11141 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11142 if (!buf2)
11143 goto onError;
11144 len1 = PyUnicode_GET_LENGTH(str_obj);
11145 len2 = PyUnicode_GET_LENGTH(sep_obj);
11146
11147 switch(PyUnicode_KIND(str_in)) {
11148 case PyUnicode_1BYTE_KIND:
11149 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11150 break;
11151 case PyUnicode_2BYTE_KIND:
11152 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11153 break;
11154 case PyUnicode_4BYTE_KIND:
11155 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11156 break;
11157 default:
11158 assert(0);
11159 out = 0;
11160 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011161
11162 Py_DECREF(sep_obj);
11163 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (kind1 != kind)
11165 PyMem_Free(buf1);
11166 if (kind2 != kind)
11167 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168
11169 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 onError:
11171 Py_DECREF(sep_obj);
11172 Py_DECREF(str_obj);
11173 if (kind1 != kind && buf1)
11174 PyMem_Free(buf1);
11175 if (kind2 != kind && buf2)
11176 PyMem_Free(buf2);
11177 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178}
11179
11180PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011181 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011183Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011184the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011185found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186
11187static PyObject*
11188unicode_partition(PyUnicodeObject *self, PyObject *separator)
11189{
11190 return PyUnicode_Partition((PyObject *)self, separator);
11191}
11192
11193PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011194 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011195\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011196Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011197the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011198separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011199
11200static PyObject*
11201unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11202{
11203 return PyUnicode_RPartition((PyObject *)self, separator);
11204}
11205
Alexander Belopolsky40018472011-02-26 01:02:56 +000011206PyObject *
11207PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011208{
11209 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011210
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011211 s = PyUnicode_FromObject(s);
11212 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011213 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 if (sep != NULL) {
11215 sep = PyUnicode_FromObject(sep);
11216 if (sep == NULL) {
11217 Py_DECREF(s);
11218 return NULL;
11219 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011220 }
11221
11222 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11223
11224 Py_DECREF(s);
11225 Py_XDECREF(sep);
11226 return result;
11227}
11228
11229PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011231\n\
11232Return a list of the words in S, using sep as the\n\
11233delimiter string, starting at the end of the string and\n\
11234working to the front. If maxsplit is given, at most maxsplit\n\
11235splits are done. If sep is not specified, any whitespace string\n\
11236is a separator.");
11237
11238static PyObject*
11239unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11240{
11241 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011242 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011243
Martin v. Löwis18e16552006-02-15 17:27:45 +000011244 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011245 return NULL;
11246
11247 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011249 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011251 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011253}
11254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011255PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257\n\
11258Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011259Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011263unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011265 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011266 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011268 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11269 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 return NULL;
11271
Guido van Rossum86662912000-04-11 15:38:46 +000011272 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273}
11274
11275static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011276PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277{
Walter Dörwald346737f2007-05-31 10:44:43 +000011278 if (PyUnicode_CheckExact(self)) {
11279 Py_INCREF(self);
11280 return self;
11281 } else
11282 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011283 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284}
11285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011286PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288\n\
11289Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011290and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011293unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 return fixup(self, fixswapcase);
11296}
11297
Georg Brandlceee0772007-11-27 23:48:05 +000011298PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011300\n\
11301Return a translation table usable for str.translate().\n\
11302If there is only one argument, it must be a dictionary mapping Unicode\n\
11303ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011304Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011305If there are two arguments, they must be strings of equal length, and\n\
11306in the resulting dictionary, each character in x will be mapped to the\n\
11307character at the same position in y. If there is a third argument, it\n\
11308must be a string, whose characters will be mapped to None in the result.");
11309
11310static PyObject*
11311unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11312{
11313 PyObject *x, *y = NULL, *z = NULL;
11314 PyObject *new = NULL, *key, *value;
11315 Py_ssize_t i = 0;
11316 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011317
Georg Brandlceee0772007-11-27 23:48:05 +000011318 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11319 return NULL;
11320 new = PyDict_New();
11321 if (!new)
11322 return NULL;
11323 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 int x_kind, y_kind, z_kind;
11325 void *x_data, *y_data, *z_data;
11326
Georg Brandlceee0772007-11-27 23:48:05 +000011327 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011328 if (!PyUnicode_Check(x)) {
11329 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11330 "be a string if there is a second argument");
11331 goto err;
11332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011334 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11335 "arguments must have equal length");
11336 goto err;
11337 }
11338 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 x_kind = PyUnicode_KIND(x);
11340 y_kind = PyUnicode_KIND(y);
11341 x_data = PyUnicode_DATA(x);
11342 y_data = PyUnicode_DATA(y);
11343 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11344 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11345 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011346 if (!key || !value)
11347 goto err;
11348 res = PyDict_SetItem(new, key, value);
11349 Py_DECREF(key);
11350 Py_DECREF(value);
11351 if (res < 0)
11352 goto err;
11353 }
11354 /* create entries for deleting chars in z */
11355 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 z_kind = PyUnicode_KIND(z);
11357 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011358 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011360 if (!key)
11361 goto err;
11362 res = PyDict_SetItem(new, key, Py_None);
11363 Py_DECREF(key);
11364 if (res < 0)
11365 goto err;
11366 }
11367 }
11368 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 int kind;
11370 void *data;
11371
Georg Brandlceee0772007-11-27 23:48:05 +000011372 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011373 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011374 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11375 "to maketrans it must be a dict");
11376 goto err;
11377 }
11378 /* copy entries into the new dict, converting string keys to int keys */
11379 while (PyDict_Next(x, &i, &key, &value)) {
11380 if (PyUnicode_Check(key)) {
11381 /* convert string keys to integer keys */
11382 PyObject *newkey;
11383 if (PyUnicode_GET_SIZE(key) != 1) {
11384 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11385 "table must be of length 1");
11386 goto err;
11387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 kind = PyUnicode_KIND(key);
11389 data = PyUnicode_DATA(key);
11390 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011391 if (!newkey)
11392 goto err;
11393 res = PyDict_SetItem(new, newkey, value);
11394 Py_DECREF(newkey);
11395 if (res < 0)
11396 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011397 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011398 /* just keep integer keys */
11399 if (PyDict_SetItem(new, key, value) < 0)
11400 goto err;
11401 } else {
11402 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11403 "be strings or integers");
11404 goto err;
11405 }
11406 }
11407 }
11408 return new;
11409 err:
11410 Py_DECREF(new);
11411 return NULL;
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
11417Return a copy of the string S, where all characters have been mapped\n\
11418through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011419Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011420Unmapped characters are left untouched. Characters mapped to None\n\
11421are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
11423static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
11434static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011435unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 return fixup(self, fixupper);
11438}
11439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011440PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011443Pad a numeric string S with zeros on the left, to fill a field\n\
11444of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
11446static PyObject *
11447unicode_zfill(PyUnicodeObject *self, PyObject *args)
11448{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011449 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011451 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 int kind;
11453 void *data;
11454 Py_UCS4 chr;
11455
11456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458
Martin v. Löwis18e16552006-02-15 17:27:45 +000011459 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 return NULL;
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011463 if (PyUnicode_CheckExact(self)) {
11464 Py_INCREF(self);
11465 return (PyObject*) self;
11466 }
11467 else
Victor Stinner034f6cf2011-09-30 02:26:44 +020011468 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 }
11470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
11473 u = pad(self, fill, 0, '0');
11474
Walter Dörwald068325e2002-04-15 13:36:47 +000011475 if (u == NULL)
11476 return NULL;
11477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 kind = PyUnicode_KIND(u);
11479 data = PyUnicode_DATA(u);
11480 chr = PyUnicode_READ(kind, data, fill);
11481
11482 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 PyUnicode_WRITE(kind, data, 0, chr);
11485 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
11487
11488 return (PyObject*) u;
11489}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
#if 0
/* Compiled out by the surrounding "#if 0".  When enabled, this is a thin
   wrapper that returns PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011499PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011502Return True if S starts with the specified prefix, False otherwise.\n\
11503With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011504With optional end, stop comparing S at that position.\n\
11505prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject *
11508unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011511 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011513 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011514 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011515 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Jesus Ceaac451502011-04-20 17:09:23 +020011517 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011519 if (PyTuple_Check(subobj)) {
11520 Py_ssize_t i;
11521 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11522 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011524 if (substring == NULL)
11525 return NULL;
11526 result = tailmatch(self, substring, start, end, -1);
11527 Py_DECREF(substring);
11528 if (result) {
11529 Py_RETURN_TRUE;
11530 }
11531 }
11532 /* nothing matched */
11533 Py_RETURN_FALSE;
11534 }
11535 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011536 if (substring == NULL) {
11537 if (PyErr_ExceptionMatches(PyExc_TypeError))
11538 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11539 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011542 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011544 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545}
11546
11547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011548PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011551Return True if S ends with the specified suffix, False otherwise.\n\
11552With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011553With optional end, stop comparing S at that position.\n\
11554suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject *
11557unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011560 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011562 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011563 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011564 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
Jesus Ceaac451502011-04-20 17:09:23 +020011566 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011568 if (PyTuple_Check(subobj)) {
11569 Py_ssize_t i;
11570 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11571 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011573 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011575 result = tailmatch(self, substring, start, end, +1);
11576 Py_DECREF(substring);
11577 if (result) {
11578 Py_RETURN_TRUE;
11579 }
11580 }
11581 Py_RETURN_FALSE;
11582 }
11583 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011584 if (substring == NULL) {
11585 if (PyErr_ExceptionMatches(PyExc_TypeError))
11586 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11587 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011589 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011590 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011592 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011596
11597PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011599\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011600Return a formatted version of S, using substitutions from args and kwargs.\n\
11601The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011602
Eric Smith27bbca62010-11-04 17:06:58 +000011603PyDoc_STRVAR(format_map__doc__,
11604 "S.format_map(mapping) -> str\n\
11605\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011606Return a formatted version of S, using substitutions from mapping.\n\
11607The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011608
Eric Smith4a7d76d2008-05-30 18:10:19 +000011609static PyObject *
11610unicode__format__(PyObject* self, PyObject* args)
11611{
11612 PyObject *format_spec;
11613
11614 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11615 return NULL;
11616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11618 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011619}
11620
Eric Smith8c663262007-08-25 02:26:07 +000011621PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011623\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011624Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011625
/* str.__sizeof__(): return, as a Python int, the number of bytes this
   function attributes to v.  The total is built from three parts:
   the object header plus character data (depending on the object's
   layout), the wstr buffer if it is separate, and the UTF-8 buffer if
   it is separate. */
static PyObject *
unicode__sizeof__(PyUnicodeObject *v)
{
    Py_ssize_t size;

    /* If it's a compact object, account for base structure +
       character data. */
    if (PyUnicode_IS_COMPACT_ASCII(v))
        /* One byte per character, plus one extra byte. */
        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
    else if (PyUnicode_IS_COMPACT(v))
        /* (length + 1) characters of PyUnicode_CHARACTER_SIZE(v) bytes. */
        size = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
    else {
        /* If it is a two-block object, account for base object, and
           for character block if present. */
        size = sizeof(PyUnicodeObject);
        if (v->data.any)
            size += (PyUnicode_GET_LENGTH(v) + 1) *
                PyUnicode_CHARACTER_SIZE(v);
    }
    /* If the wstr pointer is present, account for it unless it is shared
       with the data pointer. Since PyUnicode_DATA will crash if the object
       is not ready, check whether it's either not ready (in which case the
       data is entirely in wstr) or if the data is not shared. */
    if (_PyUnicode_WSTR(v) &&
        (!PyUnicode_IS_READY(v) ||
         (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
    /* Count the UTF-8 buffer only when it is a separate allocation.
       NOTE(review): PyUnicode_DATA is evaluated here whenever the utf8
       pointer is non-NULL, without a readiness check -- presumably a
       non-ready string never has a utf8 buffer; confirm. */
    if (_PyUnicode_UTF8(v) && _PyUnicode_UTF8(v) != PyUnicode_DATA(v))
        size += _PyUnicode_UTF8_LENGTH(v) + 1;

    return PyLong_FromSsize_t(size);
}
11659
/* Docstring for the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
    "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011662
11663static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011664unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011665{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011666 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (!copy)
11668 return NULL;
11669 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011670}
11671
/* Method table for the str type.

   Each entry gives the Python-visible name, the C function, the calling
   convention flags and (for most entries) the docstring.  The table is
   terminated by the {NULL, NULL} sentinel.  Entries inside "#if 0" are
   compiled out. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
11735
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011736static PyObject *
11737unicode_mod(PyObject *v, PyObject *w)
11738{
Brian Curtindfc80e32011-08-10 20:28:54 -050011739 if (!PyUnicode_Check(v))
11740 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000011742}
11743
/* Number protocol table for str.  Only nb_remainder is populated (with
   unicode_mod); the slots listed before it are 0 and the slots after it
   are not listed in this initializer. */
static PyNumberMethods unicode_as_number = {
    0, /*nb_add*/
    0, /*nb_subtract*/
    0, /*nb_multiply*/
    unicode_mod, /*nb_remainder*/
};
11750
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, /* sq_length */
    PyUnicode_Concat, /* sq_concat */
    (ssizeargfunc) unicode_repeat, /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    0, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    PyUnicode_Contains, /* sq_contains */
};
11761
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011762static PyObject*
11763unicode_subscript(PyUnicodeObject* self, PyObject* item)
11764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (PyUnicode_READY(self) == -1)
11766 return NULL;
11767
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011768 if (PyIndex_Check(item)) {
11769 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011770 if (i == -1 && PyErr_Occurred())
11771 return NULL;
11772 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 i += PyUnicode_GET_LENGTH(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011774 return unicode_getitem(self, i);
11775 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000011776 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011778 Py_UNICODE* result_buf;
11779 PyObject* result;
11780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011783 return NULL;
11784 }
11785
11786 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 return PyUnicode_New(0, 0);
11788 } else if (start == 0 && step == 1 &&
11789 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000011790 PyUnicode_CheckExact(self)) {
11791 Py_INCREF(self);
11792 return (PyObject *)self;
11793 } else if (step == 1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 return substring(self, start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011795 } else {
11796 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000011797 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
11798 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011799
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 if (result_buf == NULL)
11801 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011802
11803 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
11804 result_buf[i] = source_buf[cur];
11805 }
Tim Petersced69f82003-09-16 20:30:58 +000011806
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011807 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000011808 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000011809 return result;
11810 }
11811 } else {
11812 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
11813 return NULL;
11814 }
11815}
11816
/* Mapping protocol table for str: length and subscripting are provided;
   the item-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length, /* mp_length */
    (binaryfunc)unicode_subscript, /* mp_subscript */
    (objobjargproc)0, /* mp_ass_subscript */
};
11822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824/* Helpers for PyUnicode_Format() */
11825
11826static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000011827getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011829 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 (*p_argidx)++;
11832 if (arglen < 0)
11833 return args;
11834 else
11835 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 }
11837 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 return NULL;
11840}
11841
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011842/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011844static PyObject *
11845formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011847 char *p;
11848 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 double x;
Tim Petersced69f82003-09-16 20:30:58 +000011850
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 x = PyFloat_AsDouble(v);
11852 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011853 return NULL;
11854
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000011857
Eric Smith0923d1d2009-04-16 20:16:10 +000011858 p = PyOS_double_to_string(x, type, prec,
11859 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011860 if (p == NULL)
11861 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000011863 PyMem_Free(p);
11864 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865}
11866
Tim Peters38fd5b62000-09-21 05:43:11 +000011867static PyObject*
11868formatlong(PyObject *val, int flags, int prec, int type)
11869{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011870 char *buf;
11871 int len;
11872 PyObject *str; /* temporary string object. */
11873 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011874
Benjamin Peterson14339b62009-01-31 16:36:08 +000011875 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
11876 if (!str)
11877 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011879 Py_DECREF(str);
11880 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000011881}
11882
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011885 size_t buflen,
11886 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011888 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011889 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (PyUnicode_GET_LENGTH(v) == 1) {
11891 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 buf[1] = '\0';
11893 return 1;
11894 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 goto onError;
11896 }
11897 else {
11898 /* Integer input truncated to a character */
11899 long x;
11900 x = PyLong_AsLong(v);
11901 if (x == -1 && PyErr_Occurred())
11902 goto onError;
11903
11904 if (x < 0 || x > 0x10ffff) {
11905 PyErr_SetString(PyExc_OverflowError,
11906 "%c arg not in range(0x110000)");
11907 return -1;
11908 }
11909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011911 buf[1] = '\0';
11912 return 1;
11913 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000011914
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011916 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000011918 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919}
11920
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011921/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011922 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011923*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000011924#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000011925
Alexander Belopolsky40018472011-02-26 01:02:56 +000011926PyObject *
11927PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 void *fmt;
11930 int fmtkind;
11931 PyObject *result;
11932 Py_UCS4 *res, *res0;
11933 Py_UCS4 max;
11934 int kind;
11935 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000011939
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 PyErr_BadInternalCall();
11942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
11945 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 fmt = PyUnicode_DATA(uformat);
11948 fmtkind = PyUnicode_KIND(uformat);
11949 fmtcnt = PyUnicode_GET_LENGTH(uformat);
11950 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951
11952 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
11954 if (res0 == NULL) {
11955 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 arglen = PyTuple_Size(args);
11961 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 }
11963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 arglen = -1;
11965 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 }
Christian Heimes90aa7642007-12-19 02:45:37 +000011967 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000011968 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
11971 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 if (--rescnt < 0) {
11974 rescnt = fmtcnt + 100;
11975 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
11977 if (res0 == NULL){
11978 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000011979 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 }
11981 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 }
11986 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 /* Got a format specifier */
11988 int flags = 0;
11989 Py_ssize_t width = -1;
11990 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_UCS4 c = '\0';
11992 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 int isnumok;
11994 PyObject *v = NULL;
11995 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 void *pbuf;
11997 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 Py_ssize_t len, len1;
12000 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 fmtpos++;
12003 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12004 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 Py_ssize_t keylen;
12006 PyObject *key;
12007 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012008
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 if (dict == NULL) {
12010 PyErr_SetString(PyExc_TypeError,
12011 "format requires a mapping");
12012 goto onError;
12013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 /* Skip over balanced parentheses */
12018 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 if (fmtcnt < 0 || pcount > 0) {
12027 PyErr_SetString(PyExc_ValueError,
12028 "incomplete format key");
12029 goto onError;
12030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 key = substring(uformat, keystart, keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 if (key == NULL)
12033 goto onError;
12034 if (args_owned) {
12035 Py_DECREF(args);
12036 args_owned = 0;
12037 }
12038 args = PyObject_GetItem(dict, key);
12039 Py_DECREF(key);
12040 if (args == NULL) {
12041 goto onError;
12042 }
12043 args_owned = 1;
12044 arglen = -1;
12045 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012049 case '-': flags |= F_LJUST; continue;
12050 case '+': flags |= F_SIGN; continue;
12051 case ' ': flags |= F_BLANK; continue;
12052 case '#': flags |= F_ALT; continue;
12053 case '0': flags |= F_ZERO; continue;
12054 }
12055 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 if (c == '*') {
12058 v = getnextarg(args, arglen, &argidx);
12059 if (v == NULL)
12060 goto onError;
12061 if (!PyLong_Check(v)) {
12062 PyErr_SetString(PyExc_TypeError,
12063 "* wants int");
12064 goto onError;
12065 }
12066 width = PyLong_AsLong(v);
12067 if (width == -1 && PyErr_Occurred())
12068 goto onError;
12069 if (width < 0) {
12070 flags |= F_LJUST;
12071 width = -width;
12072 }
12073 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 }
12076 else if (c >= '0' && c <= '9') {
12077 width = c - '0';
12078 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 if (c < '0' || c > '9')
12081 break;
12082 if ((width*10) / 10 != width) {
12083 PyErr_SetString(PyExc_ValueError,
12084 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012085 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012086 }
12087 width = width*10 + (c - '0');
12088 }
12089 }
12090 if (c == '.') {
12091 prec = 0;
12092 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 if (c == '*') {
12095 v = getnextarg(args, arglen, &argidx);
12096 if (v == NULL)
12097 goto onError;
12098 if (!PyLong_Check(v)) {
12099 PyErr_SetString(PyExc_TypeError,
12100 "* wants int");
12101 goto onError;
12102 }
12103 prec = PyLong_AsLong(v);
12104 if (prec == -1 && PyErr_Occurred())
12105 goto onError;
12106 if (prec < 0)
12107 prec = 0;
12108 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 }
12111 else if (c >= '0' && c <= '9') {
12112 prec = c - '0';
12113 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 if (c < '0' || c > '9')
12116 break;
12117 if ((prec*10) / 10 != prec) {
12118 PyErr_SetString(PyExc_ValueError,
12119 "prec too big");
12120 goto onError;
12121 }
12122 prec = prec*10 + (c - '0');
12123 }
12124 }
12125 } /* prec */
12126 if (fmtcnt >= 0) {
12127 if (c == 'h' || c == 'l' || c == 'L') {
12128 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 }
12131 }
12132 if (fmtcnt < 0) {
12133 PyErr_SetString(PyExc_ValueError,
12134 "incomplete format");
12135 goto onError;
12136 }
12137 if (c != '%') {
12138 v = getnextarg(args, arglen, &argidx);
12139 if (v == NULL)
12140 goto onError;
12141 }
12142 sign = 0;
12143 fill = ' ';
12144 switch (c) {
12145
12146 case '%':
12147 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 len = 1;
12152 break;
12153
12154 case 's':
12155 case 'r':
12156 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012157 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 temp = v;
12159 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012160 }
12161 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 if (c == 's')
12163 temp = PyObject_Str(v);
12164 else if (c == 'r')
12165 temp = PyObject_Repr(v);
12166 else
12167 temp = PyObject_ASCII(v);
12168 if (temp == NULL)
12169 goto onError;
12170 if (PyUnicode_Check(temp))
12171 /* nothing to do */;
12172 else {
12173 Py_DECREF(temp);
12174 PyErr_SetString(PyExc_TypeError,
12175 "%s argument has non-string str()");
12176 goto onError;
12177 }
12178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(temp) == -1) {
12180 Py_CLEAR(temp);
12181 goto onError;
12182 }
12183 pbuf = PyUnicode_DATA(temp);
12184 kind = PyUnicode_KIND(temp);
12185 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 if (prec >= 0 && len > prec)
12187 len = prec;
12188 break;
12189
12190 case 'i':
12191 case 'd':
12192 case 'u':
12193 case 'o':
12194 case 'x':
12195 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 isnumok = 0;
12197 if (PyNumber_Check(v)) {
12198 PyObject *iobj=NULL;
12199
12200 if (PyLong_Check(v)) {
12201 iobj = v;
12202 Py_INCREF(iobj);
12203 }
12204 else {
12205 iobj = PyNumber_Long(v);
12206 }
12207 if (iobj!=NULL) {
12208 if (PyLong_Check(iobj)) {
12209 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012210 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012211 Py_DECREF(iobj);
12212 if (!temp)
12213 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (PyUnicode_READY(temp) == -1) {
12215 Py_CLEAR(temp);
12216 goto onError;
12217 }
12218 pbuf = PyUnicode_DATA(temp);
12219 kind = PyUnicode_KIND(temp);
12220 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 sign = 1;
12222 }
12223 else {
12224 Py_DECREF(iobj);
12225 }
12226 }
12227 }
12228 if (!isnumok) {
12229 PyErr_Format(PyExc_TypeError,
12230 "%%%c format: a number is required, "
12231 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12232 goto onError;
12233 }
12234 if (flags & F_ZERO)
12235 fill = '0';
12236 break;
12237
12238 case 'e':
12239 case 'E':
12240 case 'f':
12241 case 'F':
12242 case 'g':
12243 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012244 temp = formatfloat(v, flags, prec, c);
12245 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (PyUnicode_READY(temp) == -1) {
12248 Py_CLEAR(temp);
12249 goto onError;
12250 }
12251 pbuf = PyUnicode_DATA(temp);
12252 kind = PyUnicode_KIND(temp);
12253 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 sign = 1;
12255 if (flags & F_ZERO)
12256 fill = '0';
12257 break;
12258
12259 case 'c':
12260 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012262 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 if (len < 0)
12264 goto onError;
12265 break;
12266
12267 default:
12268 PyErr_Format(PyExc_ValueError,
12269 "unsupported format character '%c' (0x%x) "
12270 "at index %zd",
12271 (31<=c && c<=126) ? (char)c : '?',
12272 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 goto onError;
12275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 /* pbuf is initialized here. */
12277 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12280 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12281 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 len--;
12283 }
12284 else if (flags & F_SIGN)
12285 sign = '+';
12286 else if (flags & F_BLANK)
12287 sign = ' ';
12288 else
12289 sign = 0;
12290 }
12291 if (width < len)
12292 width = len;
12293 if (rescnt - (sign != 0) < width) {
12294 reslen -= rescnt;
12295 rescnt = width + fmtcnt + 100;
12296 reslen += rescnt;
12297 if (reslen < 0) {
12298 Py_XDECREF(temp);
12299 PyErr_NoMemory();
12300 goto onError;
12301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12303 if (res0 == 0) {
12304 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 Py_XDECREF(temp);
12306 goto onError;
12307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 }
12310 if (sign) {
12311 if (fill != ' ')
12312 *res++ = sign;
12313 rescnt--;
12314 if (width > len)
12315 width--;
12316 }
12317 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12319 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12322 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 }
12324 rescnt -= 2;
12325 width -= 2;
12326 if (width < 0)
12327 width = 0;
12328 len -= 2;
12329 }
12330 if (width > len && !(flags & F_LJUST)) {
12331 do {
12332 --rescnt;
12333 *res++ = fill;
12334 } while (--width > len);
12335 }
12336 if (fill == ' ') {
12337 if (sign)
12338 *res++ = sign;
12339 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12341 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12342 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12343 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012344 }
12345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 /* Copy all characters, preserving len */
12347 len1 = len;
12348 while (len1--) {
12349 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12350 rescnt--;
12351 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 while (--width >= len) {
12353 --rescnt;
12354 *res++ = ' ';
12355 }
12356 if (dict && (argidx < arglen) && c != '%') {
12357 PyErr_SetString(PyExc_TypeError,
12358 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012359 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 goto onError;
12361 }
12362 Py_XDECREF(temp);
12363 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 } /* until end */
12365 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012366 PyErr_SetString(PyExc_TypeError,
12367 "not all arguments converted during string formatting");
12368 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369 }
12370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12373 if (*res > max)
12374 max = *res;
12375 result = PyUnicode_New(reslen - rescnt, max);
12376 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 kind = PyUnicode_KIND(result);
12379 for (res = res0; res < res0+reslen-rescnt; res++)
12380 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12381 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 }
12385 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 return (PyObject *)result;
12387
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 Py_DECREF(uformat);
12391 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393 }
12394 return NULL;
12395}
12396
Jeremy Hylton938ace62002-07-17 16:30:39 +000012397static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012398unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12399
Tim Peters6d6c1a32001-08-02 04:15:00 +000012400static PyObject *
12401unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12402{
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 static char *kwlist[] = {"object", "encoding", "errors", 0};
12405 char *encoding = NULL;
12406 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012407
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 if (type != &PyUnicode_Type)
12409 return unicode_subtype_new(type, args, kwds);
12410 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 return NULL;
12413 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 if (encoding == NULL && errors == NULL)
12416 return PyObject_Str(x);
12417 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012419}
12420
Guido van Rossume023fe02001-08-30 03:12:59 +000012421static PyObject *
12422unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12423{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 PyUnicodeObject *tmp, *pnew;
12425 Py_ssize_t n;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 PyObject *err = NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012427
Benjamin Peterson14339b62009-01-31 16:36:08 +000012428 assert(PyType_IsSubtype(type, &PyUnicode_Type));
12429 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12430 if (tmp == NULL)
12431 return NULL;
12432 assert(PyUnicode_Check(tmp));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
12434 // it seems kind of strange that tp_alloc gets passed the size
12435 // of the unicode string because there will follow another
12436 // malloc.
12437 pnew = (PyUnicodeObject *) type->tp_alloc(type,
12438 n = PyUnicode_GET_SIZE(tmp));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439 if (pnew == NULL) {
12440 Py_DECREF(tmp);
12441 return NULL;
12442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
12444 if (_PyUnicode_WSTR(pnew) == NULL) {
12445 err = PyErr_NoMemory();
12446 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
12449 _PyUnicode_WSTR_LENGTH(pnew) = n;
12450 _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
12451 _PyUnicode_STATE(pnew).interned = 0;
12452 _PyUnicode_STATE(pnew).kind = 0;
12453 _PyUnicode_STATE(pnew).compact = 0;
12454 _PyUnicode_STATE(pnew).ready = 0;
12455 _PyUnicode_STATE(pnew).ascii = 0;
12456 pnew->data.any = NULL;
12457 _PyUnicode_LENGTH(pnew) = 0;
12458 pnew->_base.utf8 = NULL;
12459 pnew->_base.utf8_length = 0;
12460
12461 if (PyUnicode_READY(pnew) == -1) {
12462 PyObject_FREE(_PyUnicode_WSTR(pnew));
12463 goto onError;
12464 }
12465
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 Py_DECREF(tmp);
12467 return (PyObject *)pnew;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468
12469 onError:
12470 _Py_ForgetReference((PyObject *)pnew);
12471 PyObject_Del(pnew);
12472 Py_DECREF(tmp);
12473 return err;
Guido van Rossume023fe02001-08-30 03:12:59 +000012474}
12475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012478\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012479Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012480encoding defaults to the current default string encoding.\n\
12481errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012483static PyObject *unicode_iter(PyObject *seq);
12484
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012486 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012487 "str", /* tp_name */
12488 sizeof(PyUnicodeObject), /* tp_size */
12489 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 (destructor)unicode_dealloc, /* tp_dealloc */
12492 0, /* tp_print */
12493 0, /* tp_getattr */
12494 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012495 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 unicode_repr, /* tp_repr */
12497 &unicode_as_number, /* tp_as_number */
12498 &unicode_as_sequence, /* tp_as_sequence */
12499 &unicode_as_mapping, /* tp_as_mapping */
12500 (hashfunc) unicode_hash, /* tp_hash*/
12501 0, /* tp_call*/
12502 (reprfunc) unicode_str, /* tp_str */
12503 PyObject_GenericGetAttr, /* tp_getattro */
12504 0, /* tp_setattro */
12505 0, /* tp_as_buffer */
12506 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 unicode_doc, /* tp_doc */
12509 0, /* tp_traverse */
12510 0, /* tp_clear */
12511 PyUnicode_RichCompare, /* tp_richcompare */
12512 0, /* tp_weaklistoffset */
12513 unicode_iter, /* tp_iter */
12514 0, /* tp_iternext */
12515 unicode_methods, /* tp_methods */
12516 0, /* tp_members */
12517 0, /* tp_getset */
12518 &PyBaseObject_Type, /* tp_base */
12519 0, /* tp_dict */
12520 0, /* tp_descr_get */
12521 0, /* tp_descr_set */
12522 0, /* tp_dictoffset */
12523 0, /* tp_init */
12524 0, /* tp_alloc */
12525 unicode_new, /* tp_new */
12526 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527};
12528
12529/* Initialize the Unicode implementation */
12530
Thomas Wouters78890102000-07-22 19:25:51 +000012531void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012533 int i;
12534
Thomas Wouters477c8d52006-05-27 19:21:47 +000012535 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012537 0x000A, /* LINE FEED */
12538 0x000D, /* CARRIAGE RETURN */
12539 0x001C, /* FILE SEPARATOR */
12540 0x001D, /* GROUP SEPARATOR */
12541 0x001E, /* RECORD SEPARATOR */
12542 0x0085, /* NEXT LINE */
12543 0x2028, /* LINE SEPARATOR */
12544 0x2029, /* PARAGRAPH SEPARATOR */
12545 };
12546
Fred Drakee4315f52000-05-09 19:53:39 +000012547 /* Init the implementation */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 unicode_empty = (PyUnicodeObject *) PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012549 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012551
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012552 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012554 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012556
12557 /* initialize the linebreak bloom filter */
12558 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012560 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012561
12562 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
12565/* Finalize the Unicode implementation */
12566
/* Nothing is released here: the function unconditionally reports 0.
   NOTE(review): the return value is presumably the number of objects freed,
   as for other *_ClearFreeList functions -- confirm against the callers. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12572
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573void
Thomas Wouters78890102000-07-22 19:25:51 +000012574_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012576 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012578 Py_XDECREF(unicode_empty);
12579 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012580
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012581 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 if (unicode_latin1[i]) {
12583 Py_DECREF(unicode_latin1[i]);
12584 unicode_latin1[i] = NULL;
12585 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012586 }
Christian Heimesa156e092008-02-16 07:38:31 +000012587 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012589
Walter Dörwald16807132007-05-25 13:52:07 +000012590void
12591PyUnicode_InternInPlace(PyObject **p)
12592{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12594 PyObject *t;
12595 if (s == NULL || !PyUnicode_Check(s))
12596 Py_FatalError(
12597 "PyUnicode_InternInPlace: unicode strings only please!");
12598 /* If it's a subclass, we don't really know what putting
12599 it in the interned dict might do. */
12600 if (!PyUnicode_CheckExact(s))
12601 return;
12602 if (PyUnicode_CHECK_INTERNED(s))
12603 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 if (PyUnicode_READY(s) == -1) {
12605 assert(0 && "ready fail in intern...");
12606 return;
12607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012608 if (interned == NULL) {
12609 interned = PyDict_New();
12610 if (interned == NULL) {
12611 PyErr_Clear(); /* Don't leave an exception */
12612 return;
12613 }
12614 }
12615 /* It might be that the GetItem call fails even
12616 though the key is present in the dictionary,
12617 namely when this happens during a stack overflow. */
12618 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012620 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012621
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 if (t) {
12623 Py_INCREF(t);
12624 Py_DECREF(*p);
12625 *p = t;
12626 return;
12627 }
Walter Dörwald16807132007-05-25 13:52:07 +000012628
Benjamin Peterson14339b62009-01-31 16:36:08 +000012629 PyThreadState_GET()->recursion_critical = 1;
12630 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12631 PyErr_Clear();
12632 PyThreadState_GET()->recursion_critical = 0;
12633 return;
12634 }
12635 PyThreadState_GET()->recursion_critical = 0;
12636 /* The two references in interned are not counted by refcnt.
12637 The deallocator will take care of this */
12638 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000012640}
12641
12642void
12643PyUnicode_InternImmortal(PyObject **p)
12644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 PyUnicodeObject *u = (PyUnicodeObject *)*p;
12646
Benjamin Peterson14339b62009-01-31 16:36:08 +000012647 PyUnicode_InternInPlace(p);
12648 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012650 Py_INCREF(*p);
12651 }
Walter Dörwald16807132007-05-25 13:52:07 +000012652}
12653
12654PyObject *
12655PyUnicode_InternFromString(const char *cp)
12656{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012657 PyObject *s = PyUnicode_FromString(cp);
12658 if (s == NULL)
12659 return NULL;
12660 PyUnicode_InternInPlace(&s);
12661 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000012662}
12663
Alexander Belopolsky40018472011-02-26 01:02:56 +000012664void
12665_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000012666{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012667 PyObject *keys;
12668 PyUnicodeObject *s;
12669 Py_ssize_t i, n;
12670 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000012671
Benjamin Peterson14339b62009-01-31 16:36:08 +000012672 if (interned == NULL || !PyDict_Check(interned))
12673 return;
12674 keys = PyDict_Keys(interned);
12675 if (keys == NULL || !PyList_Check(keys)) {
12676 PyErr_Clear();
12677 return;
12678 }
Walter Dörwald16807132007-05-25 13:52:07 +000012679
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
12681 detector, interned unicode strings are not forcibly deallocated;
12682 rather, we give them their stolen references back, and then clear
12683 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000012684
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 n = PyList_GET_SIZE(keys);
12686 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 for (i = 0; i < n; i++) {
12689 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 if (PyUnicode_READY(s) == -1)
12691 fprintf(stderr, "could not ready string\n");
12692 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012693 case SSTATE_NOT_INTERNED:
12694 /* XXX Shouldn't happen */
12695 break;
12696 case SSTATE_INTERNED_IMMORTAL:
12697 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 break;
12700 case SSTATE_INTERNED_MORTAL:
12701 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 break;
12704 default:
12705 Py_FatalError("Inconsistent interned string state.");
12706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 }
12709 fprintf(stderr, "total size of all interned strings: "
12710 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
12711 "mortal/immortal\n", mortal_size, immortal_size);
12712 Py_DECREF(keys);
12713 PyDict_Clear(interned);
12714 Py_DECREF(interned);
12715 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000012716}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012717
12718
12719/********************* Unicode Iterator **************************/
12720
12721typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012722 PyObject_HEAD
12723 Py_ssize_t it_index;
12724 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012725} unicodeiterobject;
12726
12727static void
12728unicodeiter_dealloc(unicodeiterobject *it)
12729{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730 _PyObject_GC_UNTRACK(it);
12731 Py_XDECREF(it->it_seq);
12732 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012733}
12734
12735static int
12736unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
12737{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012738 Py_VISIT(it->it_seq);
12739 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012740}
12741
12742static PyObject *
12743unicodeiter_next(unicodeiterobject *it)
12744{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 PyUnicodeObject *seq;
12746 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012747
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 assert(it != NULL);
12749 seq = it->it_seq;
12750 if (seq == NULL)
12751 return NULL;
12752 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
12755 int kind = PyUnicode_KIND(seq);
12756 void *data = PyUnicode_DATA(seq);
12757 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
12758 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 if (item != NULL)
12760 ++it->it_index;
12761 return item;
12762 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012763
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 Py_DECREF(seq);
12765 it->it_seq = NULL;
12766 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012767}
12768
12769static PyObject *
12770unicodeiter_len(unicodeiterobject *it)
12771{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 Py_ssize_t len = 0;
12773 if (it->it_seq)
12774 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
12775 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012776}
12777
12778PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
12779
12780static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012784};
12785
12786PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 PyVarObject_HEAD_INIT(&PyType_Type, 0)
12788 "str_iterator", /* tp_name */
12789 sizeof(unicodeiterobject), /* tp_basicsize */
12790 0, /* tp_itemsize */
12791 /* methods */
12792 (destructor)unicodeiter_dealloc, /* tp_dealloc */
12793 0, /* tp_print */
12794 0, /* tp_getattr */
12795 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012796 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012797 0, /* tp_repr */
12798 0, /* tp_as_number */
12799 0, /* tp_as_sequence */
12800 0, /* tp_as_mapping */
12801 0, /* tp_hash */
12802 0, /* tp_call */
12803 0, /* tp_str */
12804 PyObject_GenericGetAttr, /* tp_getattro */
12805 0, /* tp_setattro */
12806 0, /* tp_as_buffer */
12807 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
12808 0, /* tp_doc */
12809 (traverseproc)unicodeiter_traverse, /* tp_traverse */
12810 0, /* tp_clear */
12811 0, /* tp_richcompare */
12812 0, /* tp_weaklistoffset */
12813 PyObject_SelfIter, /* tp_iter */
12814 (iternextfunc)unicodeiter_next, /* tp_iternext */
12815 unicodeiter_methods, /* tp_methods */
12816 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012817};
12818
12819static PyObject *
12820unicode_iter(PyObject *seq)
12821{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012823
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 if (!PyUnicode_Check(seq)) {
12825 PyErr_BadInternalCall();
12826 return NULL;
12827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 if (PyUnicode_READY(seq) == -1)
12829 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
12831 if (it == NULL)
12832 return NULL;
12833 it->it_index = 0;
12834 Py_INCREF(seq);
12835 it->it_seq = (PyUnicodeObject *)seq;
12836 _PyObject_GC_TRACK(it);
12837 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012838}
12839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840#define UNIOP(x) Py_UNICODE_##x
12841#define UNIOP_t Py_UNICODE
12842#include "uniops.h"
12843#undef UNIOP
12844#undef UNIOP_t
12845#define UNIOP(x) Py_UCS4_##x
12846#define UNIOP_t Py_UCS4
12847#include "uniops.h"
12848#undef UNIOP
12849#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000012850
Victor Stinner71133ff2010-09-01 23:43:53 +000012851Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000012852PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000012853{
12854 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
12855 Py_UNICODE *copy;
12856 Py_ssize_t size;
12857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858 if (!PyUnicode_Check(unicode)) {
12859 PyErr_BadArgument();
12860 return NULL;
12861 }
Victor Stinner71133ff2010-09-01 23:43:53 +000012862 /* Ensure we won't overflow the size. */
12863 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
12864 PyErr_NoMemory();
12865 return NULL;
12866 }
12867 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
12868 size *= sizeof(Py_UNICODE);
12869 copy = PyMem_Malloc(size);
12870 if (copy == NULL) {
12871 PyErr_NoMemory();
12872 return NULL;
12873 }
12874 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
12875 return copy;
12876}
Martin v. Löwis5b222132007-06-10 09:51:05 +000012877
Georg Brandl66c221e2010-10-14 07:04:07 +000012878/* A _string module, to export formatter_parser and formatter_field_name_split
12879 to the string.Formatter class implemented in Python. */
12880
12881static PyMethodDef _string_methods[] = {
12882 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
12883 METH_O, PyDoc_STR("split the argument as a field name")},
12884 {"formatter_parser", (PyCFunction) formatter_parser,
12885 METH_O, PyDoc_STR("parse the argument as a format string")},
12886 {NULL, NULL}
12887};
12888
12889static struct PyModuleDef _string_module = {
12890 PyModuleDef_HEAD_INIT,
12891 "_string",
12892 PyDoc_STR("string helper module"),
12893 0,
12894 _string_methods,
12895 NULL,
12896 NULL,
12897 NULL,
12898 NULL
12899};
12900
12901PyMODINIT_FUNC
12902PyInit__string(void)
12903{
12904 return PyModule_Create(&_string_module);
12905}
12906
12907
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012908#ifdef __cplusplus
12909}
12910#endif